Hier meine Funktion:

#include "avr/io.h"

;------------------------------------------------------------------------------
; us32Mul16_16_32 - unsigned 16 bit x 16 bit -> 32 bit multiplication
;
;   CH:CMH:CML:CL = AH:AL * BH:BL
;
; In     : r25:r24          factor A (high:low)
;          r23:r22          factor B (high:low)
; Out    : r25:r24:r23:r22  product C (most significant byte in r25)
; Writes : r0 r1 r18 r19 r20 r21 r22 r23 r24 r25 r30 and the status flags
;          r1 is cleared again before returning
; Cycles : 24 (including a 4 cycle ret)
;
; NOTE(review): the register layout looks like the avr-gcc convention for
;   uint32_t us32Mul16_16_32(uint16_t a, uint16_t b);
; confirm against the declaration that the C callers actually use.
;
; Method: schoolbook multiplication with four 8x8 hardware multiplies.
;   A * B = (AH*BH << 16) + (AH*BL << 8) + (BH*AL << 8) + (AL*BL)
; The product registers are the same physical registers as the two factors,
; so the sum is built up in r18..r21 and copied to r22..r25 at the very end.
;------------------------------------------------------------------------------

; Factor A, unsigned 16 bit
#define AL r24
#define AH r25

; Factor B, unsigned 16 bit
#define BL r22
#define BH r23

; Product C, unsigned 32 bit (overlays both factors)
#define CL r22
#define CML r23
#define CMH r24
#define CH r25

; Register that holds constant 0, used to add a lone carry
#define ZERO r30

; Bytes 0 and 1 of the running sum, initialised with AL * BL
#define ALBLL r18
#define ALBLH r19

; Bytes 2 and 3 of the running sum, initialised with AH * BH
#define AHBHL r20
#define AHBHH r21

.global us32Mul16_16_32
.func us32Mul16_16_32
us32Mul16_16_32:
        ; --- the two partial products that do not overlap -------------------
        mul     AL, BL              ; r1:r0 = AL * BL                       2
        movw    ALBLL, r0           ; sum bytes 1:0 = AL * BL               1
        mul     AH, BH              ; r1:r0 = AH * BH                       2
        movw    AHBHL, r0           ; sum bytes 3:2 = AH * BH               1

        clr     ZERO                ; constant 0 for the carry-only adds    1

        ; --- first cross product, added in at byte 1 ------------------------
        mul     AH, BL              ; r1:r0 = AH * BL                       2
        add     ALBLH, r0           ; byte 1 += low byte                    1
        adc     AHBHL, r1           ; byte 2 += high byte + carry           1
        adc     AHBHH, ZERO         ; byte 3 += carry                       1

        ; --- second cross product, added in at byte 1 -----------------------
        mul     BH, AL              ; r1:r0 = BH * AL                       2
        add     ALBLH, r0           ; byte 1 += low byte                    1
        adc     AHBHL, r1           ; byte 2 += high byte + carry           1
        adc     AHBHH, ZERO         ; byte 3 += carry                       1

        ; --- hand back the result -------------------------------------------
        clr     r1                  ; mul left data in r1, restore r1 = 0   1
        movw    CL, ALBLL           ; CML:CL = sum bytes 1:0                1
        movw    CMH, AHBHL          ; CH:CMH = sum bytes 3:2                1
        ret                         ;                                       4
.endfunc
;------------------------------------------------------------------------------

__umulhisi3 und __mulhisi3 aus der libgcc fand ich von der Laufzeit
nicht so berauschend.