;***************************************************************************
; unit:    raster      release 0.37                                        *
; purpose: general manipulation of n-dimensional matrices, n = 1, 2 and 3. *
;          Use this file or rasterc.c. You cannot link both files together *
; license:     GPL or LGPL                                                 *
; Copyright: (c) 1998-2025 Jaroslav Fojtik                                 *
;***************************************************************************

.586              ;Target processor.  Use instructions for Pentium class machines
.MMX
.MODEL FLAT, C    ;Use the flat memory model. Use C calling conventions

.CODE             ;Indicates the start of a code segment.


; https://www.plantation-productions.com/Webster/www.artofasm.com/Linux/HTML/TheMMXInstructionSeta2.html
; https://docs.oracle.com/cd/E19253-01/817-5477/eojdc/index.html


;void Conv4_8_MMX(BYTE *Dest, const BYTE *Src, unsigned Size1D)
; Expands packed 4-bit samples to 8-bit samples.  Every nibble n becomes the
; byte nn (hex); the high nibble of each source byte is emitted first.
;   Dest  - output buffer, one byte is written per sample
;   Src   - packed input, two samples per byte
;   count - number of samples (nibbles) to convert
; Uses eax, ecx, edx and mm0-mm4; emms is issued after the MMX part.
        public  Conv4_8_MMX
Conv4_8_MMX proc \
        uses edi esi, \
        Dest:ptr byte, \
        Src:ptr byte, \
        count:DWORD

        mov     esi,[Src]               ; esi = read cursor
        mov     edi,[Dest]              ; edi = write cursor
        mov     ecx,[count]             ; ecx = samples still to convert

        sub     ecx,8
        jl      Leftover                ; not even one group of 8 samples

        mov     eax,0F0F0F0Fh
        movd    mm3,eax                 ; mm3 = mask of the low nibbles
        mov     eax,0F0F0F0F0h
        movd    mm4,eax                 ; mm4 = mask of the high nibbles

Block8:
        movd    mm0,dword ptr[esi]      ; 4 source bytes = 8 samples
        add     esi,4
        movq    mm1,mm0

        pand    mm0,mm3                 ; keep the low nibbles  (0L per byte)
        movq    mm2,mm0
        psllw   mm2,4                   ; word shift is safe: the upper nibble of each byte is clear
        por     mm0,mm2                 ; LL per byte

        pand    mm1,mm4                 ; keep the high nibbles (H0 per byte)
        movq    mm2,mm1
        psrlw   mm2,4                   ; word shift is safe: the lower nibble of each byte is clear
        por     mm1,mm2                 ; HH per byte

        punpcklbw mm1,mm0               ; interleave: HH,LL,HH,LL,...
        movq    qword ptr[edi],mm1      ; 8 output bytes
        add     edi,8
        sub     ecx,8
        jae     Block8                  ; another full group is available
        emms

Leftover:
        add     ecx,8                   ; ecx = 0..7 samples not yet converted
        jz      Finished
        cld

NextByte:
        lodsb                           ; al = HL
        mov     ah,al
        mov     dx,ax                   ; dx = HLHL
        rol     ax,4                    ; ax = LHLH
        and     dx,00FF0h               ; dx = 0LH0
        and     ax,0F00Fh               ; ax = L00H
        or      ax,dx                   ; al = HH (first sample), ah = LL (second sample)
        sub     ecx,2
        jb      StoreLast               ; only one sample was left
        stosw                           ; stosw leaves the flags of the sub intact
        jnz     NextByte

Finished:
        ret                             ; _cdecl return

StoreLast:
        stosb                           ; write the single trailing sample
        ret

Conv4_8_MMX endp


;*************************************************************************************


;void Conv4_16_MMX(WORD *Dest, const BYTE *Src, unsigned Size1D)
; Expands packed 4-bit samples to 16-bit samples.  Every nibble n becomes the
; word nnnn (hex); the high nibble of each source byte is emitted first.
;   Dest  - output buffer, one word is written per sample
;   Src   - packed input, two samples per byte
;   count - number of samples (nibbles) to convert
; Uses eax, ecx, edx and mm0-mm4; emms is issued after the MMX part.
        public  Conv4_16_MMX
Conv4_16_MMX proc \
        uses edi esi, \
        Dest:ptr word, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]             ; ecx = samples still to convert
        mov     edi,[Dest]              ; edi = destination pointer
        mov     esi,[Src]               ; esi = source pointer

        sub     ecx,8
        jb      PIXEL1                  ; count is unsigned: fewer than 8 samples

        mov     eax,0F0F0F0F0h
        movd    mm4,eax                 ; mm4 = high-nibble mask
        mov     eax,0F0F0F0Fh
        movd    mm3,eax                 ; mm3 = low-nibble mask
PIXEL8:
        movd    mm0,dword ptr[esi]      ; 4 source bytes = 8 samples
        movq    mm1,mm0
        add     esi,4
        pand    mm0,mm3                 ; low nibbles
        movq    mm2,mm0
        psllw   mm2,4                   ; there is no byte shift; a word shift works on masked data
        por     mm0,mm2                 ; low nibble doubled in every byte

        pand    mm1,mm4                 ; high nibbles
        movq    mm2,mm1
        psrlw   mm2,4
        por     mm1,mm2                 ; high nibble doubled in every byte

        punpcklbw mm1,mm0               ; 8 samples, one byte each, in source order
        movq    mm0,mm1
        punpcklbw mm0,mm0               ; samples 1..4 widened to words
        movq    qword ptr[edi],mm0
        punpckhbw mm1,mm1               ; samples 5..8 widened to words
        movq    qword ptr[edi+8],mm1
        add     edi,16
        sub     ecx,8
        jae     PIXEL8
        emms

PIXEL1:
        add     ecx,8                   ; ecx = 0..7 samples not yet converted
        jz      ToEnd                   ; nothing left (label spelled exactly as defined)
        cld
PIXEL:
        lodsb                           ; al = XY (X = high nibble, Y = low nibble)
        mov     ah,al
        mov     dx,ax
        sal     eax,16
        mov     ax,dx
        mov     edx,eax                 ; XYXYXYXY

        rol     eax,4                   ; YXYXYXYX
        and     edx,00F0FF0F0h          ; 0Y0YX0X0
        and     eax,0F0F00F0Fh          ; Y0Y00X0X
        or      eax,edx                 ; YYYYXXXX: low word = first sample
        sub     ecx,2
        jb      ToEndStor1              ; only one sample was left
        stosd                           ; stosd leaves the flags of the sub intact
        jnz     PIXEL

ToEnd:
        ret                             ; _cdecl return

ToEndStor1:
        stosw                           ; write the single trailing sample
        ret                             ; _cdecl return

Conv4_16_MMX endp


;*************************************************************************************


;void Conv4_32_MMX(DWORD *Dest, const BYTE *Src, unsigned Size1D)
; Expands packed 4-bit samples to 32-bit samples.  Every nibble n becomes the
; dword nnnnnnnn (hex); the high nibble of each source byte is emitted first.
;   Dest  - output buffer, one dword is written per sample
;   Src   - packed input, two samples per byte
;   count - number of samples (nibbles) to convert
; Uses eax, ecx, edx and mm0-mm4; emms is issued after the MMX part.
        public  Conv4_32_MMX
Conv4_32_MMX proc \
        uses edi esi, \
        Dest:ptr dword, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]             ; ecx = samples still to convert
        mov     edi,[Dest]              ; edi = destination pointer
        mov     esi,[Src]               ; esi = source pointer

        sub     ecx,8
        jb      PIXEL1                  ; count is unsigned: fewer than 8 samples

        mov     eax,0F0F0F0F0h
        movd    mm4,eax                 ; mm4 = high-nibble mask
        mov     eax,0F0F0F0Fh
        movd    mm3,eax                 ; mm3 = low-nibble mask
PIXEL8:
        movd    mm0,dword ptr[esi]      ; 4 source bytes = 8 samples
        movq    mm1,mm0
        add     esi,4
        pand    mm0,mm3                 ; low nibbles
        movq    mm2,mm0
        psllw   mm2,4                   ; there is no byte shift; a word shift works on masked data
        por     mm0,mm2                 ; low nibble doubled in every byte

        pand    mm1,mm4                 ; high nibbles
        movq    mm2,mm1
        psrlw   mm2,4
        por     mm1,mm2                 ; high nibble doubled in every byte

        punpcklbw mm1,mm0               ; 8 samples, one byte each, in source order
        movq    mm0,mm1
        punpcklbw mm0,mm0               ; samples 1..4 as words
        movq    mm2,mm0
        punpcklwd mm2,mm2
        movq    qword ptr[edi],mm2      ; samples 1,2 as dwords
        punpckhwd mm0,mm0
        movq    qword ptr[edi+8],mm0    ; samples 3,4
        punpckhbw mm1,mm1               ; samples 5..8 as words
        movq    mm2,mm1
        punpcklwd mm2,mm2
        movq    qword ptr[edi+16],mm2   ; samples 5,6
        punpckhwd mm1,mm1
        movq    qword ptr[edi+24],mm1   ; samples 7,8
        add     edi,32
        sub     ecx,8
        jae     PIXEL8
        emms

PIXEL1:
        add     ecx,8                   ; ecx = 0..7 samples not yet converted
        jz      ToEnd                   ; nothing left (label spelled exactly as defined)
        cld
PIXEL:
        lodsb                           ; al = XY (X = high nibble, Y = low nibble)
        mov     ah,al
        mov     dx,ax
        sal     eax,16
        mov     ax,dx
        mov     edx,eax                 ; XYXYXYXY

        rol     eax,4                   ; YXYXYXYX
        and     edx,00F0FF0F0h          ; 0Y0YX0X0
        and     eax,0F0F00F0Fh          ; Y0Y00X0X
        or      eax,edx                 ; YYYYXXXX
        mov     edx,eax
        rol     eax,16                  ; XXXXYYYY
        xchg    ax,dx                   ; eax = XXXXXXXX, edx = YYYYYYYY
        stosd                           ; first sample of the byte

        mov     eax,edx                 ; second sample; mov does not touch the flags
        sub     ecx,2
        jb      ToEnd                   ; the byte held only one valid sample
        stosd                           ; preserves ZF from the sub
        jnz     PIXEL

ToEnd:
        ret                             ; _cdecl return

Conv4_32_MMX endp


;*************************************************************************************


;void Conv4_64_MMX(uint64_t *Dest, const uint8_t *Src, unsigned Size1D)
; Expands packed 4-bit samples to 64-bit samples.  Every nibble n is repeated
; in all 16 nibbles of the output qword; the high nibble of each source byte
; is emitted first.
;   Dest  - output buffer, one qword is written per sample
;   Src   - packed input, two samples per byte
;   count - number of samples (nibbles) to convert
; Uses eax, ecx, edx and mm0-mm5; emms is issued after the MMX part.
        public  Conv4_64_MMX
Conv4_64_MMX proc \
        uses edi esi, \
        Dest:ptr dword, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]             ; ecx = samples still to convert
        mov     edi,[Dest]              ; edi = destination pointer
        mov     esi,[Src]               ; esi = source pointer

        sub     ecx,8
        jb      PIXEL1                  ; count is unsigned: fewer than 8 samples

        mov     eax,0F0F0F0F0h
        movd    mm4,eax                 ; mm4 = high-nibble mask
        mov     eax,0F0F0F0Fh
        movd    mm3,eax                 ; mm3 = low-nibble mask
PIXEL8:
        movd    mm0,dword ptr[esi]      ; 4 source bytes = 8 samples
        movq    mm1,mm0
        add     esi,4
        pand    mm0,mm3                 ; low nibbles
        movq    mm2,mm0
        psllw   mm2,4                   ; there is no byte shift; a word shift works on masked data
        por     mm0,mm2                 ; low nibble doubled in every byte

        pand    mm1,mm4                 ; high nibbles
        movq    mm2,mm1
        psrlw   mm2,4
        por     mm1,mm2                 ; high nibble doubled in every byte

        punpcklbw mm1,mm0               ; 8 samples, one byte each, in source order
        movq    mm0,mm1
        punpcklbw mm0,mm0               ; samples 1..4 as words
        movq    mm2,mm0
        punpcklwd mm2,mm2               ; samples 1,2 as dwords
        movq    mm5,mm2
        punpckldq mm5,mm5
        movq    qword ptr[edi],mm5      ; sample 1
        punpckhdq mm2,mm2
        movq    qword ptr[edi+8],mm2    ; sample 2
        punpckhwd mm0,mm0               ; samples 3,4 as dwords
        movq    mm5,mm0
        punpckldq mm5,mm5
        movq    qword ptr[edi+16],mm5   ; sample 3
        punpckhdq mm0,mm0
        movq    qword ptr[edi+24],mm0   ; sample 4
        punpckhbw mm1,mm1               ; samples 5..8 as words
        movq    mm2,mm1
        punpcklwd mm2,mm2               ; samples 5,6 as dwords
        movq    mm5,mm2
        punpckldq mm5,mm5
        movq    qword ptr[edi+32],mm5   ; sample 5
        punpckhdq mm2,mm2
        movq    qword ptr[edi+40],mm2   ; sample 6
        punpckhwd mm1,mm1               ; samples 7,8 as dwords
        movq    mm5,mm1
        punpckldq mm5,mm5
        movq    qword ptr[edi+48],mm5   ; sample 7
        punpckhdq mm1,mm1
        movq    qword ptr[edi+56],mm1   ; sample 8
        add     edi,64
        sub     ecx,8
        jae     PIXEL8
        emms

PIXEL1:
        add     ecx,8                   ; ecx = 0..7 samples not yet converted
        jz      ToEnd                   ; nothing left (label spelled exactly as defined)
        cld

PIXEL:
        lodsb                           ; al = XY (X = high nibble, Y = low nibble)
        mov     ah,al
        mov     dx,ax
        sal     eax,16
        mov     ax,dx
        mov     edx,eax                 ; XYXYXYXY

        rol     eax,4                   ; YXYXYXYX
        and     edx,00F0FF0F0h          ; 0Y0YX0X0
        and     eax,0F0F00F0Fh          ; Y0Y00X0X
        or      eax,edx                 ; YYYYXXXX
        mov     edx,eax
        rol     eax,16                  ; XXXXYYYY
        xchg    ax,dx                   ; eax = XXXXXXXX, edx = YYYYYYYY
        stosd                           ; first sample: low half
        stosd                           ;               high half

        mov     eax,edx                 ; second sample; mov does not touch the flags
        sub     ecx,2
        jb      ToEnd                   ; the byte held only one valid sample
        stosd                           ; preserves ZF from the sub
        stosd
        jnz     PIXEL

ToEnd:
        ret                             ; _cdecl return

Conv4_64_MMX endp


;*************************************************************************************

; Conv8_4_MMX(Dest, Src, count)
; Reduces 8-bit samples to packed 4-bit samples by keeping the upper nibble of
; every byte.  Two samples share one output byte, the first one in the high
; nibble.  When count is odd the final byte has its low nibble cleared.
;   Dest  - packed output
;   Src   - input, one byte per sample
;   count - number of samples to convert
; Uses eax, ecx and mm0, mm1, mm3; emms is issued after the MMX part.
        public  Conv8_4_MMX
Conv8_4_MMX proc \
        uses edi esi, \
        Dest:ptr byte, \
        Src:ptr byte, \
        count:DWORD

        mov     esi,[Src]               ; esi = read cursor
        mov     edi,[Dest]              ; edi = write cursor
        mov     ecx,[count]             ; ecx = samples still to convert

        sub     ecx,8
        jl      Leftover                ; not even one group of 8 samples

        mov     eax,00F000F0h
        movd    mm3,eax
        punpckldq mm3,mm3               ; mm3 = 00F0h in each of the four words

Block8:
        movq    mm0,qword ptr [esi]     ; 8 samples; each word holds an (even, odd) pair
        add     esi,8
        movq    mm1,mm0
        psrlw   mm1,12                  ; upper nibble of the odd sample  -> bits 0..3
        pand    mm0,mm3                 ; upper nibble of the even sample -> bits 4..7
        por     mm0,mm1                 ; each word = 00h : packed pair
        packuswb mm0,mm0                ; squeeze the four words to four bytes
        movd    dword ptr [edi],mm0
        add     edi,4
        sub     ecx,8
        jae     Block8
        emms

Leftover:
        add     ecx,8                   ; ecx = 0..7 samples not yet converted
        jz      Finished
        cld

NextPair:
        lodsb                           ; first sample of the pair
        and     al,0F0h
        dec     ecx
        jz      StoreHalf               ; it was the last sample
        mov     ah,al
        lodsb                           ; second sample of the pair
        shr     al,4                    ; its upper nibble moved to bits 0..3
        or      al,ah
        stosb                           ; both nibbles
        dec     ecx
        jnz     NextPair

Finished:
        ret                             ; _cdecl return

StoreHalf:
        stosb                           ; incomplete pair: low nibble stays zero
        ret

Conv8_4_MMX endp


;*************************************************************************************

; Conv8_16_MMX(Dest, Src, count)
; Widens 8-bit samples to 16 bits by copying the byte into both halves of the
; output word (ABh -> ABABh).
;   Dest  - output, one word per sample
;   Src   - input, one byte per sample
;   count - number of samples to convert
; Uses eax, ecx and mm0; emms is issued after the MMX part.
        public  Conv8_16_MMX
Conv8_16_MMX proc \
        uses edi esi, \
        Dest:ptr qword, \
        Src:ptr byte, \
        count:DWORD

        mov     esi,[Src]               ; esi = read cursor
        mov     edi,[Dest]              ; edi = write cursor
        mov     ecx,[count]             ; ecx = samples still to convert

        sub     ecx,4
        jl      Leftover                ; not even one group of 4 samples

Block4:
        movd    mm0,dword ptr[esi]      ; samples 1..4
        punpcklbw mm0,mm0               ; every byte paired with itself
        movq    qword ptr [edi],mm0
        add     esi,4
        add     edi,8
        sub     ecx,4
        jae     Block4
        emms

Leftover:
        add     ecx,4                   ; ecx = 0..3 samples not yet converted
        jz      Finished
        cld

NextOne:
        lodsb
        mov     ah,al                   ; ax = sample in both bytes
        stosw
        dec     ecx
        jnz     NextOne

Finished:
        ret                             ; _cdecl return

Conv8_16_MMX endp


;*************************************************************************************

; Conv8_32_MMX(Dest, Src, count)
; Widens 8-bit samples to 32 bits by repeating the byte in all four bytes of
; the output dword (ABh -> ABABABABh).
;   Dest  - output, one dword per sample
;   Src   - input, one byte per sample
;   count - number of samples to convert
; Uses eax, ecx, edx and mm0, mm1; emms is issued after the MMX part.
        public  Conv8_32_MMX
Conv8_32_MMX proc \
        uses edi esi, \
        Dest:ptr qword, \
        Src:ptr byte, \
        count:DWORD

        mov     esi,[Src]               ; esi = read cursor
        mov     edi,[Dest]              ; edi = write cursor
        mov     ecx,[count]             ; ecx = samples still to convert

        sub     ecx,4
        jl      Leftover                ; not even one group of 4 samples

Block4:
        movd    mm0,dword ptr[esi]      ; samples 1..4
        add     esi,4
        punpcklbw mm0,mm0               ; four words, byte doubled
        movq    mm1,mm0
        punpcklwd mm0,mm0               ; samples 1,2 as dwords
        movq    qword ptr [edi],mm0
        punpckhdq mm1,mm1               ; bring samples 3,4 to the low half
        punpcklwd mm1,mm1               ; samples 3,4 as dwords
        movq    qword ptr [edi+8],mm1
        add     edi,16
        sub     ecx,4
        jae     Block4
        emms

Leftover:
        cld
        add     ecx,4                   ; ecx = 0..3 samples not yet converted
        jz      Finished

NextOne:
        lodsb
        mov     ah,al
        mov     dx,ax                   ; dx = doubled byte
        shl     eax,16                  ; doubled byte into the upper half
        mov     ax,dx                   ; ... and into the lower half
        stosd
        dec     ecx
        jnz     NextOne

Finished:
        ret                             ; _cdecl return

Conv8_32_MMX endp


;*************************************************************************************

;void Conv8_64_MMX(QWORD *Dest, const BYTE *Src, unsigned Size1D);
; Widens 8-bit samples to 64 bits by repeating the byte in all eight bytes of
; the output qword.
;   Dest  - output, one qword per sample
;   Src   - input, one byte per sample
;   count - number of samples to convert
; Uses eax, ecx, edx and mm0, mm1, mm3; emms is issued after the MMX part.
        public  Conv8_64_MMX
Conv8_64_MMX proc \
        uses edi esi, \
        Dest:ptr qword, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]             ; ecx = samples still to convert
        mov     edi,[Dest]              ; edi = destination pointer
        mov     esi,[Src]               ; esi = source pointer

        ; A zero count needs no separate early-out: the tail check below covers it.
        sub     ecx,4
        jb      PIXEL1                  ; count is unsigned: fewer than 4 samples

PIXEL4:
        movd    mm0,dword ptr[esi]      ; samples 1..4
        add     esi,4
        punpcklbw mm0,mm0               ; four words, byte doubled
        movq    mm1,mm0
        punpcklwd mm0,mm0               ; samples 1,2 as dwords
        movq    mm3,mm0
        punpckldq mm0,mm3
        movq    qword ptr [edi],mm0     ; sample 1
        movq    mm0,mm3
        punpckhdq mm0,mm3
        movq    qword ptr [edi+8],mm0   ; sample 2

        punpckhdq mm1,mm1               ; bring samples 3,4 to the low half
        punpcklwd mm1,mm1               ; samples 3,4 as dwords
        movq    mm3,mm1
        punpckldq mm1,mm3
        movq    qword ptr [edi+16],mm1  ; sample 3
        movq    mm0,mm3
        punpckhdq mm0,mm3
        movq    qword ptr [edi+24],mm0  ; sample 4

        add     edi,32
        sub     ecx,4
        jae     PIXEL4
        emms

PIXEL1:
        add     ecx,4                   ; ecx = 0..3 samples not yet converted
        jz      ToEnd                   ; nothing left (label spelled exactly as defined)
        cld
PIXEL:
        lodsb                           ; Not using multiplication here, it would need to store ebx and speedup effect is negligible.
        mov     ah,al
        mov     dx,ax
        rol     eax,16
        mov     ax,dx                   ; eax = sample in all four bytes
        stosd                           ; low half of the qword
        stosd                           ; high half of the qword
        loop    PIXEL

ToEnd:
        ret                             ; _cdecl return

Conv8_64_MMX endp


;*************************************************************************************

; Conv16_4_MMX(Dest, Src, count)
; Reduces 16-bit samples to packed 4-bit samples by keeping the top nibble of
; every word.  Two samples share one output byte, the first one in the high
; nibble.  When count is odd the final byte has its low nibble cleared.
;   Dest  - packed output
;   Src   - input, one 16-bit word per sample
;   count - number of samples to convert
; Uses eax, ecx and mm0, mm1, mm3; emms is issued after the MMX part.
        public  Conv16_4_MMX
Conv16_4_MMX proc \
        uses edi esi, \
        Dest:ptr byte, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]             ; ecx = samples still to convert
        mov     edi,[Dest]              ; edi = destination pointer
        mov     esi,[Src]               ; esi = source pointer

        sub     ecx,8
        jb      PIXEL1                  ; count is unsigned: fewer than 8 samples

        mov     eax,00F000F0h
        movd    mm3,eax
        punpckldq mm3,mm3               ; mm3 = 00F0h in each of the four words
PIXEL8:
        movq    mm0,qword ptr [esi]     ; samples 1..4
        psrlw   mm0,8                   ; keep the high byte of every word
        movq    mm1,qword ptr [esi+8]   ; samples 5..8
        psrlw   mm1,8
        packuswb mm0,mm1                ; 8 high bytes; each word holds an (even, odd) pair
        movq    mm1,mm0
        add     esi,16
        pand    mm0,mm3                 ; upper nibble of the even sample -> bits 4..7
        psrlw   mm1,12                  ; upper nibble of the odd sample  -> bits 0..3
        por     mm0,mm1                 ; each word = 00h : packed pair
        packuswb mm0,mm0                ; squeeze the four words to four bytes
        movd    dword ptr [edi],mm0
        add     edi,4
        sub     ecx,8
        jae     PIXEL8
        emms

PIXEL1:
        add     ecx,8                   ; ecx = 0..7 samples not yet converted
        jz      ToEnd
        cld
PIXEL:
        inc     esi                     ; skip the low byte of the first sample
        lodsb                           ; high byte of the first sample
        and     al,0F0h

        dec     ecx
        jnz     NIBBLE2
        stosb                           ; incomplete pair: low nibble stays zero
        jmp     ToEnd

NIBBLE2:
        mov     ah,al
        inc     esi                     ; skip the low byte; must be the full 32-bit register,
                                        ; a 16-bit increment would lose the carry at a 64 KiB boundary
        lodsb                           ; high byte of the second sample
        and     al,0F0h
        ror     al,4
        or      al,ah
        stosb                           ; both nibbles
        loop    PIXEL

ToEnd:
        ret                             ; _cdecl return

Conv16_4_MMX endp


;*************************************************************************************

; Conv16_8_MMX(Dest, Src, count)
; Reduces 16-bit samples to 8 bits by keeping the high byte of every word.
;   Dest  - output, one byte per sample
;   Src   - input, one 16-bit word per sample
;   count - number of samples to convert
; Uses eax, ecx and mm0; emms is issued after the MMX part.
        public  Conv16_8_MMX
Conv16_8_MMX proc \
        uses edi esi, \
        Dest:ptr qword, \
        Src:ptr byte, \
        count:DWORD

        mov     esi,[Src]               ; esi = read cursor
        mov     edi,[Dest]              ; edi = write cursor
        mov     ecx,[count]             ; ecx = samples still to convert

        sub     ecx,4
        jl      Leftover                ; not even one group of 4 samples

Block4:
        movq    mm0,qword ptr[esi]      ; samples 1..4
        psrlw   mm0,8                   ; high byte moved down, upper byte cleared
        packuswb mm0,mm0                ; values are 0..255, so the pack does not saturate
        movd    dword ptr [edi],mm0
        add     esi,8
        add     edi,4
        sub     ecx,4
        jae     Block4
        emms

Leftover:
        cld
        add     ecx,4                   ; ecx = 0..3 samples not yet converted
        jz      Finished

NextOne:
        lodsw
        mov     al,ah                   ; keep the high byte
        stosb
        dec     ecx
        jnz     NextOne

Finished:
        ret                             ; _cdecl return

Conv16_8_MMX endp


;*************************************************************************************

; Conv16_32_MMX(Dest, Src, count)
; Widens 16-bit samples to 32 bits by copying the word into both halves of
; the output dword (1234h -> 12341234h).
;   Dest  - output, one dword per sample
;   Src   - input, one 16-bit word per sample
;   count - number of samples to convert
; Uses eax, ecx, edx and mm0; emms is issued after the MMX part.
        public  Conv16_32_MMX
Conv16_32_MMX proc \
        uses edi esi, \
        Dest:ptr dword, \
        Src:ptr byte, \
        count:DWORD

        mov     ecx,[count]             ; ecx = samples still to convert
        jecxz   Finished                ; empty input

        mov     esi,[Src]               ; esi = read cursor
        mov     edi,[Dest]              ; edi = write cursor

        sub     ecx,2
        jl      Leftover                ; a single sample only

Block2:
        movd    mm0,dword ptr[esi]      ; samples 1,2
        punpcklwd mm0,mm0               ; every word paired with itself
        movq    qword ptr [edi],mm0
        add     esi,4
        add     edi,8
        sub     ecx,2
        jae     Block2
        emms

Leftover:
        add     ecx,2                   ; ecx = 0 or 1 sample not yet converted
        jz      Finished
        cld

NextOne:
        lodsw
        mov     dx,ax
        shl     eax,16                  ; sample into the upper half
        mov     ax,dx                   ; ... and into the lower half
        stosd
        dec     ecx
        jnz     NextOne

Finished:
        ret                             ; _cdecl return

Conv16_32_MMX endp


;*************************************************************************************

; Conv16_64_MMX(Dest, Src, count)
; Widens 16-bit samples to 64 bits by repeating the word in all four words of
; the output qword.
;   Dest  - output, one qword per sample
;   Src   - input, one 16-bit word per sample
;   count - number of samples to convert
; Uses eax, ecx, edx and mm0, mm1; emms is issued after the MMX part.
        public  Conv16_64_MMX
Conv16_64_MMX proc \
        uses edi esi, \
        Dest:ptr dword, \
        Src:ptr byte, \
        count:DWORD

        mov     esi,[Src]               ; esi = read cursor
        mov     edi,[Dest]              ; edi = write cursor
        mov     ecx,[count]             ; ecx = samples still to convert

        sub     ecx,2
        jl      Leftover                ; fewer than 2 samples

Block2:
        movd    mm0,dword ptr[esi]      ; samples 1,2
        add     esi,4
        punpcklwd mm0,mm0               ; sample 1 fills the low dword, sample 2 the high dword
        movq    mm1,mm0
        punpckldq mm0,mm0               ; sample 1 in all four words
        punpckhdq mm1,mm1               ; sample 2 in all four words
        movq    qword ptr [edi],mm0
        movq    qword ptr [edi+8],mm1
        add     edi,16
        sub     ecx,2
        jae     Block2
        emms

Leftover:
        add     ecx,2                   ; ecx = 0 or 1 sample not yet converted
        jz      Finished
        cld

NextOne:
        lodsw
        mov     dx,ax
        shl     eax,16                  ; sample into the upper half
        mov     ax,dx                   ; ... and into the lower half
        stosd                           ; low half of the qword
        stosd                           ; high half of the qword
        dec     ecx
        jnz     NextOne

Finished:
        ret                             ; _cdecl return

Conv16_64_MMX endp



;*************************************************************************************

; Conv32_16_MMX(Dest, Src, count)
; Reduces 32-bit samples to 16 bits by keeping the upper word of every dword.
;   Dest  - output, one 16-bit word per sample
;   Src   - input, one dword per sample
;   count - number of samples to convert
; Uses ecx and mm0; emms is issued after the MMX part.
        public  Conv32_16_MMX
Conv32_16_MMX proc \
        uses edi esi, \
        Dest:ptr word, \
        Src:ptr dword, \
        count:DWORD

        mov     ecx,[count]             ; ecx = samples still to convert
        mov     edi,[Dest]              ; edi = destination pointer
        mov     esi,[Src]               ; esi = source pointer

        sub     ecx,2
        jb      PIXEL1                  ; count is unsigned: fewer than 2 samples

PIXEL2:
        movq    mm0,qword ptr[esi]      ; samples 1,2
        add     esi,8
        psrad   mm0,16                  ; arithmetic shift: each dword becomes its sign-extended upper
                                        ; word, so the signed pack below never saturates
                                        ; (a logical shift clamps words >= 8000h to 7FFFh)
        packssdw mm0,mm0                ; two exact upper words in the low dword
        movd    dword ptr [edi],mm0
        add     edi,4
        sub     ecx,2
        jae     PIXEL2
        emms

PIXEL1:
        add     ecx,2                   ; ecx = 0 or 1 sample not yet converted
        jz      ToEnd
        cld
PIXEL:
        add     esi,2                   ; skip the lower word
        movsw                           ; copy the upper word
        loop    PIXEL

ToEnd:
        ret                             ; _cdecl return

Conv32_16_MMX endp


;*************************************************************************************

; Conv32_64_MMX(Dest, Src, count)
; Widens 32-bit samples to 64 bits by copying the dword into both halves of
; the output qword.  Returns without doing anything when count is zero or when
; either pointer is null.
;   Dest  - output, one qword per sample
;   Src   - input, one dword per sample
;   count - number of samples to convert
; Uses ecx, edx and mm0; emms is issued once the loop has finished.
        public  Conv32_64_MMX
Conv32_64_MMX proc \
        uses esi, \
        Dest:ptr qword, \
        Src:ptr dword, \
        count:DWORD

        mov     ecx,[count]             ; ecx = samples still to convert
        jecxz   Finished                ; empty input

        mov     edx,[Dest]              ; edx = write cursor
        test    edx,edx
        jz      Finished                ; no destination
        mov     esi,[Src]               ; esi = read cursor
        test    esi,esi
        jz      Finished                ; no source

NextOne:
        movd    mm0,dword ptr [esi]
        punpckldq mm0,mm0               ; dword copied into the upper half
        movq    qword ptr [edx],mm0
        add     esi,4
        add     edx,8
        dec     ecx
        jnz     NextOne
        emms

Finished:
        ret                             ; _cdecl return

Conv32_64_MMX endp



        end
