#include <stdio.h>

// A matrix: 4x4 signed 16-bit elements, stored row by row
short s1[16] = {
	 1,	 2,	 3,	 4,
	 5,	 6,	 7,	 8,
	 9,	10,	11,	12,
	13,	14,	15,	16,
};

// Transpose of the B matrix, stored row by row: row k of s2 is column k of B,
// so every element of the product is the dot product of one s1 row with one
// s2 row.
short s2[16] = {
	17,	21,	25,	29,
	18,	22,	26,	30,
	19,	23,	27,	31,
	20,	24,	28,	32
};


// Destination matrix: filled by main() with the product A * B
short d[16];

// Loop counters used by main() while printing d
int j, i;

/*
 * Multiply a 4x4 matrix of shorts by a second 4x4 matrix given in transposed
 * form, using MMX, and store the 4x4 result.
 *
 *   a   - left operand, 16 shorts, row-major
 *   bt  - transpose of the right operand, 16 shorts, row-major
 *   dst - receives the 16 result shorts, row-major; each 32-bit sum is
 *         narrowed with signed saturation (packssdw)
 *
 * The whole routine is a single asm statement so the compiler cannot place
 * code between the instructions, and every register it touches is declared
 * as an operand or a clobber. Requires an x86 target with MMX and a compiler
 * that understands GCC extended asm.
 */
static void matmul4x4_mmx( const short *a, const short *bt, short *dst ){

	int rows_left = 4;

	__asm__ volatile(
		"1:\n\t"
		/* mm0 = current row of A: a0 a1 a2 a3 */
		"movq      (%[a]), %%mm0\n\t"
		"movq      %%mm0, %%mm1\n\t"
		"movq      %%mm0, %%mm2\n\t"
		"punpckhdq %%mm2, %%mm0\n\t"	/* mm0 = a2 a3 a2 a3 */
		"punpckldq %%mm2, %%mm1\n\t"	/* mm1 = a0 a1 a0 a1 */

		/* keep both broadcast halves for the second pair of columns */
		"movq      %%mm0, %%mm6\n\t"
		"movq      %%mm1, %%mm7\n\t"

		/* result columns 0 and 1: rows 0 and 1 of bt */
		"movq      (%[bt]), %%mm2\n\t"
		"movq      8(%[bt]), %%mm4\n\t"
		"movq      %%mm2, %%mm3\n\t"
		"punpckhdq %%mm4, %%mm2\n\t"	/* high halves of bt rows 0, 1 */
		"punpckldq %%mm4, %%mm3\n\t"	/* low halves of bt rows 0, 1 */
		"pmaddwd   %%mm2, %%mm0\n\t"
		"pmaddwd   %%mm3, %%mm1\n\t"
		/* pmaddwd yields 32-bit sums, so they must be added as dwords;
		   a word-wise add would drop the carry into the upper half */
		"paddd     %%mm1, %%mm0\n\t"

		/* result columns 2 and 3: rows 2 and 3 of bt */
		"movq      %%mm6, %%mm1\n\t"
		"movq      %%mm7, %%mm2\n\t"
		"movq      16(%[bt]), %%mm3\n\t"
		"movq      24(%[bt]), %%mm5\n\t"
		"movq      %%mm3, %%mm4\n\t"
		"punpckhdq %%mm5, %%mm3\n\t"	/* high halves of bt rows 2, 3 */
		"punpckldq %%mm5, %%mm4\n\t"	/* low halves of bt rows 2, 3 */
		"pmaddwd   %%mm3, %%mm1\n\t"
		"pmaddwd   %%mm4, %%mm2\n\t"
		"paddd     %%mm2, %%mm1\n\t"

		/* narrow the four 32-bit sums to shorts and store one result row */
		"packssdw  %%mm1, %%mm0\n\t"
		"movq      %%mm0, (%[dst])\n\t"

		/* advance one row (4 shorts = 8 bytes) in a and dst */
		"add       $8, %[a]\n\t"
		"add       $8, %[dst]\n\t"
		"subl      $1, %[n]\n\t"
		"jnz       1b\n\t"

		/* leave MMX state so later x87 code sees an empty FP stack */
		"emms"
		: [a] "+r" (a), [dst] "+r" (dst), [n] "+r" (rows_left)
		: [bt] "r" (bt)
		: "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
		  "memory", "cc"
	);
}

/*
 * Compute d = s1 * B (with B supplied transposed in s2) and print the 4x4
 * result to stderr, one tab-separated row per line. Always returns 0.
 */
int main( int argc, char** argv ){

	(void)argc;
	(void)argv;

	matmul4x4_mmx( s1, s2, d );

	for( j = 0 ; j < 4 ; j++ ){
		for( i = 0 ; i < 4 ; i++ ){
			fprintf( stderr, "\t%3d", d[j*4+i] );
		}
		fprintf( stderr, "\n" );
	}

	return 0;

}

