diff options
Diffstat (limited to 'source/scale_win.cc')
-rw-r--r-- | source/scale_win.cc | 222 |
1 files changed, 55 insertions, 167 deletions
diff --git a/source/scale_win.cc b/source/scale_win.cc index 840b973..8370ef4 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -105,117 +105,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x1 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 4 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 4 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -// Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - align 4 - wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] @@ -234,9 +123,8 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, // Blends 32x1 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) -void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride @@ -273,9 +161,8 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, // Blends 32x2 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) -void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr @@ -331,8 +218,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] pand xmm0, xmm5 pand xmm1, xmm5 @@ -366,16 +253,16 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, [eax + esi * 2] - movdqa xmm3, [eax + esi * 2 + 16] - movdqa xmm4, [eax + edi] - movdqa xmm5, [eax + edi + 16] + movdqu xmm2, [eax + esi * 2] + movdqu xmm3, [eax + esi * 2 + 16] + movdqu xmm4, [eax + edi] + movdqu xmm5, [eax + edi + 16] lea eax, [eax + 32] pavgb xmm2, xmm4 pavgb xmm3, xmm5 @@ -429,8 +316,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm1 palignr xmm1, xmm0, 8 @@ -483,8 +370,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, align 4 wloop: - movdqa xmm0, [eax] // pixels 0..7 - movdqa xmm1, [eax + esi] + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] pavgb xmm0, xmm1 pshufb xmm0, xmm2 pmaddubsw xmm0, xmm5 @@ -501,8 +388,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqa xmm0, [eax + 16] // pixels 16..23 - movdqa xmm1, [eax + esi + 16] + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm1 pshufb xmm0, xmm4 @@ -542,8 +429,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, align 4 wloop: - movdqa xmm0, [eax] // pixels 0..7 - movdqa xmm1, [eax + esi] + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] pavgb xmm1, xmm0 pavgb xmm0, xmm1 pshufb xmm0, xmm2 @@ -562,8 +449,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqa xmm0, [eax + 16] // pixels 16..23 - movdqa xmm1, [eax + esi + 16] + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm1, xmm0 pavgb xmm0, xmm1 @@ -599,8 +486,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, align 4 xloop: - movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 lea eax, [eax + 32] pshufb xmm0, xmm4 pshufb xmm1, xmm5 @@ -635,8 +522,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, align 4 xloop: - movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqa xmm6, [eax + esi] + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqu xmm6, [eax + esi] movhlps xmm1, xmm0 movhlps xmm7, xmm6 punpcklbw xmm0, xmm5 @@ -645,7 +532,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, punpcklbw xmm7, xmm5 paddusw xmm0, xmm6 paddusw xmm1, xmm7 - movdqa xmm6, [eax + esi * 2] + movdqu xmm6, [eax + esi * 2] lea eax, [eax + 16] movhlps xmm7, xmm6 punpcklbw xmm6, xmm5 @@ -701,9 +588,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, align 4 xloop: - movdqa xmm0, [eax] // average 2 rows into xmm0 - pavgb xmm0, [eax + esi] + movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm1, [eax + esi] lea eax, [eax + 16] + pavgb xmm0, xmm1 movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 pshufb xmm1, xmm2 @@ -750,7 +638,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, align 4 xloop: // first row - movdqa xmm0, [esi] + movdqu xmm0, [esi] lea eax, [esi + edx] movdqa xmm1, xmm0 punpcklbw xmm0, xmm4 @@ -763,7 +651,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // sum remaining rows align 4 yloop: - movdqa xmm2, [eax] // read 16 pixels + movdqu xmm2, [eax] // read 16 pixels lea eax, [eax + edx] // advance to next row movdqa xmm3, xmm2 punpcklbw xmm2, xmm4 @@ -775,8 +663,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, align 4 ydone: - movdqa [edi], xmm0 - movdqa [edi + 16], xmm1 + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 lea edi, [edi + 32] sub ecx, 16 @@ -891,14 +779,14 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, align 4 wloop: - movdqa xmm0, [eax] + movdqu xmm0, [eax] lea eax, [eax + 16] movdqa xmm1, xmm0 punpcklbw xmm0, xmm0 punpckhbw xmm1, xmm1 sub ecx, 32 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] jg wloop @@ -920,12 +808,12 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] shufps xmm0, xmm1, 0xdd sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop @@ -947,15 +835,15 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm0 shufps xmm0, xmm1, 0x88 // even pixels shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop @@ -978,10 +866,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 @@ -990,7 +878,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop @@ -1027,7 +915,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, punpckldq xmm2, xmm3 punpcklqdq xmm0, xmm2 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop @@ -1076,7 +964,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] jg wloop @@ -1267,14 +1155,14 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, align 4 wloop: - movdqa xmm0, [eax] + movdqu xmm0, [eax] lea eax, [eax + 16] movdqa xmm1, xmm0 punpckldq xmm0, xmm0 punpckhdq xmm1, xmm1 sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] jg wloop |