这两天等待应聘的offer,闲着无聊,回忆起一年多以前做的东西:接收mpeg4标准的rtp流,收到后在本地回放。我使用的是ddraw来显示。一开始为了先显示出东西进行调试,并且当时重点在同步以及接收流,所以当时用了yuv2bmp这个东西:把yv12转换成bmp,然后一帧一帧贴到窗口dc上。呵呵,效果可想而知 ;) 既经过了矩阵转换,又要draw->dc。后来项目进入优化阶段,决定用ddraw的overlay来显示yv12视频。具体的方法就是参照Dxsdk7里的“蚊子”程序,把yv12直接传到overlay显示 ;) 蚊子显示的是yuv422,和yv12相比较,只是在逐行memcpy到pSurf的时候排列不同而已,具体参照yuv标准就行了。
(简介ddraw的使用;dxsdk7是ddraw的最后一个版本了)
yuv2bmp的实现: void yuv2rgb_32(uint8_t *puc_y, int stride_y, uint8_t *puc_u, uint8_t *puc_v, int stride_uv, uint8_t *puc_out, int width_y, int height_y, unsigned int _stride_out) { /* int x, y; int stride_diff = 4 * (_stride_out - width_y);
if (height_y < 0) { // we are flipping our output upside-down height_y = -height_y; puc_y += (height_y - 1) * stride_y ; puc_u += (height_y/2 - 1) * stride_uv; puc_v += (height_y/2 - 1) * stride_uv; stride_y = -stride_y; stride_uv = -stride_uv; }
for (y=0; y<height_y; y++) { for (x=0; x<width_y; x++) { signed int _r,_g,_b; signed int r, g, b; signed int y, u, v;
y = puc_y[x] +10;//- 16; u = puc_u[x>>1]-128; v = puc_v[x>>1]-128;
_r = _R(y,u,v); _g = _G(y,u,v); _b = _B(y,u,v);
r = _S(_r); g = _S(_g); b = _S(_b);
puc_out[0] = r; puc_out[1] = g; puc_out[2] = b; puc_out[3] = 0;
puc_out+=4; }
puc_y += stride_y; if (y%2) { puc_u += stride_uv; puc_v += stride_uv; } puc_out += stride_diff; }*/
///////////// Intel MMX ///////////////
int y, horiz_count; int stride_out = width_y <<2;
if (height_y < 0) { // we are flipping our output upside-down height_y = -height_y; puc_y += (height_y - 1) * stride_y ; puc_u += ((height_y>>1) - 1) * stride_uv; puc_v += ((height_y>>1) - 1) * stride_uv; stride_y = -stride_y; stride_uv = -stride_uv; }
horiz_count = -(width_y >> 3);
for (y=0; y<height_y; y++) { _asm { push eax push ebx push ecx push edx push edi
mov eax, puc_out mov ebx, puc_y mov ecx, puc_u mov edx, puc_v mov edi, horiz_count horiz_loop:
movd mm2, [ecx] pxor mm7, mm7
movd mm3, [edx] punpcklbw mm2, mm7 ; mm2 = __u3__u2__u1__u0
movq mm0, [ebx] ; mm0 = y7y6y5y4y3y2y1y0 punpcklbw mm3, mm7 ; mm3 = __v3__v2__v1__v0
movq mm1, mmw_0x00ff ; mm1 = 00ff00ff00ff00ff
psubusb mm0, mmb_0x10 ; mm0 -= 16
psubw mm2, mmw_0x0080 ; mm2 -= 128 pand mm1, mm0 ; mm1 = __y6__y4__y2__y0
psubw mm3, mmw_0x0080 ; mm3 -= 128 psllw mm1, 3 ; mm1 *= 8
psrlw mm0, 8 ; mm0 = __y7__y5__y3__y1 psllw mm2, 3 ; mm2 *= 8
pmulhw mm1, mmw_mult_Y ; mm1 *= luma coeff psllw mm0, 3 ; mm0 *= 8
psllw mm3, 3 ; mm3 *= 8 movq mm5, mm3 ; mm5 = mm3 = v
pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma movq mm4, mm2 ; mm4 = mm2 = u
pmulhw mm0, mmw_mult_Y ; mm0 *= luma coeff movq mm7, mm1 ; even luma part
pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff paddsw mm7, mm5 ; mm7 = luma + chroma __r6__r4__r2__r0
pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff packuswb mm7, mm7 ; mm7 = r6r4r2r0r6r4r2r0
pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma paddsw mm5, mm0 ; mm5 = luma + chroma __r7__r5__r3__r1
packuswb mm5, mm5 ; mm6 = r7r5r3r1r7r5r3r1 paddsw mm2, mm3 ; mm2 = green chroma
movq mm3, mm1 ; mm3 = __y6__y4__y2__y0 movq mm6, mm1 ; mm6 = __y6__y4__y2__y0
paddsw mm3, mm4 ; mm3 = luma + chroma __b6__b4__b2__b0 paddsw mm6, mm2 ; mm6 = luma + chroma __g6__g4__g2__g0 punpcklbw mm7, mm5 ; mm7 = r7r6r5r4r3r2r1r0 paddsw mm2, mm0 ; odd luma part plus chroma part __g7__g5__g3__g1
packuswb mm6, mm6 ; mm2 = g6g4g2g0g6g4g2g0 packuswb mm2, mm2 ; mm2 = g7g5g3g1g7g5g3g1
packuswb mm3, mm3 ; mm3 = b6b4b2b0b6b4b2b0 paddsw mm4, mm0 ; odd luma part plus chroma part __b7__b5__b3__b1
packuswb mm4, mm4 ; mm4 = b7b5b3b1b7b5b3b1 punpcklbw mm6, mm2 ; mm6 = g7g6g5g4g3g2g1g0
punpcklbw mm3, mm4 ; mm3 = b7b6b5b4b3b2b1b0
// 32-bit shuffle.... pxor mm0, mm0 ; is this needed?
movq mm1, mm6 ; mm1 = g7g6g5g4g3g2g1g0 punpcklbw mm1, mm0 ; mm1 = __g3__g2__g1__g0
movq mm0, mm3 ; mm0 = b7b6b5b4b3b2b1b0 punpcklbw mm0, mm7 ; mm0 = r3b3r2b2r1b1r0b0
movq mm2, mm0 ; mm2 = r3b3r2b2r1b1r0b0
punpcklbw mm0, mm1 ; mm0 = __r1g1b1__r0g0b0 punpckhbw mm2, mm1 ; mm2 = __r3g3b3__r2g2b2
// 32-bit save... movq [eax], mm0 ; eax[0] = __r1g1b1__r0g0b0 movq mm1, mm6 ; mm1 = g7g6g5g4g3g2g1g0
movq 8[eax], mm2 ; eax[8] = __r3g3b3__r2g2b2
// 32-bit shuffle.... pxor mm0, mm0 ; is this needed?
punpckhbw mm1, mm0 ; mm1 = __g7__g6__g5__g4
movq mm0, mm3 ; mm0 = b7b6b5b4b3b2b1b0 punpckhbw mm0, mm7 ; mm0 = r7b7r6b6r5b5r4b4
movq mm2, mm0 ; mm2 = r7b7r6b6r5b5r4b4
punpcklbw mm0, mm1 ; mm0 = __r5g5b5__r4g4b4 punpckhbw mm2, mm1 ; mm2 = __r7g7b7__r6g6b6
//32-bit save... add ebx, 8 ; puc_y += 8; add ecx, 4 ; puc_u += 4;
movq 16[eax], mm0 ; eax[16] = __r5g5b5__r4g4b4 add edx, 4 ; puc_v += 4;
movq 24[eax], mm2 ; eax[24] = __r7g7b7__r6g6b6 // 0 1 2 3 4 5 6 7 rgb save order
add eax, 32 ; puc_out += 32
inc edi jne horiz_loop
pop edi pop edx pop ecx pop ebx pop eax
emms }
puc_y += stride_y; if (y&0x01){//%2) { puc_u += stride_uv; puc_v += stride_uv; } puc_out += stride_out; } }

|