OLD | NEW |
1 ; | 1 ; |
2 ; jdmrgss2.asm - merged upsampling/color conversion (SSE2) | 2 ; jdmrgss2.asm - merged upsampling/color conversion (SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; | 5 ; |
6 ; Based on | 6 ; Based on |
7 ; x86 SIMD extension for IJG JPEG library | 7 ; x86 SIMD extension for IJG JPEG library |
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
10 ; | 10 ; |
11 ; This file should be assembled with NASM (Netwide Assembler), | 11 ; This file should be assembled with NASM (Netwide Assembler), |
12 ; can *not* be assembled with Microsoft's MASM or any compatible | 12 ; can *not* be assembled with Microsoft's MASM or any compatible |
13 ; assembler (including Borland's Turbo Assembler). | 13 ; assembler (including Borland's Turbo Assembler). |
14 ; NASM is available from http://nasm.sourceforge.net/ or | 14 ; NASM is available from http://nasm.sourceforge.net/ or |
(...skipping 242 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
257 | 257 |
258 cmp ecx, byte SIZEOF_XMMWORD | 258 cmp ecx, byte SIZEOF_XMMWORD |
259 jb short .column_st32 | 259 jb short .column_st32 |
260 | 260 |
261 test edi, SIZEOF_XMMWORD-1 | 261 test edi, SIZEOF_XMMWORD-1 |
262 jnz short .out1 | 262 jnz short .out1 |
263 ; --(aligned)------------------- | 263 ; --(aligned)------------------- |
264 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA | 264 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
265 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD | 265 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
266 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF | 266 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF |
267 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr | |
268 jmp short .out0 | 267 jmp short .out0 |
269 .out1: ; --(unaligned)----------------- | 268 .out1: ; --(unaligned)----------------- |
270 » pcmpeqb xmmH,xmmH» » » ; xmmH=(all 1's) | 269 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
271 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [edi], xmmA | 270 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
272 » add» edi, byte SIZEOF_XMMWORD» ; outptr | 271 » movdqu» XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF |
273 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [edi], xmmD | |
274 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
275 » maskmovdqu xmmF,xmmH» » » ; movntdqu XMMWORD [edi], xmmF | |
276 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
277 .out0: | 272 .out0: |
| 273 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr |
278 sub ecx, byte SIZEOF_XMMWORD | 274 sub ecx, byte SIZEOF_XMMWORD |
279 jz near .endcolumn | 275 jz near .endcolumn |
280 | 276 |
281 add esi, byte SIZEOF_XMMWORD ; inptr0 | 277 add esi, byte SIZEOF_XMMWORD ; inptr0 |
282 dec al ; Yctr | 278 dec al ; Yctr |
283 jnz near .Yloop_2nd | 279 jnz near .Yloop_2nd |
284 | 280 |
285 add ebx, byte SIZEOF_XMMWORD ; inptr1 | 281 add ebx, byte SIZEOF_XMMWORD ; inptr1 |
286 add edx, byte SIZEOF_XMMWORD ; inptr2 | 282 add edx, byte SIZEOF_XMMWORD ; inptr2 |
287 jmp near .columnloop | 283 jmp near .columnloop |
288 alignx 16,7 | 284 alignx 16,7 |
289 | 285 |
290 .column_st32: | 286 .column_st32: |
291 pcmpeqb xmmH,xmmH ; xmmH=(all 1's) | |
292 lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE | 287 lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE |
293 cmp ecx, byte 2*SIZEOF_XMMWORD | 288 cmp ecx, byte 2*SIZEOF_XMMWORD |
294 jb short .column_st16 | 289 jb short .column_st16 |
295 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [edi], xmmA | 290 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
296 » add» edi, byte SIZEOF_XMMWORD» ; outptr | 291 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
297 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [edi], xmmD | 292 » add» edi, byte 2*SIZEOF_XMMWORD» ; outptr |
298 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
299 movdqa xmmA,xmmF | 293 movdqa xmmA,xmmF |
300 sub ecx, byte 2*SIZEOF_XMMWORD | 294 sub ecx, byte 2*SIZEOF_XMMWORD |
301 jmp short .column_st15 | 295 jmp short .column_st15 |
302 .column_st16: | 296 .column_st16: |
303 cmp ecx, byte SIZEOF_XMMWORD | 297 cmp ecx, byte SIZEOF_XMMWORD |
304 jb short .column_st15 | 298 jb short .column_st15 |
305 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [edi], xmmA | 299 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
306 add edi, byte SIZEOF_XMMWORD ; outptr | 300 add edi, byte SIZEOF_XMMWORD ; outptr |
307 movdqa xmmA,xmmD | 301 movdqa xmmA,xmmD |
308 sub ecx, byte SIZEOF_XMMWORD | 302 sub ecx, byte SIZEOF_XMMWORD |
309 .column_st15: | 303 .column_st15: |
310 %ifdef STRICT_MEMORY_ACCESS | |
311 ; Store the lower 8 bytes of xmmA to the output when it has enough | 304 ; Store the lower 8 bytes of xmmA to the output when it has enough |
312 ; space. | 305 ; space. |
313 cmp ecx, byte SIZEOF_MMWORD | 306 cmp ecx, byte SIZEOF_MMWORD |
314 jb short .column_st7 | 307 jb short .column_st7 |
315 movq MMWORD [edi], xmmA | 308 movq MMWORD [edi], xmmA |
316 add edi, byte SIZEOF_MMWORD | 309 add edi, byte SIZEOF_MMWORD |
317 sub ecx, byte SIZEOF_MMWORD | 310 sub ecx, byte SIZEOF_MMWORD |
318 psrldq xmmA, SIZEOF_MMWORD | 311 psrldq xmmA, SIZEOF_MMWORD |
319 .column_st7: | 312 .column_st7: |
320 ; Store the lower 4 bytes of xmmA to the output when it has enough | 313 ; Store the lower 4 bytes of xmmA to the output when it has enough |
(...skipping 13 matching lines...) Expand all Loading... |
334 mov WORD [edi], ax | 327 mov WORD [edi], ax |
335 add edi, byte SIZEOF_WORD | 328 add edi, byte SIZEOF_WORD |
336 sub ecx, byte SIZEOF_WORD | 329 sub ecx, byte SIZEOF_WORD |
337 shr eax, 16 | 330 shr eax, 16 |
338 .column_st1: | 331 .column_st1: |
339 ; Store the lower 1 byte of eax to the output when it has enough | 332 ; Store the lower 1 byte of eax to the output when it has enough |
340 ; space. | 333 ; space. |
341 test ecx, ecx | 334 test ecx, ecx |
342 jz short .endcolumn | 335 jz short .endcolumn |
343 mov BYTE [edi], al | 336 mov BYTE [edi], al |
344 %else | |
345 mov eax,ecx | |
346 xor ecx, byte 0x0F | |
347 shl ecx, 2 | |
348 movd xmmB,ecx | |
349 psrlq xmmH,4 | |
350 pcmpeqb xmmE,xmmE | |
351 psrlq xmmH,xmmB | |
352 psrlq xmmE,xmmB | |
353 punpcklbw xmmE,xmmH | |
354 ; ---------------- | |
355 mov ecx,edi | |
356 and ecx, byte SIZEOF_XMMWORD-1 | |
357 jz short .adj0 | |
358 add eax,ecx | |
359 cmp eax, byte SIZEOF_XMMWORD | |
360 ja short .adj0 | |
361 and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary | |
362 shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx | |
363 movdqa xmmG,xmmA | |
364 movdqa xmmC,xmmE | |
365 pslldq xmmA, SIZEOF_XMMWORD/2 | |
366 pslldq xmmE, SIZEOF_XMMWORD/2 | |
367 movd xmmD,ecx | |
368 sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT | |
369 jb short .adj1 | |
370 movd xmmF,ecx | |
371 psllq xmmA,xmmF | |
372 psllq xmmE,xmmF | |
373 jmp short .adj0 | |
374 .adj1: neg ecx | |
375 movd xmmF,ecx | |
376 psrlq xmmA,xmmF | |
377 psrlq xmmE,xmmF | |
378 psllq xmmG,xmmD | |
379 psllq xmmC,xmmD | |
380 por xmmA,xmmG | |
381 por xmmE,xmmC | |
382 .adj0: ; ---------------- | |
383 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA | |
384 %endif ; STRICT_MEMORY_ACCESS ; --------------- | |
385 | 337 |
386 %else ; RGB_PIXELSIZE == 4 ; ----------- | 338 %else ; RGB_PIXELSIZE == 4 ; ----------- |
387 | 339 |
388 %ifdef RGBX_FILLER_0XFF | 340 %ifdef RGBX_FILLER_0XFF |
389 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) | 341 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) |
390 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) | 342 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) |
391 %else | 343 %else |
392 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) | 344 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) |
393 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) | 345 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) |
394 %endif | 346 %endif |
(...skipping 24 matching lines...) Expand all Loading... |
419 cmp ecx, byte SIZEOF_XMMWORD | 371 cmp ecx, byte SIZEOF_XMMWORD |
420 jb short .column_st32 | 372 jb short .column_st32 |
421 | 373 |
422 test edi, SIZEOF_XMMWORD-1 | 374 test edi, SIZEOF_XMMWORD-1 |
423 jnz short .out1 | 375 jnz short .out1 |
424 ; --(aligned)------------------- | 376 ; --(aligned)------------------- |
425 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA | 377 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
426 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD | 378 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
427 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC | 379 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC |
428 movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH | 380 movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH |
429 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr | |
430 jmp short .out0 | 381 jmp short .out0 |
431 .out1: ; --(unaligned)----------------- | 382 .out1: ; --(unaligned)----------------- |
432 » pcmpeqb xmmE,xmmE» » » ; xmmE=(all 1's) | 383 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
433 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA | 384 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
434 » add» edi, byte SIZEOF_XMMWORD» ; outptr | 385 » movdqu» XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC |
435 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [edi], xmmD | 386 » movdqu» XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH |
436 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
437 » maskmovdqu xmmC,xmmE» » » ; movntdqu XMMWORD [edi], xmmC | |
438 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
439 » maskmovdqu xmmH,xmmE» » » ; movntdqu XMMWORD [edi], xmmH | |
440 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
441 .out0: | 387 .out0: |
| 388 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr |
442 sub ecx, byte SIZEOF_XMMWORD | 389 sub ecx, byte SIZEOF_XMMWORD |
443 jz near .endcolumn | 390 jz near .endcolumn |
444 | 391 |
445 add esi, byte SIZEOF_XMMWORD ; inptr0 | 392 add esi, byte SIZEOF_XMMWORD ; inptr0 |
446 dec al ; Yctr | 393 dec al ; Yctr |
447 jnz near .Yloop_2nd | 394 jnz near .Yloop_2nd |
448 | 395 |
449 add ebx, byte SIZEOF_XMMWORD ; inptr1 | 396 add ebx, byte SIZEOF_XMMWORD ; inptr1 |
450 add edx, byte SIZEOF_XMMWORD ; inptr2 | 397 add edx, byte SIZEOF_XMMWORD ; inptr2 |
451 jmp near .columnloop | 398 jmp near .columnloop |
452 alignx 16,7 | 399 alignx 16,7 |
453 | 400 |
454 .column_st32: | 401 .column_st32: |
455 pcmpeqb xmmE,xmmE ; xmmE=(all 1's) | |
456 cmp ecx, byte SIZEOF_XMMWORD/2 | 402 cmp ecx, byte SIZEOF_XMMWORD/2 |
457 jb short .column_st16 | 403 jb short .column_st16 |
458 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA | 404 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
459 » add» edi, byte SIZEOF_XMMWORD» ; outptr | 405 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
460 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [edi], xmmD | 406 » add» edi, byte 2*SIZEOF_XMMWORD» ; outptr |
461 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
462 movdqa xmmA,xmmC | 407 movdqa xmmA,xmmC |
463 movdqa xmmD,xmmH | 408 movdqa xmmD,xmmH |
464 sub ecx, byte SIZEOF_XMMWORD/2 | 409 sub ecx, byte SIZEOF_XMMWORD/2 |
465 .column_st16: | 410 .column_st16: |
466 cmp ecx, byte SIZEOF_XMMWORD/4 | 411 cmp ecx, byte SIZEOF_XMMWORD/4 |
467 jb short .column_st15 | 412 jb short .column_st15 |
468 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA | 413 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
469 add edi, byte SIZEOF_XMMWORD ; outptr | 414 add edi, byte SIZEOF_XMMWORD ; outptr |
470 movdqa xmmA,xmmD | 415 movdqa xmmA,xmmD |
471 sub ecx, byte SIZEOF_XMMWORD/4 | 416 sub ecx, byte SIZEOF_XMMWORD/4 |
472 .column_st15: | 417 .column_st15: |
473 %ifdef STRICT_MEMORY_ACCESS | |
474 ; Store two pixels (8 bytes) of xmmA to the output when it has enough | 418 ; Store two pixels (8 bytes) of xmmA to the output when it has enough |
475 ; space. | 419 ; space. |
476 cmp ecx, byte SIZEOF_XMMWORD/8 | 420 cmp ecx, byte SIZEOF_XMMWORD/8 |
477 jb short .column_st7 | 421 jb short .column_st7 |
478 movq MMWORD [edi], xmmA | 422 movq MMWORD [edi], xmmA |
479 » add» edi, byte SIZEOF_XMMWORD/2 | 423 » add» edi, byte SIZEOF_XMMWORD/8*4 |
480 sub ecx, byte SIZEOF_XMMWORD/8 | 424 sub ecx, byte SIZEOF_XMMWORD/8 |
481 » psrldq» xmmA, 64 | 425 » psrldq» xmmA, SIZEOF_XMMWORD/8*4 |
482 .column_st7: | 426 .column_st7: |
483 ; Store one pixel (4 bytes) of xmmA to the output when it has enough | 427 ; Store one pixel (4 bytes) of xmmA to the output when it has enough |
484 ; space. | 428 ; space. |
485 test ecx, ecx | 429 test ecx, ecx |
486 jz short .endcolumn | 430 jz short .endcolumn |
487 movd DWORD [edi], xmmA | 431 movd DWORD [edi], xmmA |
488 %else | |
489 cmp ecx, byte SIZEOF_XMMWORD/16 | |
490 jb short .endcolumn | |
491 mov eax,ecx | |
492 xor ecx, byte 0x03 | |
493 inc ecx | |
494 shl ecx, 4 | |
495 movd xmmF,ecx | |
496 psrlq xmmE,xmmF | |
497 punpcklbw xmmE,xmmE | |
498 ; ---------------- | |
499 mov ecx,edi | |
500 and ecx, byte SIZEOF_XMMWORD-1 | |
501 jz short .adj0 | |
502 lea eax, [ecx+eax*4] ; RGB_PIXELSIZE | |
503 cmp eax, byte SIZEOF_XMMWORD | |
504 ja short .adj0 | |
505 and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary | |
506 shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx | |
507 movdqa xmmB,xmmA | |
508 movdqa xmmG,xmmE | |
509 pslldq xmmA, SIZEOF_XMMWORD/2 | |
510 pslldq xmmE, SIZEOF_XMMWORD/2 | |
511 movd xmmC,ecx | |
512 sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT | |
513 jb short .adj1 | |
514 movd xmmH,ecx | |
515 psllq xmmA,xmmH | |
516 psllq xmmE,xmmH | |
517 jmp short .adj0 | |
518 .adj1: neg ecx | |
519 movd xmmH,ecx | |
520 psrlq xmmA,xmmH | |
521 psrlq xmmE,xmmH | |
522 psllq xmmB,xmmC | |
523 psllq xmmG,xmmC | |
524 por xmmA,xmmB | |
525 por xmmE,xmmG | |
526 .adj0: ; ---------------- | |
527 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA | |
528 %endif ; STRICT_MEMORY_ACCESS ; --------------- | |
529 | 432 |
530 %endif ; RGB_PIXELSIZE ; --------------- | 433 %endif ; RGB_PIXELSIZE ; --------------- |
531 | 434 |
532 .endcolumn: | 435 .endcolumn: |
533 sfence ; flush the write buffer | 436 sfence ; flush the write buffer |
534 | 437 |
535 .return: | 438 .return: |
536 pop edi | 439 pop edi |
537 pop esi | 440 pop esi |
538 ; pop edx ; need not be preserved | 441 ; pop edx ; need not be preserved |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
606 pop esi | 509 pop esi |
607 ; pop edx ; need not be preserved | 510 ; pop edx ; need not be preserved |
608 ; pop ecx ; need not be preserved | 511 ; pop ecx ; need not be preserved |
609 pop ebx | 512 pop ebx |
610 pop ebp | 513 pop ebp |
611 ret | 514 ret |
612 | 515 |
613 ; For some reason, the OS X linker does not honor the request to align the | 516 ; For some reason, the OS X linker does not honor the request to align the |
614 ; segment unless we do this. | 517 ; segment unless we do this. |
615 align 16 | 518 align 16 |
OLD | NEW |