OLD | NEW |
1 ; | 1 ; |
2 ; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2) | 2 ; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
6 ; | 6 ; |
7 ; Based on | 7 ; Based on |
8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
11 ; | 11 ; |
12 ; This file should be assembled with NASM (Netwide Assembler), | 12 ; This file should be assembled with NASM (Netwide Assembler), |
13 ; can *not* be assembled with Microsoft's MASM or any compatible | 13 ; can *not* be assembled with Microsoft's MASM or any compatible |
14 ; assembler (including Borland's Turbo Assembler). | 14 ; assembler (including Borland's Turbo Assembler). |
15 ; NASM is available from http://nasm.sourceforge.net/ for | 15 ; NASM is available from http://nasm.sourceforge.net/ or |
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | 16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
17 ; | 17 ; |
18 ; [TAB8] | 18 ; [TAB8] |
19 | 19 |
20 %include "jcolsamp.inc" | 20 %include "jcolsamp.inc" |
21 | 21 |
22 ; -------------------------------------------------------------------------- | 22 ; -------------------------------------------------------------------------- |
23 ; | 23 ; |
24 ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. | 24 ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. |
25 ; | 25 ; |
(...skipping 219 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
245 | 245 |
246 cmp rcx, byte SIZEOF_XMMWORD | 246 cmp rcx, byte SIZEOF_XMMWORD |
247 jb short .column_st32 | 247 jb short .column_st32 |
248 | 248 |
249 test rdi, SIZEOF_XMMWORD-1 | 249 test rdi, SIZEOF_XMMWORD-1 |
250 jnz short .out1 | 250 jnz short .out1 |
251 ; --(aligned)------------------- | 251 ; --(aligned)------------------- |
252 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA | 252 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
253 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD | 253 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
254 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF | 254 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF |
255 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr | |
256 jmp short .out0 | 255 jmp short .out0 |
257 .out1: ; --(unaligned)----------------- | 256 .out1: ; --(unaligned)----------------- |
258 » pcmpeqb xmmH,xmmH» » » ; xmmH=(all 1's) | 257 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
259 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA | 258 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
260 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | 259 » movdqu» XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF |
261 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [rdi], xmmD | |
262 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
263 » maskmovdqu xmmF,xmmH» » » ; movntdqu XMMWORD [rdi], xmmF | |
264 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
265 .out0: | 260 .out0: |
| 261 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr |
266 sub rcx, byte SIZEOF_XMMWORD | 262 sub rcx, byte SIZEOF_XMMWORD |
267 jz near .endcolumn | 263 jz near .endcolumn |
268 | 264 |
269 add rsi, byte SIZEOF_XMMWORD ; inptr0 | 265 add rsi, byte SIZEOF_XMMWORD ; inptr0 |
270 dec al ; Yctr | 266 dec al ; Yctr |
271 jnz near .Yloop_2nd | 267 jnz near .Yloop_2nd |
272 | 268 |
273 add rbx, byte SIZEOF_XMMWORD ; inptr1 | 269 add rbx, byte SIZEOF_XMMWORD ; inptr1 |
274 add rdx, byte SIZEOF_XMMWORD ; inptr2 | 270 add rdx, byte SIZEOF_XMMWORD ; inptr2 |
275 jmp near .columnloop | 271 jmp near .columnloop |
276 | 272 |
277 .column_st32: | 273 .column_st32: |
278 pcmpeqb xmmH,xmmH ; xmmH=(all 1's) | |
279 lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE | 274 lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE |
280 cmp rcx, byte 2*SIZEOF_XMMWORD | 275 cmp rcx, byte 2*SIZEOF_XMMWORD |
281 jb short .column_st16 | 276 jb short .column_st16 |
282 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA | 277 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
283 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | 278 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
284 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [rdi], xmmD | 279 » add» rdi, byte 2*SIZEOF_XMMWORD» ; outptr |
285 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
286 movdqa xmmA,xmmF | 280 movdqa xmmA,xmmF |
287 sub rcx, byte 2*SIZEOF_XMMWORD | 281 sub rcx, byte 2*SIZEOF_XMMWORD |
288 jmp short .column_st15 | 282 jmp short .column_st15 |
289 .column_st16: | 283 .column_st16: |
290 cmp rcx, byte SIZEOF_XMMWORD | 284 cmp rcx, byte SIZEOF_XMMWORD |
291 jb short .column_st15 | 285 jb short .column_st15 |
292 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA | 286 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
293 add rdi, byte SIZEOF_XMMWORD ; outptr | 287 add rdi, byte SIZEOF_XMMWORD ; outptr |
294 movdqa xmmA,xmmD | 288 movdqa xmmA,xmmD |
295 sub rcx, byte SIZEOF_XMMWORD | 289 sub rcx, byte SIZEOF_XMMWORD |
296 .column_st15: | 290 .column_st15: |
297 %ifdef STRICT_MEMORY_ACCESS | |
298 ; Store the lower 8 bytes of xmmA to the output when it has enough | 291 ; Store the lower 8 bytes of xmmA to the output when it has enough |
299 ; space. | 292 ; space. |
300 cmp rcx, byte SIZEOF_MMWORD | 293 cmp rcx, byte SIZEOF_MMWORD |
301 jb short .column_st7 | 294 jb short .column_st7 |
302 movq MMWORD [rdi], xmmA | 295 movq MMWORD [rdi], xmmA |
303 add rdi, byte SIZEOF_MMWORD | 296 add rdi, byte SIZEOF_MMWORD |
304 sub rcx, byte SIZEOF_MMWORD | 297 sub rcx, byte SIZEOF_MMWORD |
305 psrldq xmmA, SIZEOF_MMWORD | 298 psrldq xmmA, SIZEOF_MMWORD |
306 .column_st7: | 299 .column_st7: |
307 ; Store the lower 4 bytes of xmmA to the output when it has enough | 300 ; Store the lower 4 bytes of xmmA to the output when it has enough |
(...skipping 13 matching lines...) Expand all Loading... |
321 mov WORD [rdi], ax | 314 mov WORD [rdi], ax |
322 add rdi, byte SIZEOF_WORD | 315 add rdi, byte SIZEOF_WORD |
323 sub rcx, byte SIZEOF_WORD | 316 sub rcx, byte SIZEOF_WORD |
324 shr rax, 16 | 317 shr rax, 16 |
325 .column_st1: | 318 .column_st1: |
326 ; Store the lower 1 byte of rax to the output when it has enough | 319 ; Store the lower 1 byte of rax to the output when it has enough |
327 ; space. | 320 ; space. |
328 test rcx, rcx | 321 test rcx, rcx |
329 jz short .endcolumn | 322 jz short .endcolumn |
330 mov BYTE [rdi], al | 323 mov BYTE [rdi], al |
331 %else | |
332 mov rax,rcx | |
333 xor rcx, byte 0x0F | |
334 shl rcx, 2 | |
335 movd xmmB,ecx | |
336 psrlq xmmH,4 | |
337 pcmpeqb xmmE,xmmE | |
338 psrlq xmmH,xmmB | |
339 psrlq xmmE,xmmB | |
340 punpcklbw xmmE,xmmH | |
341 ; ---------------- | |
342 mov rcx,rdi | |
343 and rcx, byte SIZEOF_XMMWORD-1 | |
344 jz short .adj0 | |
345 add rax,rcx | |
346 cmp rax, byte SIZEOF_XMMWORD | |
347 ja short .adj0 | |
348 and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary | |
349 shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx | |
350 movdqa xmmG,xmmA | |
351 movdqa xmmC,xmmE | |
352 pslldq xmmA, SIZEOF_XMMWORD/2 | |
353 pslldq xmmE, SIZEOF_XMMWORD/2 | |
354 movd xmmD,ecx | |
355 sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT | |
356 jb short .adj1 | |
357 movd xmmF,ecx | |
358 psllq xmmA,xmmF | |
359 psllq xmmE,xmmF | |
360 jmp short .adj0 | |
361 .adj1: neg rcx | |
362 movd xmmF,ecx | |
363 psrlq xmmA,xmmF | |
364 psrlq xmmE,xmmF | |
365 psllq xmmG,xmmD | |
366 psllq xmmC,xmmD | |
367 por xmmA,xmmG | |
368 por xmmE,xmmC | |
369 .adj0: ; ---------------- | |
370 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA | |
371 %endif ; STRICT_MEMORY_ACCESS ; --------------- | |
372 | 324 |
373 %else ; RGB_PIXELSIZE == 4 ; ----------- | 325 %else ; RGB_PIXELSIZE == 4 ; ----------- |
374 | 326 |
375 %ifdef RGBX_FILLER_0XFF | 327 %ifdef RGBX_FILLER_0XFF |
376 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) | 328 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) |
377 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) | 329 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) |
378 %else | 330 %else |
379 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) | 331 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) |
380 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) | 332 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) |
381 %endif | 333 %endif |
(...skipping 24 matching lines...) Expand all Loading... |
406 cmp rcx, byte SIZEOF_XMMWORD | 358 cmp rcx, byte SIZEOF_XMMWORD |
407 jb short .column_st32 | 359 jb short .column_st32 |
408 | 360 |
409 test rdi, SIZEOF_XMMWORD-1 | 361 test rdi, SIZEOF_XMMWORD-1 |
410 jnz short .out1 | 362 jnz short .out1 |
411 ; --(aligned)------------------- | 363 ; --(aligned)------------------- |
412 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA | 364 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
413 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD | 365 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
414 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC | 366 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC |
415 movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH | 367 movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH |
416 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr | |
417 jmp short .out0 | 368 jmp short .out0 |
418 .out1: ; --(unaligned)----------------- | 369 .out1: ; --(unaligned)----------------- |
419 » pcmpeqb xmmE,xmmE» » » ; xmmE=(all 1's) | 370 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
420 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [rdi], xmmA | 371 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
421 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | 372 » movdqu» XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC |
422 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [rdi], xmmD | 373 » movdqu» XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH |
423 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
424 » maskmovdqu xmmC,xmmE» » » ; movntdqu XMMWORD [rdi], xmmC | |
425 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
426 » maskmovdqu xmmH,xmmE» » » ; movntdqu XMMWORD [rdi], xmmH | |
427 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
428 .out0: | 374 .out0: |
| 375 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr |
429 sub rcx, byte SIZEOF_XMMWORD | 376 sub rcx, byte SIZEOF_XMMWORD |
430 jz near .endcolumn | 377 jz near .endcolumn |
431 | 378 |
432 add rsi, byte SIZEOF_XMMWORD ; inptr0 | 379 add rsi, byte SIZEOF_XMMWORD ; inptr0 |
433 dec al ; Yctr | 380 dec al ; Yctr |
434 jnz near .Yloop_2nd | 381 jnz near .Yloop_2nd |
435 | 382 |
436 add rbx, byte SIZEOF_XMMWORD ; inptr1 | 383 add rbx, byte SIZEOF_XMMWORD ; inptr1 |
437 add rdx, byte SIZEOF_XMMWORD ; inptr2 | 384 add rdx, byte SIZEOF_XMMWORD ; inptr2 |
438 jmp near .columnloop | 385 jmp near .columnloop |
439 | 386 |
440 .column_st32: | 387 .column_st32: |
441 pcmpeqb xmmE,xmmE ; xmmE=(all 1's) | |
442 cmp rcx, byte SIZEOF_XMMWORD/2 | 388 cmp rcx, byte SIZEOF_XMMWORD/2 |
443 jb short .column_st16 | 389 jb short .column_st16 |
444 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [rdi], xmmA | 390 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
445 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | 391 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
446 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [rdi], xmmD | 392 » add» rdi, byte 2*SIZEOF_XMMWORD» ; outptr |
447 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
448 movdqa xmmA,xmmC | 393 movdqa xmmA,xmmC |
449 movdqa xmmD,xmmH | 394 movdqa xmmD,xmmH |
450 sub rcx, byte SIZEOF_XMMWORD/2 | 395 sub rcx, byte SIZEOF_XMMWORD/2 |
451 .column_st16: | 396 .column_st16: |
452 cmp rcx, byte SIZEOF_XMMWORD/4 | 397 cmp rcx, byte SIZEOF_XMMWORD/4 |
453 jb short .column_st15 | 398 jb short .column_st15 |
454 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA | 399 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
455 add rdi, byte SIZEOF_XMMWORD ; outptr | 400 add rdi, byte SIZEOF_XMMWORD ; outptr |
456 movdqa xmmA,xmmD | 401 movdqa xmmA,xmmD |
457 sub rcx, byte SIZEOF_XMMWORD/4 | 402 sub rcx, byte SIZEOF_XMMWORD/4 |
458 .column_st15: | 403 .column_st15: |
459 %ifdef STRICT_MEMORY_ACCESS | |
460 ; Store two pixels (8 bytes) of xmmA to the output when it has enough | 404 ; Store two pixels (8 bytes) of xmmA to the output when it has enough |
461 ; space. | 405 ; space. |
462 cmp rcx, byte SIZEOF_XMMWORD/8 | 406 cmp rcx, byte SIZEOF_XMMWORD/8 |
463 jb short .column_st7 | 407 jb short .column_st7 |
464 movq MMWORD [rdi], xmmA | 408 movq MMWORD [rdi], xmmA |
465 add rdi, byte SIZEOF_XMMWORD/8*4 | 409 add rdi, byte SIZEOF_XMMWORD/8*4 |
466 sub rcx, byte SIZEOF_XMMWORD/8 | 410 sub rcx, byte SIZEOF_XMMWORD/8 |
467 psrldq xmmA, SIZEOF_XMMWORD/8*4 | 411 psrldq xmmA, SIZEOF_XMMWORD/8*4 |
468 .column_st7: | 412 .column_st7: |
469 ; Store one pixel (4 bytes) of xmmA to the output when it has enough | 413 ; Store one pixel (4 bytes) of xmmA to the output when it has enough |
470 ; space. | 414 ; space. |
471 test rcx, rcx | 415 test rcx, rcx |
472 jz short .endcolumn | 416 jz short .endcolumn |
473 movd DWORD [rdi], xmmA | 417 movd DWORD [rdi], xmmA |
474 %else | |
475 cmp rcx, byte SIZEOF_XMMWORD/16 | |
476 jb near .endcolumn | |
477 mov rax,rcx | |
478 xor rcx, byte 0x03 | |
479 inc rcx | |
480 shl rcx, 4 | |
481 movd xmmF,ecx | |
482 psrlq xmmE,xmmF | |
483 punpcklbw xmmE,xmmE | |
484 ; ---------------- | |
485 mov rcx,rdi | |
486 and rcx, byte SIZEOF_XMMWORD-1 | |
487 jz short .adj0 | |
488 lea rax, [rcx+rax*4] ; RGB_PIXELSIZE | |
489 cmp rax, byte SIZEOF_XMMWORD | |
490 ja short .adj0 | |
491 and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary | |
492 shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx | |
493 movdqa xmmB,xmmA | |
494 movdqa xmmG,xmmE | |
495 pslldq xmmA, SIZEOF_XMMWORD/2 | |
496 pslldq xmmE, SIZEOF_XMMWORD/2 | |
497 movd xmmC,ecx | |
498 sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT | |
499 jb short .adj1 | |
500 movd xmmH,ecx | |
501 psllq xmmA,xmmH | |
502 psllq xmmE,xmmH | |
503 jmp short .adj0 | |
504 .adj1: neg rcx | |
505 movd xmmH,ecx | |
506 psrlq xmmA,xmmH | |
507 psrlq xmmE,xmmH | |
508 psllq xmmB,xmmC | |
509 psllq xmmG,xmmC | |
510 por xmmA,xmmB | |
511 por xmmE,xmmG | |
512 .adj0: ; ---------------- | |
513 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA | |
514 %endif ; STRICT_MEMORY_ACCESS ; --------------- | |
515 | 418 |
516 %endif ; RGB_PIXELSIZE ; --------------- | 419 %endif ; RGB_PIXELSIZE ; --------------- |
517 | 420 |
518 .endcolumn: | 421 .endcolumn: |
519 sfence ; flush the write buffer | 422 sfence ; flush the write buffer |
520 | 423 |
521 .return: | 424 .return: |
522 pop rbx | 425 pop rbx |
523 uncollect_args | 426 uncollect_args |
524 mov rsp,rbp ; rsp <- aligned rbp | 427 mov rsp,rbp ; rsp <- aligned rbp |
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
626 pop rdx | 529 pop rdx |
627 | 530 |
628 pop rbx | 531 pop rbx |
629 uncollect_args | 532 uncollect_args |
630 pop rbp | 533 pop rbp |
631 ret | 534 ret |
632 | 535 |
633 ; For some reason, the OS X linker does not honor the request to align the | 536 ; For some reason, the OS X linker does not honor the request to align the |
634 ; segment unless we do this. | 537 ; segment unless we do this. |
635 align 16 | 538 align 16 |
OLD | NEW |