OLD | NEW |
1 ; | 1 ; |
2 ; jdclrss2.asm - colorspace conversion (SSE2) | 2 ; jdclrss2.asm - colorspace conversion (SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; | 5 ; |
6 ; Based on | 6 ; Based on |
7 ; x86 SIMD extension for IJG JPEG library | 7 ; x86 SIMD extension for IJG JPEG library |
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
10 ; | 10 ; |
11 ; This file should be assembled with NASM (Netwide Assembler); it | 11 ; This file should be assembled with NASM (Netwide Assembler); it |
12 ; can *not* be assembled with Microsoft's MASM or any compatible | 12 ; can *not* be assembled with Microsoft's MASM or any compatible |
13 ; assembler (including Borland's Turbo Assembler). | 13 ; assembler (including Borland's Turbo Assembler). |
14 ; NASM is available from http://nasm.sourceforge.net/ or | 14 ; NASM is available from http://nasm.sourceforge.net/ or |
(...skipping 240 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
255 | 255 |
256 cmp ecx, byte SIZEOF_XMMWORD | 256 cmp ecx, byte SIZEOF_XMMWORD |
257 jb short .column_st32 | 257 jb short .column_st32 |
258 | 258 |
259 test edi, SIZEOF_XMMWORD-1 | 259 test edi, SIZEOF_XMMWORD-1 |
260 jnz short .out1 | 260 jnz short .out1 |
261 ; --(aligned)------------------- | 261 ; --(aligned)------------------- |
262 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA | 262 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
263 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD | 263 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
264 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF | 264 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF |
265 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr | |
266 jmp short .out0 | 265 jmp short .out0 |
267 .out1: ; --(unaligned)----------------- | 266 .out1: ; --(unaligned)----------------- |
268 » pcmpeqb xmmH,xmmH» » » ; xmmH=(all 1's) | 267 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
269 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [edi], xmmA | 268 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
270 » add» edi, byte SIZEOF_XMMWORD» ; outptr | 269 » movdqu» XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF |
271 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [edi], xmmD | |
272 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
273 » maskmovdqu xmmF,xmmH» » » ; movntdqu XMMWORD [edi], xmmF | |
274 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
275 .out0: | 270 .out0: |
| 271 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr |
276 sub ecx, byte SIZEOF_XMMWORD | 272 sub ecx, byte SIZEOF_XMMWORD |
277 jz near .nextrow | 273 jz near .nextrow |
278 | 274 |
279 add esi, byte SIZEOF_XMMWORD ; inptr0 | 275 add esi, byte SIZEOF_XMMWORD ; inptr0 |
280 add ebx, byte SIZEOF_XMMWORD ; inptr1 | 276 add ebx, byte SIZEOF_XMMWORD ; inptr1 |
281 add edx, byte SIZEOF_XMMWORD ; inptr2 | 277 add edx, byte SIZEOF_XMMWORD ; inptr2 |
282 jmp near .columnloop | 278 jmp near .columnloop |
283 alignx 16,7 | 279 alignx 16,7 |
284 | 280 |
285 .column_st32: | 281 .column_st32: |
286 pcmpeqb xmmH,xmmH ; xmmH=(all 1's) | |
287 lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE | 282 lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE |
288 cmp ecx, byte 2*SIZEOF_XMMWORD | 283 cmp ecx, byte 2*SIZEOF_XMMWORD |
289 jb short .column_st16 | 284 jb short .column_st16 |
290 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [edi], xmmA | 285 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
291 » add» edi, byte SIZEOF_XMMWORD» ; outptr | 286 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
292 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [edi], xmmD | 287 » add» edi, byte 2*SIZEOF_XMMWORD» ; outptr |
293 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
294 movdqa xmmA,xmmF | 288 movdqa xmmA,xmmF |
295 sub ecx, byte 2*SIZEOF_XMMWORD | 289 sub ecx, byte 2*SIZEOF_XMMWORD |
296 jmp short .column_st15 | 290 jmp short .column_st15 |
297 .column_st16: | 291 .column_st16: |
298 cmp ecx, byte SIZEOF_XMMWORD | 292 cmp ecx, byte SIZEOF_XMMWORD |
299 jb short .column_st15 | 293 jb short .column_st15 |
300 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [edi], xmmA | 294 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
301 add edi, byte SIZEOF_XMMWORD ; outptr | 295 add edi, byte SIZEOF_XMMWORD ; outptr |
302 movdqa xmmA,xmmD | 296 movdqa xmmA,xmmD |
303 sub ecx, byte SIZEOF_XMMWORD | 297 sub ecx, byte SIZEOF_XMMWORD |
304 .column_st15: | 298 .column_st15: |
305 %ifdef STRICT_MEMORY_ACCESS | |
306 ; Store the lower 8 bytes of xmmA to the output when it has enough | 299 ; Store the lower 8 bytes of xmmA to the output when it has enough |
307 ; space. | 300 ; space. |
308 cmp ecx, byte SIZEOF_MMWORD | 301 cmp ecx, byte SIZEOF_MMWORD |
309 jb short .column_st7 | 302 jb short .column_st7 |
310 movq MMWORD [edi], xmmA | 303 movq MMWORD [edi], xmmA |
311 add edi, byte SIZEOF_MMWORD | 304 add edi, byte SIZEOF_MMWORD |
312 sub ecx, byte SIZEOF_MMWORD | 305 sub ecx, byte SIZEOF_MMWORD |
313 psrldq xmmA, SIZEOF_MMWORD | 306 psrldq xmmA, SIZEOF_MMWORD |
314 .column_st7: | 307 .column_st7: |
315 ; Store the lower 4 bytes of xmmA to the output when it has enough | 308 ; Store the lower 4 bytes of xmmA to the output when it has enough |
(...skipping 13 matching lines...) Expand all Loading... |
329 mov WORD [edi], ax | 322 mov WORD [edi], ax |
330 add edi, byte SIZEOF_WORD | 323 add edi, byte SIZEOF_WORD |
331 sub ecx, byte SIZEOF_WORD | 324 sub ecx, byte SIZEOF_WORD |
332 shr eax, 16 | 325 shr eax, 16 |
333 .column_st1: | 326 .column_st1: |
334 ; Store the lower 1 byte of eax to the output when it has enough | 327 ; Store the lower 1 byte of eax to the output when it has enough |
335 ; space. | 328 ; space. |
336 test ecx, ecx | 329 test ecx, ecx |
337 jz short .nextrow | 330 jz short .nextrow |
338 mov BYTE [edi], al | 331 mov BYTE [edi], al |
339 %else | |
340 mov eax,ecx | |
341 xor ecx, byte 0x0F | |
342 shl ecx, 2 | |
343 movd xmmB,ecx | |
344 psrlq xmmH,4 | |
345 pcmpeqb xmmE,xmmE | |
346 psrlq xmmH,xmmB | |
347 psrlq xmmE,xmmB | |
348 punpcklbw xmmE,xmmH | |
349 ; ---------------- | |
350 mov ecx,edi | |
351 and ecx, byte SIZEOF_XMMWORD-1 | |
352 jz short .adj0 | |
353 add eax,ecx | |
354 cmp eax, byte SIZEOF_XMMWORD | |
355 ja short .adj0 | |
356 and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary | |
357 shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx | |
358 movdqa xmmG,xmmA | |
359 movdqa xmmC,xmmE | |
360 pslldq xmmA, SIZEOF_XMMWORD/2 | |
361 pslldq xmmE, SIZEOF_XMMWORD/2 | |
362 movd xmmD,ecx | |
363 sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT | |
364 jb short .adj1 | |
365 movd xmmF,ecx | |
366 psllq xmmA,xmmF | |
367 psllq xmmE,xmmF | |
368 jmp short .adj0 | |
369 .adj1: neg ecx | |
370 movd xmmF,ecx | |
371 psrlq xmmA,xmmF | |
372 psrlq xmmE,xmmF | |
373 psllq xmmG,xmmD | |
374 psllq xmmC,xmmD | |
375 por xmmA,xmmG | |
376 por xmmE,xmmC | |
377 .adj0: ; ---------------- | |
378 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA | |
379 %endif ; STRICT_MEMORY_ACCESS ; --------------- | |
380 | 332 |
381 %else ; RGB_PIXELSIZE == 4 ; ----------- | 333 %else ; RGB_PIXELSIZE == 4 ; ----------- |
382 | 334 |
383 %ifdef RGBX_FILLER_0XFF | 335 %ifdef RGBX_FILLER_0XFF |
384 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) | 336 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) |
385 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) | 337 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) |
386 %else | 338 %else |
387 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) | 339 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) |
388 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) | 340 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) |
389 %endif | 341 %endif |
(...skipping 24 matching lines...) Expand all Loading... |
414 cmp ecx, byte SIZEOF_XMMWORD | 366 cmp ecx, byte SIZEOF_XMMWORD |
415 jb short .column_st32 | 367 jb short .column_st32 |
416 | 368 |
417 test edi, SIZEOF_XMMWORD-1 | 369 test edi, SIZEOF_XMMWORD-1 |
418 jnz short .out1 | 370 jnz short .out1 |
419 ; --(aligned)------------------- | 371 ; --(aligned)------------------- |
420 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA | 372 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
421 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD | 373 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
422 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC | 374 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC |
423 movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH | 375 movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH |
424 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr | |
425 jmp short .out0 | 376 jmp short .out0 |
426 .out1: ; --(unaligned)----------------- | 377 .out1: ; --(unaligned)----------------- |
427 » pcmpeqb xmmE,xmmE» » » ; xmmE=(all 1's) | 378 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
428 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA | 379 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
429 » add» edi, byte SIZEOF_XMMWORD» ; outptr | 380 » movdqu» XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC |
430 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [edi], xmmD | 381 » movdqu» XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH |
431 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
432 » maskmovdqu xmmC,xmmE» » » ; movntdqu XMMWORD [edi], xmmC | |
433 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
434 » maskmovdqu xmmH,xmmE» » » ; movntdqu XMMWORD [edi], xmmH | |
435 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
436 .out0: | 382 .out0: |
| 383 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr |
437 sub ecx, byte SIZEOF_XMMWORD | 384 sub ecx, byte SIZEOF_XMMWORD |
438 jz near .nextrow | 385 jz near .nextrow |
439 | 386 |
440 add esi, byte SIZEOF_XMMWORD ; inptr0 | 387 add esi, byte SIZEOF_XMMWORD ; inptr0 |
441 add ebx, byte SIZEOF_XMMWORD ; inptr1 | 388 add ebx, byte SIZEOF_XMMWORD ; inptr1 |
442 add edx, byte SIZEOF_XMMWORD ; inptr2 | 389 add edx, byte SIZEOF_XMMWORD ; inptr2 |
443 jmp near .columnloop | 390 jmp near .columnloop |
444 alignx 16,7 | 391 alignx 16,7 |
445 | 392 |
446 .column_st32: | 393 .column_st32: |
447 pcmpeqb xmmE,xmmE ; xmmE=(all 1's) | |
448 cmp ecx, byte SIZEOF_XMMWORD/2 | 394 cmp ecx, byte SIZEOF_XMMWORD/2 |
449 jb short .column_st16 | 395 jb short .column_st16 |
450 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA | 396 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
451 » add» edi, byte SIZEOF_XMMWORD» ; outptr | 397 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD |
452 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [edi], xmmD | 398 » add» edi, byte 2*SIZEOF_XMMWORD» ; outptr |
453 » add» edi, byte SIZEOF_XMMWORD» ; outptr | |
454 movdqa xmmA,xmmC | 399 movdqa xmmA,xmmC |
455 movdqa xmmD,xmmH | 400 movdqa xmmD,xmmH |
456 sub ecx, byte SIZEOF_XMMWORD/2 | 401 sub ecx, byte SIZEOF_XMMWORD/2 |
457 .column_st16: | 402 .column_st16: |
458 cmp ecx, byte SIZEOF_XMMWORD/4 | 403 cmp ecx, byte SIZEOF_XMMWORD/4 |
459 jb short .column_st15 | 404 jb short .column_st15 |
460 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA | 405 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA |
461 add edi, byte SIZEOF_XMMWORD ; outptr | 406 add edi, byte SIZEOF_XMMWORD ; outptr |
462 movdqa xmmA,xmmD | 407 movdqa xmmA,xmmD |
463 sub ecx, byte SIZEOF_XMMWORD/4 | 408 sub ecx, byte SIZEOF_XMMWORD/4 |
464 .column_st15: | 409 .column_st15: |
465 %ifdef STRICT_MEMORY_ACCESS | |
466 ; Store two pixels (8 bytes) of xmmA to the output when it has enough | 410 ; Store two pixels (8 bytes) of xmmA to the output when it has enough |
467 ; space. | 411 ; space. |
468 cmp ecx, byte SIZEOF_XMMWORD/8 | 412 cmp ecx, byte SIZEOF_XMMWORD/8 |
469 jb short .column_st7 | 413 jb short .column_st7 |
470 movq MMWORD [edi], xmmA | 414 movq MMWORD [edi], xmmA |
471 add edi, byte SIZEOF_XMMWORD/8*4 | 415 add edi, byte SIZEOF_XMMWORD/8*4 |
472 sub ecx, byte SIZEOF_XMMWORD/8 | 416 sub ecx, byte SIZEOF_XMMWORD/8 |
473 psrldq xmmA, SIZEOF_XMMWORD/8*4 | 417 psrldq xmmA, SIZEOF_XMMWORD/8*4 |
474 .column_st7: | 418 .column_st7: |
475 ; Store one pixel (4 bytes) of xmmA to the output when it has enough | 419 ; Store one pixel (4 bytes) of xmmA to the output when it has enough |
476 ; space. | 420 ; space. |
477 test ecx, ecx | 421 test ecx, ecx |
478 jz short .nextrow | 422 jz short .nextrow |
479 movd DWORD [edi], xmmA | 423 movd DWORD [edi], xmmA |
480 %else | |
481 cmp ecx, byte SIZEOF_XMMWORD/16 | |
482 jb short .nextrow | |
483 mov eax,ecx | |
484 xor ecx, byte 0x03 | |
485 inc ecx | |
486 shl ecx, 4 | |
487 movd xmmF,ecx | |
488 psrlq xmmE,xmmF | |
489 punpcklbw xmmE,xmmE | |
490 ; ---------------- | |
491 mov ecx,edi | |
492 and ecx, byte SIZEOF_XMMWORD-1 | |
493 jz short .adj0 | |
494 lea eax, [ecx+eax*4] ; RGB_PIXELSIZE | |
495 cmp eax, byte SIZEOF_XMMWORD | |
496 ja short .adj0 | |
497 and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary | |
498 shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx | |
499 movdqa xmmB,xmmA | |
500 movdqa xmmG,xmmE | |
501 pslldq xmmA, SIZEOF_XMMWORD/2 | |
502 pslldq xmmE, SIZEOF_XMMWORD/2 | |
503 movd xmmC,ecx | |
504 sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT | |
505 jb short .adj1 | |
506 movd xmmH,ecx | |
507 psllq xmmA,xmmH | |
508 psllq xmmE,xmmH | |
509 jmp short .adj0 | |
510 .adj1: neg ecx | |
511 movd xmmH,ecx | |
512 psrlq xmmA,xmmH | |
513 psrlq xmmE,xmmH | |
514 psllq xmmB,xmmC | |
515 psllq xmmG,xmmC | |
516 por xmmA,xmmB | |
517 por xmmE,xmmG | |
518 .adj0: ; ---------------- | |
519 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA | |
520 %endif ; STRICT_MEMORY_ACCESS ; --------------- | |
521 | 424 |
522 %endif ; RGB_PIXELSIZE ; --------------- | 425 %endif ; RGB_PIXELSIZE ; --------------- |
523 | 426 |
524 alignx 16,7 | 427 alignx 16,7 |
525 | 428 |
526 .nextrow: | 429 .nextrow: |
527 pop ecx | 430 pop ecx |
528 pop esi | 431 pop esi |
529 pop ebx | 432 pop ebx |
530 pop edx | 433 pop edx |
(...skipping 16 matching lines...) Expand all Loading... |
547 ; pop ecx ; need not be preserved | 450 ; pop ecx ; need not be preserved |
548 pop ebx | 451 pop ebx |
549 mov esp,ebp ; esp <- aligned ebp | 452 mov esp,ebp ; esp <- aligned ebp |
550 pop esp ; esp <- original ebp | 453 pop esp ; esp <- original ebp |
551 pop ebp | 454 pop ebp |
552 ret | 455 ret |
553 | 456 |
554 ; For some reason, the OS X linker does not honor the request to align the | 457 ; For some reason, the OS X linker does not honor the request to align the |
555 ; segment unless we do this. | 458 ; segment unless we do this. |
556 align 16 | 459 align 16 |
OLD | NEW |