OLD | NEW |
1 ; | 1 ; |
2 ; jdclrss2-64.asm - colorspace conversion (64-bit SSE2) | 2 ; jdclrss2-64.asm - colorspace conversion (64-bit SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
6 ; | 6 ; |
7 ; Based on | 7 ; Based on |
8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
11 ; | 11 ; |
12 ; This file should be assembled with NASM (Netwide Assembler), | 12 ; This file should be assembled with NASM (Netwide Assembler), |
13 ; can *not* be assembled with Microsoft's MASM or any compatible | 13 ; can *not* be assembled with Microsoft's MASM or any compatible |
14 ; assembler (including Borland's Turbo Assembler). | 14 ; assembler (including Borland's Turbo Assembler). |
(...skipping 229 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
244 | 244 |
245 cmp rcx, byte SIZEOF_XMMWORD | 245 cmp rcx, byte SIZEOF_XMMWORD |
246 jb short .column_st32 | 246 jb short .column_st32 |
247 | 247 |
248 test rdi, SIZEOF_XMMWORD-1 | 248 test rdi, SIZEOF_XMMWORD-1 |
249 jnz short .out1 | 249 jnz short .out1 |
250 ; --(aligned)------------------- | 250 ; --(aligned)------------------- |
251 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA | 251 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
252 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD | 252 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
253 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF | 253 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF |
254 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr | |
255 jmp short .out0 | 254 jmp short .out0 |
256 .out1: ; --(unaligned)----------------- | 255 .out1: ; --(unaligned)----------------- |
257 » pcmpeqb xmmH,xmmH» » » ; xmmH=(all 1's) | 256 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
258 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA | 257 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
259 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | 258 » movdqu» XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF |
260 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [rdi], xmmD | |
261 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
262 » maskmovdqu xmmF,xmmH» » » ; movntdqu XMMWORD [rdi], xmmF | |
263 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
264 .out0: | 259 .out0: |
| 260 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr |
265 sub rcx, byte SIZEOF_XMMWORD | 261 sub rcx, byte SIZEOF_XMMWORD |
266 jz near .nextrow | 262 jz near .nextrow |
267 | 263 |
268 add rsi, byte SIZEOF_XMMWORD ; inptr0 | 264 add rsi, byte SIZEOF_XMMWORD ; inptr0 |
269 add rbx, byte SIZEOF_XMMWORD ; inptr1 | 265 add rbx, byte SIZEOF_XMMWORD ; inptr1 |
270 add rdx, byte SIZEOF_XMMWORD ; inptr2 | 266 add rdx, byte SIZEOF_XMMWORD ; inptr2 |
271 jmp near .columnloop | 267 jmp near .columnloop |
272 | 268 |
273 .column_st32: | 269 .column_st32: |
274 pcmpeqb xmmH,xmmH ; xmmH=(all 1's) | |
275 lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE | 270 lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE |
276 cmp rcx, byte 2*SIZEOF_XMMWORD | 271 cmp rcx, byte 2*SIZEOF_XMMWORD |
277 jb short .column_st16 | 272 jb short .column_st16 |
278 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA | 273 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
279 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | 274 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
280 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [rdi], xmmD | 275 » add» rdi, byte 2*SIZEOF_XMMWORD» ; outptr |
281 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
282 movdqa xmmA,xmmF | 276 movdqa xmmA,xmmF |
283 sub rcx, byte 2*SIZEOF_XMMWORD | 277 sub rcx, byte 2*SIZEOF_XMMWORD |
284 jmp short .column_st15 | 278 jmp short .column_st15 |
285 .column_st16: | 279 .column_st16: |
286 cmp rcx, byte SIZEOF_XMMWORD | 280 cmp rcx, byte SIZEOF_XMMWORD |
287 jb short .column_st15 | 281 jb short .column_st15 |
288 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA | 282 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
289 add rdi, byte SIZEOF_XMMWORD ; outptr | 283 add rdi, byte SIZEOF_XMMWORD ; outptr |
290 movdqa xmmA,xmmD | 284 movdqa xmmA,xmmD |
291 sub rcx, byte SIZEOF_XMMWORD | 285 sub rcx, byte SIZEOF_XMMWORD |
292 .column_st15: | 286 .column_st15: |
293 %ifdef STRICT_MEMORY_ACCESS | |
294 ; Store the lower 8 bytes of xmmA to the output when it has enough | 287 ; Store the lower 8 bytes of xmmA to the output when it has enough |
295 ; space. | 288 ; space. |
296 cmp rcx, byte SIZEOF_MMWORD | 289 cmp rcx, byte SIZEOF_MMWORD |
297 jb short .column_st7 | 290 jb short .column_st7 |
298 movq MMWORD [rdi], xmmA | 291 movq MMWORD [rdi], xmmA |
299 add rdi, byte SIZEOF_MMWORD | 292 add rdi, byte SIZEOF_MMWORD |
300 sub rcx, byte SIZEOF_MMWORD | 293 sub rcx, byte SIZEOF_MMWORD |
301 psrldq xmmA, SIZEOF_MMWORD | 294 psrldq xmmA, SIZEOF_MMWORD |
302 .column_st7: | 295 .column_st7: |
303 ; Store the lower 4 bytes of xmmA to the output when it has enough | 296 ; Store the lower 4 bytes of xmmA to the output when it has enough |
(...skipping 13 matching lines...) Expand all Loading... |
317 mov WORD [rdi], ax | 310 mov WORD [rdi], ax |
318 add rdi, byte SIZEOF_WORD | 311 add rdi, byte SIZEOF_WORD |
319 sub rcx, byte SIZEOF_WORD | 312 sub rcx, byte SIZEOF_WORD |
320 shr rax, 16 | 313 shr rax, 16 |
321 .column_st1: | 314 .column_st1: |
322 ; Store the lower 1 byte of rax to the output when it has enough | 315 ; Store the lower 1 byte of rax to the output when it has enough |
323 ; space. | 316 ; space. |
324 test rcx, rcx | 317 test rcx, rcx |
325 jz short .nextrow | 318 jz short .nextrow |
326 mov BYTE [rdi], al | 319 mov BYTE [rdi], al |
327 %else | |
328 mov rax,rcx | |
329 xor rcx, byte 0x0F | |
330 shl rcx, 2 | |
331 movd xmmB,ecx | |
332 psrlq xmmH,4 | |
333 pcmpeqb xmmE,xmmE | |
334 psrlq xmmH,xmmB | |
335 psrlq xmmE,xmmB | |
336 punpcklbw xmmE,xmmH | |
337 ; ---------------- | |
338 mov rcx,rdi | |
339 and rcx, byte SIZEOF_XMMWORD-1 | |
340 jz short .adj0 | |
341 add rax,rcx | |
342 cmp rax, byte SIZEOF_XMMWORD | |
343 ja short .adj0 | |
344 and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary | |
345 shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,rcx | |
346 movdqa xmmG,xmmA | |
347 movdqa xmmC,xmmE | |
348 pslldq xmmA, SIZEOF_XMMWORD/2 | |
349 pslldq xmmE, SIZEOF_XMMWORD/2 | |
350 movd xmmD,ecx | |
351 sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT | |
352 jb short .adj1 | |
353 movd xmmF,ecx | |
354 psllq xmmA,xmmF | |
355 psllq xmmE,xmmF | |
356 jmp short .adj0 | |
357 .adj1: neg ecx | |
358 movd xmmF,ecx | |
359 psrlq xmmA,xmmF | |
360 psrlq xmmE,xmmF | |
361 psllq xmmG,xmmD | |
362 psllq xmmC,xmmD | |
363 por xmmA,xmmG | |
364 por xmmE,xmmC | |
365 .adj0: ; ---------------- | |
366 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA | |
367 %endif ; STRICT_MEMORY_ACCESS ; --------------- | |
368 | 320 |
369 %else ; RGB_PIXELSIZE == 4 ; ----------- | 321 %else ; RGB_PIXELSIZE == 4 ; ----------- |
370 | 322 |
371 %ifdef RGBX_FILLER_0XFF | 323 %ifdef RGBX_FILLER_0XFF |
372 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) | 324 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) |
373 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) | 325 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) |
374 %else | 326 %else |
375 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) | 327 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) |
376 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) | 328 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) |
377 %endif | 329 %endif |
(...skipping 24 matching lines...) Expand all Loading... |
402 cmp rcx, byte SIZEOF_XMMWORD | 354 cmp rcx, byte SIZEOF_XMMWORD |
403 jb short .column_st32 | 355 jb short .column_st32 |
404 | 356 |
405 test rdi, SIZEOF_XMMWORD-1 | 357 test rdi, SIZEOF_XMMWORD-1 |
406 jnz short .out1 | 358 jnz short .out1 |
407 ; --(aligned)------------------- | 359 ; --(aligned)------------------- |
408 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA | 360 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
409 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD | 361 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
410 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC | 362 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC |
411 movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH | 363 movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH |
412 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr | |
413 jmp short .out0 | 364 jmp short .out0 |
414 .out1: ; --(unaligned)----------------- | 365 .out1: ; --(unaligned)----------------- |
415 » pcmpeqb xmmE,xmmE» » » ; xmmE=(all 1's) | 366 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
416 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [rdi], xmmA | 367 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
417 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | 368 » movdqu» XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC |
418 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [rdi], xmmD | 369 » movdqu» XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH |
419 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
420 » maskmovdqu xmmC,xmmE» » » ; movntdqu XMMWORD [rdi], xmmC | |
421 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
422 » maskmovdqu xmmH,xmmE» » » ; movntdqu XMMWORD [rdi], xmmH | |
423 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
424 .out0: | 370 .out0: |
| 371 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr |
425 sub rcx, byte SIZEOF_XMMWORD | 372 sub rcx, byte SIZEOF_XMMWORD |
426 jz near .nextrow | 373 jz near .nextrow |
427 | 374 |
428 add rsi, byte SIZEOF_XMMWORD ; inptr0 | 375 add rsi, byte SIZEOF_XMMWORD ; inptr0 |
429 add rbx, byte SIZEOF_XMMWORD ; inptr1 | 376 add rbx, byte SIZEOF_XMMWORD ; inptr1 |
430 add rdx, byte SIZEOF_XMMWORD ; inptr2 | 377 add rdx, byte SIZEOF_XMMWORD ; inptr2 |
431 jmp near .columnloop | 378 jmp near .columnloop |
432 | 379 |
433 .column_st32: | 380 .column_st32: |
434 pcmpeqb xmmE,xmmE ; xmmE=(all 1's) | |
435 cmp rcx, byte SIZEOF_XMMWORD/2 | 381 cmp rcx, byte SIZEOF_XMMWORD/2 |
436 jb short .column_st16 | 382 jb short .column_st16 |
437 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [rdi], xmmA | 383 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
438 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | 384 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD |
439 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [rdi], xmmD | 385 » add» rdi, byte 2*SIZEOF_XMMWORD» ; outptr |
440 » add» rdi, byte SIZEOF_XMMWORD» ; outptr | |
441 movdqa xmmA,xmmC | 386 movdqa xmmA,xmmC |
442 movdqa xmmD,xmmH | 387 movdqa xmmD,xmmH |
443 sub rcx, byte SIZEOF_XMMWORD/2 | 388 sub rcx, byte SIZEOF_XMMWORD/2 |
444 .column_st16: | 389 .column_st16: |
445 cmp rcx, byte SIZEOF_XMMWORD/4 | 390 cmp rcx, byte SIZEOF_XMMWORD/4 |
446 jb short .column_st15 | 391 jb short .column_st15 |
447 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [rdi], xmmA | 392 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA |
448 add rdi, byte SIZEOF_XMMWORD ; outptr | 393 add rdi, byte SIZEOF_XMMWORD ; outptr |
449 movdqa xmmA,xmmD | 394 movdqa xmmA,xmmD |
450 sub rcx, byte SIZEOF_XMMWORD/4 | 395 sub rcx, byte SIZEOF_XMMWORD/4 |
451 .column_st15: | 396 .column_st15: |
452 %ifdef STRICT_MEMORY_ACCESS | |
453 ; Store two pixels (8 bytes) of xmmA to the output when it has enough | 397 ; Store two pixels (8 bytes) of xmmA to the output when it has enough |
454 ; space. | 398 ; space. |
455 cmp rcx, byte SIZEOF_XMMWORD/8 | 399 cmp rcx, byte SIZEOF_XMMWORD/8 |
456 jb short .column_st7 | 400 jb short .column_st7 |
457 movq MMWORD [rdi], xmmA | 401 movq MMWORD [rdi], xmmA |
458 add rdi, byte SIZEOF_XMMWORD/8*4 | 402 add rdi, byte SIZEOF_XMMWORD/8*4 |
459 sub rcx, byte SIZEOF_XMMWORD/8 | 403 sub rcx, byte SIZEOF_XMMWORD/8 |
460 psrldq xmmA, SIZEOF_XMMWORD/8*4 | 404 psrldq xmmA, SIZEOF_XMMWORD/8*4 |
461 .column_st7: | 405 .column_st7: |
462 ; Store one pixel (4 bytes) of xmmA to the output when it has enough | 406 ; Store one pixel (4 bytes) of xmmA to the output when it has enough |
463 ; space. | 407 ; space. |
464 test rcx, rcx | 408 test rcx, rcx |
465 jz short .nextrow | 409 jz short .nextrow |
466 movd DWORD [rdi], xmmA | 410 movd DWORD [rdi], xmmA |
467 %else | |
468 cmp rcx, byte SIZEOF_XMMWORD/16 | |
469 jb near .nextrow | |
470 mov rax,rcx | |
471 xor rcx, byte 0x03 | |
472 inc rcx | |
473 shl rcx, 4 | |
474 movd xmmF,ecx | |
475 psrlq xmmE,xmmF | |
476 punpcklbw xmmE,xmmE | |
477 ; ---------------- | |
478 mov rcx,rdi | |
479 and rcx, byte SIZEOF_XMMWORD-1 | |
480 jz short .adj0 | |
481 lea rax, [rcx+rax*4] ; RGB_PIXELSIZE | |
482 cmp rax, byte SIZEOF_XMMWORD | |
483 ja short .adj0 | |
484 and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary | |
485 shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx | |
486 movdqa xmmB,xmmA | |
487 movdqa xmmG,xmmE | |
488 pslldq xmmA, SIZEOF_XMMWORD/2 | |
489 pslldq xmmE, SIZEOF_XMMWORD/2 | |
490 movd xmmC,ecx | |
491 sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT | |
492 jb short .adj1 | |
493 movd xmmH,ecx | |
494 psllq xmmA,xmmH | |
495 psllq xmmE,xmmH | |
496 jmp short .adj0 | |
497 .adj1: neg rcx | |
498 movd xmmH,ecx | |
499 psrlq xmmA,xmmH | |
500 psrlq xmmE,xmmH | |
501 psllq xmmB,xmmC | |
502 psllq xmmG,xmmC | |
503 por xmmA,xmmB | |
504 por xmmE,xmmG | |
505 .adj0: ; ---------------- | |
506 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA | |
507 %endif ; STRICT_MEMORY_ACCESS ; --------------- | |
508 | 411 |
509 %endif ; RGB_PIXELSIZE ; --------------- | 412 %endif ; RGB_PIXELSIZE ; --------------- |
510 | 413 |
511 .nextrow: | 414 .nextrow: |
512 pop rcx | 415 pop rcx |
513 pop rsi | 416 pop rsi |
514 pop rbx | 417 pop rbx |
515 pop rdx | 418 pop rdx |
516 pop rdi | 419 pop rdi |
517 pop rax | 420 pop rax |
(...skipping 11 matching lines...) Expand all Loading... |
529 pop rbx | 432 pop rbx |
530 uncollect_args | 433 uncollect_args |
531 mov rsp,rbp ; rsp <- aligned rbp | 434 mov rsp,rbp ; rsp <- aligned rbp |
532 pop rsp ; rsp <- original rbp | 435 pop rsp ; rsp <- original rbp |
533 pop rbp | 436 pop rbp |
534 ret | 437 ret |
535 | 438 |
536 ; For some reason, the OS X linker does not honor the request to align the | 439 ; For some reason, the OS X linker does not honor the request to align the |
537 ; segment unless we do this. | 440 ; segment unless we do this. |
538 align 16 | 441 align 16 |
OLD | NEW |