Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(581)

Side by Side Diff: simd/jdmrgss2-64.asm

Issue 10700197: Update libjpeg-turbo to r856. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libjpeg_turbo/
Patch Set: Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « simd/jdmrgss2.asm ('k') | simd/jsimdcfg.inc.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 ; 1 ;
2 ; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2) 2 ; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
3 ; 3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 4 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright 2009 D. R. Commander 5 ; Copyright 2009 D. R. Commander
6 ; 6 ;
7 ; Based on 7 ; Based on
8 ; x86 SIMD extension for IJG JPEG library 8 ; x86 SIMD extension for IJG JPEG library
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11 ; 11 ;
12 ; This file should be assembled with NASM (Netwide Assembler), 12 ; This file should be assembled with NASM (Netwide Assembler),
13 ; can *not* be assembled with Microsoft's MASM or any compatible 13 ; can *not* be assembled with Microsoft's MASM or any compatible
14 ; assembler (including Borland's Turbo Assembler). 14 ; assembler (including Borland's Turbo Assembler).
15 ; NASM is available from http://nasm.sourceforge.net/ for 15 ; NASM is available from http://nasm.sourceforge.net/ or
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
17 ; 17 ;
18 ; [TAB8] 18 ; [TAB8]
19 19
20 %include "jcolsamp.inc" 20 %include "jcolsamp.inc"
21 21
22 ; -------------------------------------------------------------------------- 22 ; --------------------------------------------------------------------------
23 ; 23 ;
24 ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. 24 ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
25 ; 25 ;
(...skipping 219 matching lines...) Expand 10 before | Expand all | Expand 10 after
245 245
246 cmp rcx, byte SIZEOF_XMMWORD 246 cmp rcx, byte SIZEOF_XMMWORD
247 jb short .column_st32 247 jb short .column_st32
248 248
249 test rdi, SIZEOF_XMMWORD-1 249 test rdi, SIZEOF_XMMWORD-1
250 jnz short .out1 250 jnz short .out1
251 ; --(aligned)------------------- 251 ; --(aligned)-------------------
252 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA 252 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
253 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD 253 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
254 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF 254 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
255 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
256 jmp short .out0 255 jmp short .out0
257 .out1: ; --(unaligned)----------------- 256 .out1: ; --(unaligned)-----------------
258 » pcmpeqb xmmH,xmmH» » » ; xmmH=(all 1's) 257 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
259 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA 258 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
260 » add» rdi, byte SIZEOF_XMMWORD» ; outptr 259 » movdqu» XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
261 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [rdi], xmmD
262 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
263 » maskmovdqu xmmF,xmmH» » » ; movntdqu XMMWORD [rdi], xmmF
264 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
265 .out0: 260 .out0:
261 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
266 sub rcx, byte SIZEOF_XMMWORD 262 sub rcx, byte SIZEOF_XMMWORD
267 jz near .endcolumn 263 jz near .endcolumn
268 264
269 add rsi, byte SIZEOF_XMMWORD ; inptr0 265 add rsi, byte SIZEOF_XMMWORD ; inptr0
270 dec al ; Yctr 266 dec al ; Yctr
271 jnz near .Yloop_2nd 267 jnz near .Yloop_2nd
272 268
273 add rbx, byte SIZEOF_XMMWORD ; inptr1 269 add rbx, byte SIZEOF_XMMWORD ; inptr1
274 add rdx, byte SIZEOF_XMMWORD ; inptr2 270 add rdx, byte SIZEOF_XMMWORD ; inptr2
275 jmp near .columnloop 271 jmp near .columnloop
276 272
277 .column_st32: 273 .column_st32:
278 pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
279 lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE 274 lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
280 cmp rcx, byte 2*SIZEOF_XMMWORD 275 cmp rcx, byte 2*SIZEOF_XMMWORD
281 jb short .column_st16 276 jb short .column_st16
282 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA 277 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
283 » add» rdi, byte SIZEOF_XMMWORD» ; outptr 278 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
284 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [rdi], xmmD 279 » add» rdi, byte 2*SIZEOF_XMMWORD» ; outptr
285 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
286 movdqa xmmA,xmmF 280 movdqa xmmA,xmmF
287 sub rcx, byte 2*SIZEOF_XMMWORD 281 sub rcx, byte 2*SIZEOF_XMMWORD
288 jmp short .column_st15 282 jmp short .column_st15
289 .column_st16: 283 .column_st16:
290 cmp rcx, byte SIZEOF_XMMWORD 284 cmp rcx, byte SIZEOF_XMMWORD
291 jb short .column_st15 285 jb short .column_st15
292 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA 286 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
293 add rdi, byte SIZEOF_XMMWORD ; outptr 287 add rdi, byte SIZEOF_XMMWORD ; outptr
294 movdqa xmmA,xmmD 288 movdqa xmmA,xmmD
295 sub rcx, byte SIZEOF_XMMWORD 289 sub rcx, byte SIZEOF_XMMWORD
296 .column_st15: 290 .column_st15:
297 %ifdef STRICT_MEMORY_ACCESS
298 ; Store the lower 8 bytes of xmmA to the output when it has enough 291 ; Store the lower 8 bytes of xmmA to the output when it has enough
299 ; space. 292 ; space.
300 cmp rcx, byte SIZEOF_MMWORD 293 cmp rcx, byte SIZEOF_MMWORD
301 jb short .column_st7 294 jb short .column_st7
302 movq MMWORD [rdi], xmmA 295 movq MMWORD [rdi], xmmA
303 add rdi, byte SIZEOF_MMWORD 296 add rdi, byte SIZEOF_MMWORD
304 sub rcx, byte SIZEOF_MMWORD 297 sub rcx, byte SIZEOF_MMWORD
305 psrldq xmmA, SIZEOF_MMWORD 298 psrldq xmmA, SIZEOF_MMWORD
306 .column_st7: 299 .column_st7:
307 ; Store the lower 4 bytes of xmmA to the output when it has enough 300 ; Store the lower 4 bytes of xmmA to the output when it has enough
(...skipping 13 matching lines...) Expand all
321 mov WORD [rdi], ax 314 mov WORD [rdi], ax
322 add rdi, byte SIZEOF_WORD 315 add rdi, byte SIZEOF_WORD
323 sub rcx, byte SIZEOF_WORD 316 sub rcx, byte SIZEOF_WORD
324 shr rax, 16 317 shr rax, 16
325 .column_st1: 318 .column_st1:
326 ; Store the lower 1 byte of rax to the output when it has enough 319 ; Store the lower 1 byte of rax to the output when it has enough
327 ; space. 320 ; space.
328 test rcx, rcx 321 test rcx, rcx
329 jz short .endcolumn 322 jz short .endcolumn
330 mov BYTE [rdi], al 323 mov BYTE [rdi], al
331 %else
332 mov rax,rcx
333 xor rcx, byte 0x0F
334 shl rcx, 2
335 movd xmmB,ecx
336 psrlq xmmH,4
337 pcmpeqb xmmE,xmmE
338 psrlq xmmH,xmmB
339 psrlq xmmE,xmmB
340 punpcklbw xmmE,xmmH
341 ; ----------------
342 mov rcx,rdi
343 and rcx, byte SIZEOF_XMMWORD-1
344 jz short .adj0
345 add rax,rcx
346 cmp rax, byte SIZEOF_XMMWORD
347 ja short .adj0
348 and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
349 shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
350 movdqa xmmG,xmmA
351 movdqa xmmC,xmmE
352 pslldq xmmA, SIZEOF_XMMWORD/2
353 pslldq xmmE, SIZEOF_XMMWORD/2
354 movd xmmD,ecx
355 sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
356 jb short .adj1
357 movd xmmF,ecx
358 psllq xmmA,xmmF
359 psllq xmmE,xmmF
360 jmp short .adj0
361 .adj1: neg rcx
362 movd xmmF,ecx
363 psrlq xmmA,xmmF
364 psrlq xmmE,xmmF
365 psllq xmmG,xmmD
366 psllq xmmC,xmmD
367 por xmmA,xmmG
368 por xmmE,xmmC
369 .adj0: ; ----------------
370 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
371 %endif ; STRICT_MEMORY_ACCESS ; ---------------
372 324
373 %else ; RGB_PIXELSIZE == 4 ; ----------- 325 %else ; RGB_PIXELSIZE == 4 ; -----------
374 326
375 %ifdef RGBX_FILLER_0XFF 327 %ifdef RGBX_FILLER_0XFF
376 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) 328 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
377 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) 329 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
378 %else 330 %else
379 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) 331 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
380 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) 332 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
381 %endif 333 %endif
(...skipping 24 matching lines...) Expand all
406 cmp rcx, byte SIZEOF_XMMWORD 358 cmp rcx, byte SIZEOF_XMMWORD
407 jb short .column_st32 359 jb short .column_st32
408 360
409 test rdi, SIZEOF_XMMWORD-1 361 test rdi, SIZEOF_XMMWORD-1
410 jnz short .out1 362 jnz short .out1
411 ; --(aligned)------------------- 363 ; --(aligned)-------------------
412 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA 364 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
413 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD 365 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
414 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC 366 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
415 movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH 367 movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
416 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
417 jmp short .out0 368 jmp short .out0
418 .out1: ; --(unaligned)----------------- 369 .out1: ; --(unaligned)-----------------
419 » pcmpeqb xmmE,xmmE» » » ; xmmE=(all 1's) 370 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
420 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [rdi], xmmA 371 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
421 » add» rdi, byte SIZEOF_XMMWORD» ; outptr 372 » movdqu» XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
422 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [rdi], xmmD 373 » movdqu» XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
423 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
424 » maskmovdqu xmmC,xmmE» » » ; movntdqu XMMWORD [rdi], xmmC
425 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
426 » maskmovdqu xmmH,xmmE» » » ; movntdqu XMMWORD [rdi], xmmH
427 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
428 .out0: 374 .out0:
375 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
429 sub rcx, byte SIZEOF_XMMWORD 376 sub rcx, byte SIZEOF_XMMWORD
430 jz near .endcolumn 377 jz near .endcolumn
431 378
432 add rsi, byte SIZEOF_XMMWORD ; inptr0 379 add rsi, byte SIZEOF_XMMWORD ; inptr0
433 dec al ; Yctr 380 dec al ; Yctr
434 jnz near .Yloop_2nd 381 jnz near .Yloop_2nd
435 382
436 add rbx, byte SIZEOF_XMMWORD ; inptr1 383 add rbx, byte SIZEOF_XMMWORD ; inptr1
437 add rdx, byte SIZEOF_XMMWORD ; inptr2 384 add rdx, byte SIZEOF_XMMWORD ; inptr2
438 jmp near .columnloop 385 jmp near .columnloop
439 386
440 .column_st32: 387 .column_st32:
441 pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
442 cmp rcx, byte SIZEOF_XMMWORD/2 388 cmp rcx, byte SIZEOF_XMMWORD/2
443 jb short .column_st16 389 jb short .column_st16
444 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [rdi], xmmA 390 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
445 » add» rdi, byte SIZEOF_XMMWORD» ; outptr 391 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
446 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [rdi], xmmD 392 » add» rdi, byte 2*SIZEOF_XMMWORD» ; outptr
447 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
448 movdqa xmmA,xmmC 393 movdqa xmmA,xmmC
449 movdqa xmmD,xmmH 394 movdqa xmmD,xmmH
450 sub rcx, byte SIZEOF_XMMWORD/2 395 sub rcx, byte SIZEOF_XMMWORD/2
451 .column_st16: 396 .column_st16:
452 cmp rcx, byte SIZEOF_XMMWORD/4 397 cmp rcx, byte SIZEOF_XMMWORD/4
453 jb short .column_st15 398 jb short .column_st15
454 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA 399 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
455 add rdi, byte SIZEOF_XMMWORD ; outptr 400 add rdi, byte SIZEOF_XMMWORD ; outptr
456 movdqa xmmA,xmmD 401 movdqa xmmA,xmmD
457 sub rcx, byte SIZEOF_XMMWORD/4 402 sub rcx, byte SIZEOF_XMMWORD/4
458 .column_st15: 403 .column_st15:
459 %ifdef STRICT_MEMORY_ACCESS
460 ; Store two pixels (8 bytes) of xmmA to the output when it has enough 404 ; Store two pixels (8 bytes) of xmmA to the output when it has enough
461 ; space. 405 ; space.
462 cmp rcx, byte SIZEOF_XMMWORD/8 406 cmp rcx, byte SIZEOF_XMMWORD/8
463 jb short .column_st7 407 jb short .column_st7
464 movq MMWORD [rdi], xmmA 408 movq MMWORD [rdi], xmmA
465 add rdi, byte SIZEOF_XMMWORD/8*4 409 add rdi, byte SIZEOF_XMMWORD/8*4
466 sub rcx, byte SIZEOF_XMMWORD/8 410 sub rcx, byte SIZEOF_XMMWORD/8
467 psrldq xmmA, SIZEOF_XMMWORD/8*4 411 psrldq xmmA, SIZEOF_XMMWORD/8*4
468 .column_st7: 412 .column_st7:
469 ; Store one pixel (4 bytes) of xmmA to the output when it has enough 413 ; Store one pixel (4 bytes) of xmmA to the output when it has enough
470 ; space. 414 ; space.
471 test rcx, rcx 415 test rcx, rcx
472 jz short .endcolumn 416 jz short .endcolumn
473 movd DWORD [rdi], xmmA 417 movd DWORD [rdi], xmmA
474 %else
475 cmp rcx, byte SIZEOF_XMMWORD/16
476 jb near .endcolumn
477 mov rax,rcx
478 xor rcx, byte 0x03
479 inc rcx
480 shl rcx, 4
481 movd xmmF,ecx
482 psrlq xmmE,xmmF
483 punpcklbw xmmE,xmmE
484 ; ----------------
485 mov rcx,rdi
486 and rcx, byte SIZEOF_XMMWORD-1
487 jz short .adj0
488 lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
489 cmp rax, byte SIZEOF_XMMWORD
490 ja short .adj0
491 and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
492 shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
493 movdqa xmmB,xmmA
494 movdqa xmmG,xmmE
495 pslldq xmmA, SIZEOF_XMMWORD/2
496 pslldq xmmE, SIZEOF_XMMWORD/2
497 movd xmmC,ecx
498 sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
499 jb short .adj1
500 movd xmmH,ecx
501 psllq xmmA,xmmH
502 psllq xmmE,xmmH
503 jmp short .adj0
504 .adj1: neg rcx
505 movd xmmH,ecx
506 psrlq xmmA,xmmH
507 psrlq xmmE,xmmH
508 psllq xmmB,xmmC
509 psllq xmmG,xmmC
510 por xmmA,xmmB
511 por xmmE,xmmG
512 .adj0: ; ----------------
513 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
514 %endif ; STRICT_MEMORY_ACCESS ; ---------------
515 418
516 %endif ; RGB_PIXELSIZE ; --------------- 419 %endif ; RGB_PIXELSIZE ; ---------------
517 420
518 .endcolumn: 421 .endcolumn:
519 sfence ; flush the write buffer 422 sfence ; flush the write buffer
520 423
521 .return: 424 .return:
522 pop rbx 425 pop rbx
523 uncollect_args 426 uncollect_args
524 mov rsp,rbp ; rsp <- aligned rbp 427 mov rsp,rbp ; rsp <- aligned rbp
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after
626 pop rdx 529 pop rdx
627 530
628 pop rbx 531 pop rbx
629 uncollect_args 532 uncollect_args
630 pop rbp 533 pop rbp
631 ret 534 ret
632 535
633 ; For some reason, the OS X linker does not honor the request to align the 536 ; For some reason, the OS X linker does not honor the request to align the
634 ; segment unless we do this. 537 ; segment unless we do this.
635 align 16 538 align 16
OLDNEW
« no previous file with comments | « simd/jdmrgss2.asm ('k') | simd/jsimdcfg.inc.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698