Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(68)

Side by Side Diff: simd/jdmrgss2.asm

Issue 10700197: Update libjpeg-turbo to r856. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libjpeg_turbo/
Patch Set: Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « simd/jdclrss2-64.asm ('k') | simd/jdmrgss2-64.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 ; 1 ;
2 ; jdmrgss2.asm - merged upsampling/color conversion (SSE2) 2 ; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
3 ; 3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 4 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; 5 ;
6 ; Based on 6 ; Based on
7 ; x86 SIMD extension for IJG JPEG library 7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
10 ; 10 ;
11 ; This file should be assembled with NASM (Netwide Assembler), 11 ; This file should be assembled with NASM (Netwide Assembler),
12 ; can *not* be assembled with Microsoft's MASM or any compatible 12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler). 13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or 14 ; NASM is available from http://nasm.sourceforge.net/ or
(...skipping 242 matching lines...) Expand 10 before | Expand all | Expand 10 after
257 257
258 cmp ecx, byte SIZEOF_XMMWORD 258 cmp ecx, byte SIZEOF_XMMWORD
259 jb short .column_st32 259 jb short .column_st32
260 260
261 test edi, SIZEOF_XMMWORD-1 261 test edi, SIZEOF_XMMWORD-1
262 jnz short .out1 262 jnz short .out1
263 ; --(aligned)------------------- 263 ; --(aligned)-------------------
264 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA 264 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
265 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD 265 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
266 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF 266 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
267 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
268 jmp short .out0 267 jmp short .out0
269 .out1: ; --(unaligned)----------------- 268 .out1: ; --(unaligned)-----------------
270 » pcmpeqb xmmH,xmmH» » » ; xmmH=(all 1's) 269 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
271 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [edi], xmmA 270 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
272 » add» edi, byte SIZEOF_XMMWORD» ; outptr 271 » movdqu» XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
273 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [edi], xmmD
274 » add» edi, byte SIZEOF_XMMWORD» ; outptr
275 » maskmovdqu xmmF,xmmH» » » ; movntdqu XMMWORD [edi], xmmF
276 » add» edi, byte SIZEOF_XMMWORD» ; outptr
277 .out0: 272 .out0:
273 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
278 sub ecx, byte SIZEOF_XMMWORD 274 sub ecx, byte SIZEOF_XMMWORD
279 jz near .endcolumn 275 jz near .endcolumn
280 276
281 add esi, byte SIZEOF_XMMWORD ; inptr0 277 add esi, byte SIZEOF_XMMWORD ; inptr0
282 dec al ; Yctr 278 dec al ; Yctr
283 jnz near .Yloop_2nd 279 jnz near .Yloop_2nd
284 280
285 add ebx, byte SIZEOF_XMMWORD ; inptr1 281 add ebx, byte SIZEOF_XMMWORD ; inptr1
286 add edx, byte SIZEOF_XMMWORD ; inptr2 282 add edx, byte SIZEOF_XMMWORD ; inptr2
287 jmp near .columnloop 283 jmp near .columnloop
288 alignx 16,7 284 alignx 16,7
289 285
290 .column_st32: 286 .column_st32:
291 pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
292 lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE 287 lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
293 cmp ecx, byte 2*SIZEOF_XMMWORD 288 cmp ecx, byte 2*SIZEOF_XMMWORD
294 jb short .column_st16 289 jb short .column_st16
295 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [edi], xmmA 290 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
296 » add» edi, byte SIZEOF_XMMWORD» ; outptr 291 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
297 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [edi], xmmD 292 » add» edi, byte 2*SIZEOF_XMMWORD» ; outptr
298 » add» edi, byte SIZEOF_XMMWORD» ; outptr
299 movdqa xmmA,xmmF 293 movdqa xmmA,xmmF
300 sub ecx, byte 2*SIZEOF_XMMWORD 294 sub ecx, byte 2*SIZEOF_XMMWORD
301 jmp short .column_st15 295 jmp short .column_st15
302 .column_st16: 296 .column_st16:
303 cmp ecx, byte SIZEOF_XMMWORD 297 cmp ecx, byte SIZEOF_XMMWORD
304 jb short .column_st15 298 jb short .column_st15
305 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [edi], xmmA 299 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
306 add edi, byte SIZEOF_XMMWORD ; outptr 300 add edi, byte SIZEOF_XMMWORD ; outptr
307 movdqa xmmA,xmmD 301 movdqa xmmA,xmmD
308 sub ecx, byte SIZEOF_XMMWORD 302 sub ecx, byte SIZEOF_XMMWORD
309 .column_st15: 303 .column_st15:
310 %ifdef STRICT_MEMORY_ACCESS
311 ; Store the lower 8 bytes of xmmA to the output when it has enough 304 ; Store the lower 8 bytes of xmmA to the output when it has enough
312 ; space. 305 ; space.
313 cmp ecx, byte SIZEOF_MMWORD 306 cmp ecx, byte SIZEOF_MMWORD
314 jb short .column_st7 307 jb short .column_st7
315 movq MMWORD [edi], xmmA 308 movq MMWORD [edi], xmmA
316 add edi, byte SIZEOF_MMWORD 309 add edi, byte SIZEOF_MMWORD
317 sub ecx, byte SIZEOF_MMWORD 310 sub ecx, byte SIZEOF_MMWORD
318 psrldq xmmA, SIZEOF_MMWORD 311 psrldq xmmA, SIZEOF_MMWORD
319 .column_st7: 312 .column_st7:
320 ; Store the lower 4 bytes of xmmA to the output when it has enough 313 ; Store the lower 4 bytes of xmmA to the output when it has enough
(...skipping 13 matching lines...) Expand all
334 mov WORD [edi], ax 327 mov WORD [edi], ax
335 add edi, byte SIZEOF_WORD 328 add edi, byte SIZEOF_WORD
336 sub ecx, byte SIZEOF_WORD 329 sub ecx, byte SIZEOF_WORD
337 shr eax, 16 330 shr eax, 16
338 .column_st1: 331 .column_st1:
339 ; Store the lower 1 byte of eax to the output when it has enough 332 ; Store the lower 1 byte of eax to the output when it has enough
340 ; space. 333 ; space.
341 test ecx, ecx 334 test ecx, ecx
342 jz short .endcolumn 335 jz short .endcolumn
343 mov BYTE [edi], al 336 mov BYTE [edi], al
344 %else
345 mov eax,ecx
346 xor ecx, byte 0x0F
347 shl ecx, 2
348 movd xmmB,ecx
349 psrlq xmmH,4
350 pcmpeqb xmmE,xmmE
351 psrlq xmmH,xmmB
352 psrlq xmmE,xmmB
353 punpcklbw xmmE,xmmH
354 ; ----------------
355 mov ecx,edi
356 and ecx, byte SIZEOF_XMMWORD-1
357 jz short .adj0
358 add eax,ecx
359 cmp eax, byte SIZEOF_XMMWORD
360 ja short .adj0
361 and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
362 shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
363 movdqa xmmG,xmmA
364 movdqa xmmC,xmmE
365 pslldq xmmA, SIZEOF_XMMWORD/2
366 pslldq xmmE, SIZEOF_XMMWORD/2
367 movd xmmD,ecx
368 sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
369 jb short .adj1
370 movd xmmF,ecx
371 psllq xmmA,xmmF
372 psllq xmmE,xmmF
373 jmp short .adj0
374 .adj1: neg ecx
375 movd xmmF,ecx
376 psrlq xmmA,xmmF
377 psrlq xmmE,xmmF
378 psllq xmmG,xmmD
379 psllq xmmC,xmmD
380 por xmmA,xmmG
381 por xmmE,xmmC
382 .adj0: ; ----------------
383 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
384 %endif ; STRICT_MEMORY_ACCESS ; ---------------
385 337
386 %else ; RGB_PIXELSIZE == 4 ; ----------- 338 %else ; RGB_PIXELSIZE == 4 ; -----------
387 339
388 %ifdef RGBX_FILLER_0XFF 340 %ifdef RGBX_FILLER_0XFF
389 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) 341 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
390 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) 342 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
391 %else 343 %else
392 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) 344 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
393 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) 345 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
394 %endif 346 %endif
(...skipping 24 matching lines...) Expand all
419 cmp ecx, byte SIZEOF_XMMWORD 371 cmp ecx, byte SIZEOF_XMMWORD
420 jb short .column_st32 372 jb short .column_st32
421 373
422 test edi, SIZEOF_XMMWORD-1 374 test edi, SIZEOF_XMMWORD-1
423 jnz short .out1 375 jnz short .out1
424 ; --(aligned)------------------- 376 ; --(aligned)-------------------
425 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA 377 movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
426 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD 378 movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
427 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC 379 movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
428 movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH 380 movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
429 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
430 jmp short .out0 381 jmp short .out0
431 .out1: ; --(unaligned)----------------- 382 .out1: ; --(unaligned)-----------------
432 » pcmpeqb xmmE,xmmE» » » ; xmmE=(all 1's) 383 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
433 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA 384 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
434 » add» edi, byte SIZEOF_XMMWORD» ; outptr 385 » movdqu» XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
435 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [edi], xmmD 386 » movdqu» XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
436 » add» edi, byte SIZEOF_XMMWORD» ; outptr
437 » maskmovdqu xmmC,xmmE» » » ; movntdqu XMMWORD [edi], xmmC
438 » add» edi, byte SIZEOF_XMMWORD» ; outptr
439 » maskmovdqu xmmH,xmmE» » » ; movntdqu XMMWORD [edi], xmmH
440 » add» edi, byte SIZEOF_XMMWORD» ; outptr
441 .out0: 387 .out0:
388 add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
442 sub ecx, byte SIZEOF_XMMWORD 389 sub ecx, byte SIZEOF_XMMWORD
443 jz near .endcolumn 390 jz near .endcolumn
444 391
445 add esi, byte SIZEOF_XMMWORD ; inptr0 392 add esi, byte SIZEOF_XMMWORD ; inptr0
446 dec al ; Yctr 393 dec al ; Yctr
447 jnz near .Yloop_2nd 394 jnz near .Yloop_2nd
448 395
449 add ebx, byte SIZEOF_XMMWORD ; inptr1 396 add ebx, byte SIZEOF_XMMWORD ; inptr1
450 add edx, byte SIZEOF_XMMWORD ; inptr2 397 add edx, byte SIZEOF_XMMWORD ; inptr2
451 jmp near .columnloop 398 jmp near .columnloop
452 alignx 16,7 399 alignx 16,7
453 400
454 .column_st32: 401 .column_st32:
455 pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
456 cmp ecx, byte SIZEOF_XMMWORD/2 402 cmp ecx, byte SIZEOF_XMMWORD/2
457 jb short .column_st16 403 jb short .column_st16
458 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA 404 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
459 » add» edi, byte SIZEOF_XMMWORD» ; outptr 405 » movdqu» XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
460 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [edi], xmmD 406 » add» edi, byte 2*SIZEOF_XMMWORD» ; outptr
461 » add» edi, byte SIZEOF_XMMWORD» ; outptr
462 movdqa xmmA,xmmC 407 movdqa xmmA,xmmC
463 movdqa xmmD,xmmH 408 movdqa xmmD,xmmH
464 sub ecx, byte SIZEOF_XMMWORD/2 409 sub ecx, byte SIZEOF_XMMWORD/2
465 .column_st16: 410 .column_st16:
466 cmp ecx, byte SIZEOF_XMMWORD/4 411 cmp ecx, byte SIZEOF_XMMWORD/4
467 jb short .column_st15 412 jb short .column_st15
468 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [edi], xmmA 413 » movdqu» XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
469 add edi, byte SIZEOF_XMMWORD ; outptr 414 add edi, byte SIZEOF_XMMWORD ; outptr
470 movdqa xmmA,xmmD 415 movdqa xmmA,xmmD
471 sub ecx, byte SIZEOF_XMMWORD/4 416 sub ecx, byte SIZEOF_XMMWORD/4
472 .column_st15: 417 .column_st15:
473 %ifdef STRICT_MEMORY_ACCESS
474 ; Store two pixels (8 bytes) of xmmA to the output when it has enough 418 ; Store two pixels (8 bytes) of xmmA to the output when it has enough
475 ; space. 419 ; space.
476 cmp ecx, byte SIZEOF_XMMWORD/8 420 cmp ecx, byte SIZEOF_XMMWORD/8
477 jb short .column_st7 421 jb short .column_st7
478 movq MMWORD [edi], xmmA 422 movq MMWORD [edi], xmmA
479 » add» edi, byte SIZEOF_XMMWORD/2 423 » add» edi, byte SIZEOF_XMMWORD/8*4
480 sub ecx, byte SIZEOF_XMMWORD/8 424 sub ecx, byte SIZEOF_XMMWORD/8
481 » psrldq» xmmA, 64 425 » psrldq» xmmA, SIZEOF_XMMWORD/8*4
482 .column_st7: 426 .column_st7:
483 ; Store one pixel (4 bytes) of xmmA to the output when it has enough 427 ; Store one pixel (4 bytes) of xmmA to the output when it has enough
484 ; space. 428 ; space.
485 test ecx, ecx 429 test ecx, ecx
486 jz short .endcolumn 430 jz short .endcolumn
487 movd DWORD [edi], xmmA 431 movd DWORD [edi], xmmA
488 %else
489 cmp ecx, byte SIZEOF_XMMWORD/16
490 jb short .endcolumn
491 mov eax,ecx
492 xor ecx, byte 0x03
493 inc ecx
494 shl ecx, 4
495 movd xmmF,ecx
496 psrlq xmmE,xmmF
497 punpcklbw xmmE,xmmE
498 ; ----------------
499 mov ecx,edi
500 and ecx, byte SIZEOF_XMMWORD-1
501 jz short .adj0
502 lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
503 cmp eax, byte SIZEOF_XMMWORD
504 ja short .adj0
505 and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
506 shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
507 movdqa xmmB,xmmA
508 movdqa xmmG,xmmE
509 pslldq xmmA, SIZEOF_XMMWORD/2
510 pslldq xmmE, SIZEOF_XMMWORD/2
511 movd xmmC,ecx
512 sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
513 jb short .adj1
514 movd xmmH,ecx
515 psllq xmmA,xmmH
516 psllq xmmE,xmmH
517 jmp short .adj0
518 .adj1: neg ecx
519 movd xmmH,ecx
520 psrlq xmmA,xmmH
521 psrlq xmmE,xmmH
522 psllq xmmB,xmmC
523 psllq xmmG,xmmC
524 por xmmA,xmmB
525 por xmmE,xmmG
526 .adj0: ; ----------------
527 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
528 %endif ; STRICT_MEMORY_ACCESS ; ---------------
529 432
530 %endif ; RGB_PIXELSIZE ; --------------- 433 %endif ; RGB_PIXELSIZE ; ---------------
531 434
532 .endcolumn: 435 .endcolumn:
533 sfence ; flush the write buffer 436 sfence ; flush the write buffer
534 437
535 .return: 438 .return:
536 pop edi 439 pop edi
537 pop esi 440 pop esi
538 ; pop edx ; need not be preserved 441 ; pop edx ; need not be preserved
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
606 pop esi 509 pop esi
607 ; pop edx ; need not be preserved 510 ; pop edx ; need not be preserved
608 ; pop ecx ; need not be preserved 511 ; pop ecx ; need not be preserved
609 pop ebx 512 pop ebx
610 pop ebp 513 pop ebp
611 ret 514 ret
612 515
613 ; For some reason, the OS X linker does not honor the request to align the 516 ; For some reason, the OS X linker does not honor the request to align the
614 ; segment unless we do this. 517 ; segment unless we do this.
615 align 16 518 align 16
OLDNEW
« no previous file with comments | « simd/jdclrss2-64.asm ('k') | simd/jdmrgss2-64.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698