Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(38)

Side by Side Diff: simd/jdclrss2-64.asm

Issue 10700197: Update libjpeg-turbo to r856. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libjpeg_turbo/
Patch Set: Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « simd/jdclrss2.asm ('k') | simd/jdmrgss2.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 ; 1 ;
2 ; jdclrss2-64.asm - colorspace conversion (64-bit SSE2) 2 ; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
3 ; 3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 4 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright 2009 D. R. Commander 5 ; Copyright 2009 D. R. Commander
6 ; 6 ;
7 ; Based on 7 ; Based on
8 ; x86 SIMD extension for IJG JPEG library 8 ; x86 SIMD extension for IJG JPEG library
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11 ; 11 ;
12 ; This file should be assembled with NASM (Netwide Assembler), 12 ; This file should be assembled with NASM (Netwide Assembler),
13 ; can *not* be assembled with Microsoft's MASM or any compatible 13 ; can *not* be assembled with Microsoft's MASM or any compatible
14 ; assembler (including Borland's Turbo Assembler). 14 ; assembler (including Borland's Turbo Assembler).
(...skipping 229 matching lines...) Expand 10 before | Expand all | Expand 10 after
244 244
245 cmp rcx, byte SIZEOF_XMMWORD 245 cmp rcx, byte SIZEOF_XMMWORD
246 jb short .column_st32 246 jb short .column_st32
247 247
248 test rdi, SIZEOF_XMMWORD-1 248 test rdi, SIZEOF_XMMWORD-1
249 jnz short .out1 249 jnz short .out1
250 ; --(aligned)------------------- 250 ; --(aligned)-------------------
251 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA 251 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
252 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD 252 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
253 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF 253 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
254 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
255 jmp short .out0 254 jmp short .out0
256 .out1: ; --(unaligned)----------------- 255 .out1: ; --(unaligned)-----------------
257 » pcmpeqb xmmH,xmmH» » » ; xmmH=(all 1's) 256 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
258 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA 257 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
259 » add» rdi, byte SIZEOF_XMMWORD» ; outptr 258 » movdqu» XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
260 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [rdi], xmmD
261 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
262 » maskmovdqu xmmF,xmmH» » » ; movntdqu XMMWORD [rdi], xmmF
263 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
264 .out0: 259 .out0:
260 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
265 sub rcx, byte SIZEOF_XMMWORD 261 sub rcx, byte SIZEOF_XMMWORD
266 jz near .nextrow 262 jz near .nextrow
267 263
268 add rsi, byte SIZEOF_XMMWORD ; inptr0 264 add rsi, byte SIZEOF_XMMWORD ; inptr0
269 add rbx, byte SIZEOF_XMMWORD ; inptr1 265 add rbx, byte SIZEOF_XMMWORD ; inptr1
270 add rdx, byte SIZEOF_XMMWORD ; inptr2 266 add rdx, byte SIZEOF_XMMWORD ; inptr2
271 jmp near .columnloop 267 jmp near .columnloop
272 268
273 .column_st32: 269 .column_st32:
274 pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
275 lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE 270 lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
276 cmp rcx, byte 2*SIZEOF_XMMWORD 271 cmp rcx, byte 2*SIZEOF_XMMWORD
277 jb short .column_st16 272 jb short .column_st16
278 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA 273 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
279 » add» rdi, byte SIZEOF_XMMWORD» ; outptr 274 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
280 » maskmovdqu xmmD,xmmH» » » ; movntdqu XMMWORD [rdi], xmmD 275 » add» rdi, byte 2*SIZEOF_XMMWORD» ; outptr
281 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
282 movdqa xmmA,xmmF 276 movdqa xmmA,xmmF
283 sub rcx, byte 2*SIZEOF_XMMWORD 277 sub rcx, byte 2*SIZEOF_XMMWORD
284 jmp short .column_st15 278 jmp short .column_st15
285 .column_st16: 279 .column_st16:
286 cmp rcx, byte SIZEOF_XMMWORD 280 cmp rcx, byte SIZEOF_XMMWORD
287 jb short .column_st15 281 jb short .column_st15
288 » maskmovdqu xmmA,xmmH» » » ; movntdqu XMMWORD [rdi], xmmA 282 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
289 add rdi, byte SIZEOF_XMMWORD ; outptr 283 add rdi, byte SIZEOF_XMMWORD ; outptr
290 movdqa xmmA,xmmD 284 movdqa xmmA,xmmD
291 sub rcx, byte SIZEOF_XMMWORD 285 sub rcx, byte SIZEOF_XMMWORD
292 .column_st15: 286 .column_st15:
293 %ifdef STRICT_MEMORY_ACCESS
294 ; Store the lower 8 bytes of xmmA to the output when it has enough 287 ; Store the lower 8 bytes of xmmA to the output when it has enough
295 ; space. 288 ; space.
296 cmp rcx, byte SIZEOF_MMWORD 289 cmp rcx, byte SIZEOF_MMWORD
297 jb short .column_st7 290 jb short .column_st7
298 movq MMWORD [rdi], xmmA 291 movq MMWORD [rdi], xmmA
299 add rdi, byte SIZEOF_MMWORD 292 add rdi, byte SIZEOF_MMWORD
300 sub rcx, byte SIZEOF_MMWORD 293 sub rcx, byte SIZEOF_MMWORD
301 psrldq xmmA, SIZEOF_MMWORD 294 psrldq xmmA, SIZEOF_MMWORD
302 .column_st7: 295 .column_st7:
303 ; Store the lower 4 bytes of xmmA to the output when it has enough 296 ; Store the lower 4 bytes of xmmA to the output when it has enough
(...skipping 13 matching lines...) Expand all
317 mov WORD [rdi], ax 310 mov WORD [rdi], ax
318 add rdi, byte SIZEOF_WORD 311 add rdi, byte SIZEOF_WORD
319 sub rcx, byte SIZEOF_WORD 312 sub rcx, byte SIZEOF_WORD
320 shr rax, 16 313 shr rax, 16
321 .column_st1: 314 .column_st1:
322 ; Store the lower 1 byte of rax to the output when it has enough 315 ; Store the lower 1 byte of rax to the output when it has enough
323 ; space. 316 ; space.
324 test rcx, rcx 317 test rcx, rcx
325 jz short .nextrow 318 jz short .nextrow
326 mov BYTE [rdi], al 319 mov BYTE [rdi], al
327 %else
328 mov rax,rcx
329 xor rcx, byte 0x0F
330 shl rcx, 2
331 movd xmmB,ecx
332 psrlq xmmH,4
333 pcmpeqb xmmE,xmmE
334 psrlq xmmH,xmmB
335 psrlq xmmE,xmmB
336 punpcklbw xmmE,xmmH
337 ; ----------------
338 mov rcx,rdi
339 and rcx, byte SIZEOF_XMMWORD-1
340 jz short .adj0
341 add rax,rcx
342 cmp rax, byte SIZEOF_XMMWORD
343 ja short .adj0
344 and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
345 shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,rcx
346 movdqa xmmG,xmmA
347 movdqa xmmC,xmmE
348 pslldq xmmA, SIZEOF_XMMWORD/2
349 pslldq xmmE, SIZEOF_XMMWORD/2
350 movd xmmD,ecx
351 sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
352 jb short .adj1
353 movd xmmF,ecx
354 psllq xmmA,xmmF
355 psllq xmmE,xmmF
356 jmp short .adj0
357 .adj1: neg ecx
358 movd xmmF,ecx
359 psrlq xmmA,xmmF
360 psrlq xmmE,xmmF
361 psllq xmmG,xmmD
362 psllq xmmC,xmmD
363 por xmmA,xmmG
364 por xmmE,xmmC
365 .adj0: ; ----------------
366 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
367 %endif ; STRICT_MEMORY_ACCESS ; ---------------
368 320
369 %else ; RGB_PIXELSIZE == 4 ; ----------- 321 %else ; RGB_PIXELSIZE == 4 ; -----------
370 322
371 %ifdef RGBX_FILLER_0XFF 323 %ifdef RGBX_FILLER_0XFF
372 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) 324 pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
373 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) 325 pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
374 %else 326 %else
375 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) 327 pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
376 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) 328 pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
377 %endif 329 %endif
(...skipping 24 matching lines...) Expand all
402 cmp rcx, byte SIZEOF_XMMWORD 354 cmp rcx, byte SIZEOF_XMMWORD
403 jb short .column_st32 355 jb short .column_st32
404 356
405 test rdi, SIZEOF_XMMWORD-1 357 test rdi, SIZEOF_XMMWORD-1
406 jnz short .out1 358 jnz short .out1
407 ; --(aligned)------------------- 359 ; --(aligned)-------------------
408 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA 360 movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
409 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD 361 movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
410 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC 362 movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
411 movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH 363 movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
412 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
413 jmp short .out0 364 jmp short .out0
414 .out1: ; --(unaligned)----------------- 365 .out1: ; --(unaligned)-----------------
415 » pcmpeqb xmmE,xmmE» » » ; xmmE=(all 1's) 366 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
416 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [rdi], xmmA 367 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
417 » add» rdi, byte SIZEOF_XMMWORD» ; outptr 368 » movdqu» XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
418 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [rdi], xmmD 369 » movdqu» XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
419 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
420 » maskmovdqu xmmC,xmmE» » » ; movntdqu XMMWORD [rdi], xmmC
421 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
422 » maskmovdqu xmmH,xmmE» » » ; movntdqu XMMWORD [rdi], xmmH
423 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
424 .out0: 370 .out0:
371 add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
425 sub rcx, byte SIZEOF_XMMWORD 372 sub rcx, byte SIZEOF_XMMWORD
426 jz near .nextrow 373 jz near .nextrow
427 374
428 add rsi, byte SIZEOF_XMMWORD ; inptr0 375 add rsi, byte SIZEOF_XMMWORD ; inptr0
429 add rbx, byte SIZEOF_XMMWORD ; inptr1 376 add rbx, byte SIZEOF_XMMWORD ; inptr1
430 add rdx, byte SIZEOF_XMMWORD ; inptr2 377 add rdx, byte SIZEOF_XMMWORD ; inptr2
431 jmp near .columnloop 378 jmp near .columnloop
432 379
433 .column_st32: 380 .column_st32:
434 pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
435 cmp rcx, byte SIZEOF_XMMWORD/2 381 cmp rcx, byte SIZEOF_XMMWORD/2
436 jb short .column_st16 382 jb short .column_st16
437 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [rdi], xmmA 383 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
438 » add» rdi, byte SIZEOF_XMMWORD» ; outptr 384 » movdqu» XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
439 » maskmovdqu xmmD,xmmE» » » ; movntdqu XMMWORD [rdi], xmmD 385 » add» rdi, byte 2*SIZEOF_XMMWORD» ; outptr
440 » add» rdi, byte SIZEOF_XMMWORD» ; outptr
441 movdqa xmmA,xmmC 386 movdqa xmmA,xmmC
442 movdqa xmmD,xmmH 387 movdqa xmmD,xmmH
443 sub rcx, byte SIZEOF_XMMWORD/2 388 sub rcx, byte SIZEOF_XMMWORD/2
444 .column_st16: 389 .column_st16:
445 cmp rcx, byte SIZEOF_XMMWORD/4 390 cmp rcx, byte SIZEOF_XMMWORD/4
446 jb short .column_st15 391 jb short .column_st15
447 » maskmovdqu xmmA,xmmE» » » ; movntdqu XMMWORD [rdi], xmmA 392 » movdqu» XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
448 add rdi, byte SIZEOF_XMMWORD ; outptr 393 add rdi, byte SIZEOF_XMMWORD ; outptr
449 movdqa xmmA,xmmD 394 movdqa xmmA,xmmD
450 sub rcx, byte SIZEOF_XMMWORD/4 395 sub rcx, byte SIZEOF_XMMWORD/4
451 .column_st15: 396 .column_st15:
452 %ifdef STRICT_MEMORY_ACCESS
453 ; Store two pixels (8 bytes) of xmmA to the output when it has enough 397 ; Store two pixels (8 bytes) of xmmA to the output when it has enough
454 ; space. 398 ; space.
455 cmp rcx, byte SIZEOF_XMMWORD/8 399 cmp rcx, byte SIZEOF_XMMWORD/8
456 jb short .column_st7 400 jb short .column_st7
457 movq MMWORD [rdi], xmmA 401 movq MMWORD [rdi], xmmA
458 add rdi, byte SIZEOF_XMMWORD/8*4 402 add rdi, byte SIZEOF_XMMWORD/8*4
459 sub rcx, byte SIZEOF_XMMWORD/8 403 sub rcx, byte SIZEOF_XMMWORD/8
460 psrldq xmmA, SIZEOF_XMMWORD/8*4 404 psrldq xmmA, SIZEOF_XMMWORD/8*4
461 .column_st7: 405 .column_st7:
462 ; Store one pixel (4 bytes) of xmmA to the output when it has enough 406 ; Store one pixel (4 bytes) of xmmA to the output when it has enough
463 ; space. 407 ; space.
464 test rcx, rcx 408 test rcx, rcx
465 jz short .nextrow 409 jz short .nextrow
466 movd DWORD [rdi], xmmA 410 movd DWORD [rdi], xmmA
467 %else
468 cmp rcx, byte SIZEOF_XMMWORD/16
469 jb near .nextrow
470 mov rax,rcx
471 xor rcx, byte 0x03
472 inc rcx
473 shl rcx, 4
474 movd xmmF,ecx
475 psrlq xmmE,xmmF
476 punpcklbw xmmE,xmmE
477 ; ----------------
478 mov rcx,rdi
479 and rcx, byte SIZEOF_XMMWORD-1
480 jz short .adj0
481 lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
482 cmp rax, byte SIZEOF_XMMWORD
483 ja short .adj0
484 and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
485 shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
486 movdqa xmmB,xmmA
487 movdqa xmmG,xmmE
488 pslldq xmmA, SIZEOF_XMMWORD/2
489 pslldq xmmE, SIZEOF_XMMWORD/2
490 movd xmmC,ecx
491 sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
492 jb short .adj1
493 movd xmmH,ecx
494 psllq xmmA,xmmH
495 psllq xmmE,xmmH
496 jmp short .adj0
497 .adj1: neg rcx
498 movd xmmH,ecx
499 psrlq xmmA,xmmH
500 psrlq xmmE,xmmH
501 psllq xmmB,xmmC
502 psllq xmmG,xmmC
503 por xmmA,xmmB
504 por xmmE,xmmG
505 .adj0: ; ----------------
506 maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
507 %endif ; STRICT_MEMORY_ACCESS ; ---------------
508 411
509 %endif ; RGB_PIXELSIZE ; --------------- 412 %endif ; RGB_PIXELSIZE ; ---------------
510 413
511 .nextrow: 414 .nextrow:
512 pop rcx 415 pop rcx
513 pop rsi 416 pop rsi
514 pop rbx 417 pop rbx
515 pop rdx 418 pop rdx
516 pop rdi 419 pop rdi
517 pop rax 420 pop rax
(...skipping 11 matching lines...) Expand all
529 pop rbx 432 pop rbx
530 uncollect_args 433 uncollect_args
531 mov rsp,rbp ; rsp <- aligned rbp 434 mov rsp,rbp ; rsp <- aligned rbp
532 pop rsp ; rsp <- original rbp 435 pop rsp ; rsp <- original rbp
533 pop rbp 436 pop rbp
534 ret 437 ret
535 438
536 ; For some reason, the OS X linker does not honor the request to align the 439 ; For some reason, the OS X linker does not honor the request to align the
537 ; segment unless we do this. 440 ; segment unless we do this.
538 align 16 441 align 16
OLDNEW
« no previous file with comments | « simd/jdclrss2.asm ('k') | simd/jdmrgss2.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698