OLD | NEW |
| (Empty) |
1 // qcms | |
2 // Copyright (C) 2009 Mozilla Foundation | |
3 // | |
4 // Permission is hereby granted, free of charge, to any person obtaining | |
5 // a copy of this software and associated documentation files (the "Software"), | |
6 // to deal in the Software without restriction, including without limitation | |
7 // the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 // and/or sell copies of the Software, and to permit persons to whom the Software | |
9 // is furnished to do so, subject to the following conditions: | |
10 // | |
11 // The above copyright notice and this permission notice shall be included in | |
12 // all copies or substantial portions of the Software. | |
13 // | |
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
15 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO | |
16 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
17 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | |
18 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
19 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | |
20 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
21 | |
22 #include <emmintrin.h> | |
23 | |
24 #include "qcmsint.h" | |
25 | |
26 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequ
ence */ | |
27 #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE) | |
28 #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZ
E ) | |
29 static const ALIGN float floatScaleX4[4] = | |
30 { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE}; | |
31 static const ALIGN float clampMaxValueX4[4] = | |
32 { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL}; | |
33 | |
34 void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, | |
35 unsigned char *src, | |
36 unsigned char *dest, | |
37 size_t length) | |
38 { | |
39 unsigned int i; | |
40 float (*mat)[4] = transform->matrix; | |
41 char input_back[32]; | |
42 /* Ensure we have a buffer that's 16 byte aligned regardless of the original | |
43 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(
align(32)) | |
44 * because they don't work on stack variables. gcc 4.4 does do the right thi
ng | |
45 * on x86 but that's too new for us right now. For more info: gcc bug #16660
*/ | |
46 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); | |
47 /* share input and output locations to save having to keep the | |
48 * locations in separate registers */ | |
49 uint32_t const * output = (uint32_t*)input; | |
50 | |
51 /* deref *transform now to avoid it in loop */ | |
52 const float *igtbl_r = transform->input_gamma_table_r; | |
53 const float *igtbl_g = transform->input_gamma_table_g; | |
54 const float *igtbl_b = transform->input_gamma_table_b; | |
55 | |
56 /* deref *transform now to avoid it in loop */ | |
57 const uint8_t *otdata_r = &transform->output_table_r->data[0]; | |
58 const uint8_t *otdata_g = &transform->output_table_g->data[0]; | |
59 const uint8_t *otdata_b = &transform->output_table_b->data[0]; | |
60 | |
61 /* input matrix values never change */ | |
62 const __m128 mat0 = _mm_load_ps(mat[0]); | |
63 const __m128 mat1 = _mm_load_ps(mat[1]); | |
64 const __m128 mat2 = _mm_load_ps(mat[2]); | |
65 | |
66 /* these values don't change, either */ | |
67 const __m128 max = _mm_load_ps(clampMaxValueX4); | |
68 const __m128 min = _mm_setzero_ps(); | |
69 const __m128 scale = _mm_load_ps(floatScaleX4); | |
70 | |
71 /* working variables */ | |
72 __m128 vec_r, vec_g, vec_b, result; | |
73 | |
74 /* CYA */ | |
75 if (!length) | |
76 return; | |
77 | |
78 /* one pixel is handled outside of the loop */ | |
79 length--; | |
80 | |
81 /* setup for transforming 1st pixel */ | |
82 vec_r = _mm_load_ss(&igtbl_r[src[0]]); | |
83 vec_g = _mm_load_ss(&igtbl_g[src[1]]); | |
84 vec_b = _mm_load_ss(&igtbl_b[src[2]]); | |
85 src += 3; | |
86 | |
87 /* transform all but final pixel */ | |
88 | |
89 for (i=0; i<length; i++) | |
90 { | |
91 /* position values from gamma tables */ | |
92 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | |
93 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | |
94 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | |
95 | |
96 /* gamma * matrix */ | |
97 vec_r = _mm_mul_ps(vec_r, mat0); | |
98 vec_g = _mm_mul_ps(vec_g, mat1); | |
99 vec_b = _mm_mul_ps(vec_b, mat2); | |
100 | |
101 /* crunch, crunch, crunch */ | |
102 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | |
103 vec_r = _mm_max_ps(min, vec_r); | |
104 vec_r = _mm_min_ps(max, vec_r); | |
105 result = _mm_mul_ps(vec_r, scale); | |
106 | |
107 /* store calc'd output tables indices */ | |
108 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | |
109 | |
110 /* load for next loop while store completes */ | |
111 vec_r = _mm_load_ss(&igtbl_r[src[0]]); | |
112 vec_g = _mm_load_ss(&igtbl_g[src[1]]); | |
113 vec_b = _mm_load_ss(&igtbl_b[src[2]]); | |
114 src += 3; | |
115 | |
116 /* use calc'd indices to output RGB values */ | |
117 dest[0] = otdata_r[output[0]]; | |
118 dest[1] = otdata_g[output[1]]; | |
119 dest[2] = otdata_b[output[2]]; | |
120 dest += 3; | |
121 } | |
122 | |
123 /* handle final (maybe only) pixel */ | |
124 | |
125 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | |
126 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | |
127 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | |
128 | |
129 vec_r = _mm_mul_ps(vec_r, mat0); | |
130 vec_g = _mm_mul_ps(vec_g, mat1); | |
131 vec_b = _mm_mul_ps(vec_b, mat2); | |
132 | |
133 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | |
134 vec_r = _mm_max_ps(min, vec_r); | |
135 vec_r = _mm_min_ps(max, vec_r); | |
136 result = _mm_mul_ps(vec_r, scale); | |
137 | |
138 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | |
139 | |
140 dest[0] = otdata_r[output[0]]; | |
141 dest[1] = otdata_g[output[1]]; | |
142 dest[2] = otdata_b[output[2]]; | |
143 } | |
144 | |
145 void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform, | |
146 unsigned char *src, | |
147 unsigned char *dest, | |
148 size_t length) | |
149 { | |
150 unsigned int i; | |
151 float (*mat)[4] = transform->matrix; | |
152 char input_back[32]; | |
153 /* Ensure we have a buffer that's 16 byte aligned regardless of the original | |
154 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(
align(32)) | |
155 * because they don't work on stack variables. gcc 4.4 does do the right thi
ng | |
156 * on x86 but that's too new for us right now. For more info: gcc bug #16660
*/ | |
157 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); | |
158 /* share input and output locations to save having to keep the | |
159 * locations in separate registers */ | |
160 uint32_t const * output = (uint32_t*)input; | |
161 | |
162 /* deref *transform now to avoid it in loop */ | |
163 const float *igtbl_r = transform->input_gamma_table_r; | |
164 const float *igtbl_g = transform->input_gamma_table_g; | |
165 const float *igtbl_b = transform->input_gamma_table_b; | |
166 | |
167 /* deref *transform now to avoid it in loop */ | |
168 const uint8_t *otdata_r = &transform->output_table_r->data[0]; | |
169 const uint8_t *otdata_g = &transform->output_table_g->data[0]; | |
170 const uint8_t *otdata_b = &transform->output_table_b->data[0]; | |
171 | |
172 /* input matrix values never change */ | |
173 const __m128 mat0 = _mm_load_ps(mat[0]); | |
174 const __m128 mat1 = _mm_load_ps(mat[1]); | |
175 const __m128 mat2 = _mm_load_ps(mat[2]); | |
176 | |
177 /* these values don't change, either */ | |
178 const __m128 max = _mm_load_ps(clampMaxValueX4); | |
179 const __m128 min = _mm_setzero_ps(); | |
180 const __m128 scale = _mm_load_ps(floatScaleX4); | |
181 | |
182 /* working variables */ | |
183 __m128 vec_r, vec_g, vec_b, result; | |
184 unsigned char alpha; | |
185 | |
186 /* CYA */ | |
187 if (!length) | |
188 return; | |
189 | |
190 /* one pixel is handled outside of the loop */ | |
191 length--; | |
192 | |
193 /* setup for transforming 1st pixel */ | |
194 vec_r = _mm_load_ss(&igtbl_r[src[0]]); | |
195 vec_g = _mm_load_ss(&igtbl_g[src[1]]); | |
196 vec_b = _mm_load_ss(&igtbl_b[src[2]]); | |
197 alpha = src[3]; | |
198 src += 4; | |
199 | |
200 /* transform all but final pixel */ | |
201 | |
202 for (i=0; i<length; i++) | |
203 { | |
204 /* position values from gamma tables */ | |
205 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | |
206 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | |
207 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | |
208 | |
209 /* gamma * matrix */ | |
210 vec_r = _mm_mul_ps(vec_r, mat0); | |
211 vec_g = _mm_mul_ps(vec_g, mat1); | |
212 vec_b = _mm_mul_ps(vec_b, mat2); | |
213 | |
214 /* store alpha for this pixel; load alpha for next */ | |
215 dest[3] = alpha; | |
216 alpha = src[3]; | |
217 | |
218 /* crunch, crunch, crunch */ | |
219 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | |
220 vec_r = _mm_max_ps(min, vec_r); | |
221 vec_r = _mm_min_ps(max, vec_r); | |
222 result = _mm_mul_ps(vec_r, scale); | |
223 | |
224 /* store calc'd output tables indices */ | |
225 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | |
226 | |
227 /* load gamma values for next loop while store completes */ | |
228 vec_r = _mm_load_ss(&igtbl_r[src[0]]); | |
229 vec_g = _mm_load_ss(&igtbl_g[src[1]]); | |
230 vec_b = _mm_load_ss(&igtbl_b[src[2]]); | |
231 src += 4; | |
232 | |
233 /* use calc'd indices to output RGB values */ | |
234 dest[0] = otdata_r[output[0]]; | |
235 dest[1] = otdata_g[output[1]]; | |
236 dest[2] = otdata_b[output[2]]; | |
237 dest += 4; | |
238 } | |
239 | |
240 /* handle final (maybe only) pixel */ | |
241 | |
242 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | |
243 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | |
244 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | |
245 | |
246 vec_r = _mm_mul_ps(vec_r, mat0); | |
247 vec_g = _mm_mul_ps(vec_g, mat1); | |
248 vec_b = _mm_mul_ps(vec_b, mat2); | |
249 | |
250 dest[3] = alpha; | |
251 | |
252 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | |
253 vec_r = _mm_max_ps(min, vec_r); | |
254 vec_r = _mm_min_ps(max, vec_r); | |
255 result = _mm_mul_ps(vec_r, scale); | |
256 | |
257 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | |
258 | |
259 dest[0] = otdata_r[output[0]]; | |
260 dest[1] = otdata_g[output[1]]; | |
261 dest[2] = otdata_b[output[2]]; | |
262 } | |
OLD | NEW |