| /* |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * under the License. |
| */ |
| |
| package org.apache.commons.sanselan.formats.jpeg.decoder; |
| |
| public class Dct |
| { |
| /* |
| * The book "JPEG still image data compression standard", |
| * by Pennebaker and Mitchell, Chapter 4, discusses a number of |
| * approaches to the fast DCT. Here's the cost, exluding modified |
| * (de)quantization, for transforming an 8x8 block: |
| * |
| * Algorithm Adds Multiplies RightShifts Total |
| * Naive 896 1024 0 1920 |
| * "Symmetries" 448 224 0 672 |
| * Vetterli and 464 208 0 672 |
| * Ligtenberg |
| * Arai, Agui and 464 80 0 544 |
| * Nakajima (AA&N) |
| * Feig 8x8 462 54 6 522 |
| * Fused mul/add 416 |
| * (a pipe dream) |
| * |
| * IJG's libjpeg, FFmpeg, and a number of others use AA&N. |
| * |
| * It would appear that Feig does 4-5% less operations, and |
| * multiplications are reduced from 80 in AA&N to only 54. |
| * But in practice: |
| * |
| * Benchmarks, Intel Core i3 @ 2.93 GHz in long mode, 4 GB RAM |
| * Time taken to do 100 million IDCTs (less is better): |
| * Rene' Stöckel's Feig, int: 45.07 seconds |
| * My Feig, floating point: 36.252 seconds |
| * AA&N, unrolled loops, double[][] -> double[][]: 25.167 seconds |
| * |
| * Clearly Feig is hopeless. I suspect the performance killer is simply |
| * the weight of the algorithm: massive number of local variables, |
| * large code size, and lots of random array accesses. |
| * |
| * Also, AA&N can be optimized a lot: |
| * AA&N, rolled loops, double[][] -> double[][]: 21.162 seconds |
| * AA&N, rolled loops, float[][] -> float[][]: no improvement, but |
| * at some stage Hotspot might start doing SIMD, so let's use float |
| * AA&N, rolled loops, float[] -> float[][]: 19.979 seconds |
| * apparently 2D arrays are slow! |
| * AA&N, rolled loops, inlined 1D AA&N transform, float[] |
| * transformed in-place: 18.5 seconds |
| * AA&N, previous version rewritten in C and compiled with "gcc -O3" |
| * takes: 8.5 seconds (probably due to heavy use of SIMD) |
| * |
| * Other brave attempts: |
| * AA&N, best float version converted to 16:16 fixed point: 23.923 seconds |
| * |
| * Anyway the best float version stays. |
| * 18.5 seconds = 5.4 million transforms per second per core :-) |
| */ |
| |
| private static final float[] dctScalingFactors = |
| { |
| (float)(0.5 / Math.sqrt(2.0)), |
| (float)(0.25 / Math.cos(Math.PI/16.0)), |
| (float)(0.25 / Math.cos(2.0*Math.PI/16.0)), |
| (float)(0.25 / Math.cos(3.0*Math.PI/16.0)), |
| (float)(0.25 / Math.cos(4.0*Math.PI/16.0)), |
| (float)(0.25 / Math.cos(5.0*Math.PI/16.0)), |
| (float)(0.25 / Math.cos(6.0*Math.PI/16.0)), |
| (float)(0.25 / Math.cos(7.0*Math.PI/16.0)), |
| }; |
| |
| private static final float[] idctScalingFactors = |
| { |
| (float)(2.0 * 4.0 / Math.sqrt(2.0) * 0.0625), |
| (float)(4.0 * Math.cos(Math.PI/16.0) * 0.125), |
| (float)(4.0 * Math.cos(2.0*Math.PI/16.0) * 0.125), |
| (float)(4.0 * Math.cos(3.0*Math.PI/16.0) * 0.125), |
| (float)(4.0 * Math.cos(4.0*Math.PI/16.0) * 0.125), |
| (float)(4.0 * Math.cos(5.0*Math.PI/16.0) * 0.125), |
| (float)(4.0 * Math.cos(6.0*Math.PI/16.0) * 0.125), |
| (float)(4.0 * Math.cos(7.0*Math.PI/16.0) * 0.125), |
| }; |
| |
| private static final float A1 = (float)(Math.cos(2.0*Math.PI/8.0)); |
| private static final float A2 = (float)(Math.cos(Math.PI/8.0) - Math.cos(3.0*Math.PI/8.0)); |
| private static final float A3 = A1; |
| private static final float A4 = (float)(Math.cos(Math.PI/8.0) + Math.cos(3.0*Math.PI/8.0)); |
| private static final float A5 = (float)(Math.cos(3.0*Math.PI/8.0)); |
| |
| private static final float C2 = (float)(2.0 * Math.cos(Math.PI/8)); |
| private static final float C4 = (float)(2.0 * Math.cos(2*Math.PI/8)); |
| private static final float C6 = (float)(2.0 * Math.cos(3*Math.PI/8)); |
| private static final float Q = C2 - C6; |
| private static final float R = C2 + C6; |
| |
| public static void scaleQuantizationVector(float[] vector) |
| { |
| for (int x = 0; x < 8; x++) |
| { |
| vector[x] *= dctScalingFactors[x]; |
| } |
| } |
| |
| public static void scaleDequantizationVector(float[] vector) |
| { |
| for (int x = 0; x < 8; x++) |
| { |
| vector[x] *= idctScalingFactors[x]; |
| } |
| } |
| |
| public static void scaleQuantizationMatrix(float[] matrix) |
| { |
| for (int y = 0; y < 8; y++) |
| { |
| for (int x = 0; x < 8; x++) |
| { |
| matrix[8 * y + x] *= dctScalingFactors[y] * dctScalingFactors[x]; |
| } |
| } |
| } |
| |
| public static void scaleDequantizationMatrix(float[] matrix) |
| { |
| for (int y = 0; y < 8; y++) |
| { |
| for (int x = 0; x < 8; x++) |
| { |
| matrix[8 * y + x] *= idctScalingFactors[y] * idctScalingFactors[x]; |
| } |
| } |
| } |
| |
| /** |
| * Fast forward Dct using AA&N. |
| * Taken from the book "JPEG still image data compression standard", |
| * by Pennebaker and Mitchell, chapter 4, figure "4-8". |
| */ |
| public static void forwardDCT8(float[] vector) |
| { |
| float a00 = vector[0] + vector[7]; |
| float a10 = vector[1] + vector[6]; |
| float a20 = vector[2] + vector[5]; |
| float a30 = vector[3] + vector[4]; |
| float a40 = vector[3] - vector[4]; |
| float a50 = vector[2] - vector[5]; |
| float a60 = vector[1] - vector[6]; |
| float a70 = vector[0] - vector[7]; |
| |
| float a01 = a00 + a30; |
| float a11 = a10 + a20; |
| float a21 = a10 - a20; |
| float a31 = a00 - a30; |
| // Avoid some negations: |
| // float a41 = -a40 - a50; |
| float neg_a41 = a40 + a50; |
| float a51 = a50 + a60; |
| float a61 = a60 + a70; |
| |
| float a22 = a21 + a31; |
| |
| float a23 = a22*A1; |
| float mul5 = (a61 - neg_a41)*A5; |
| float a43 = neg_a41*A2 - mul5; |
| float a53 = a51*A3; |
| float a63 = a61*A4 - mul5; |
| |
| float a54 = a70 + a53; |
| float a74 = a70 - a53; |
| |
| vector[0] = a01 + a11; |
| vector[4] = a01 - a11; |
| vector[2] = a31 + a23; |
| vector[6] = a31 - a23; |
| vector[5] = a74 + a43; |
| vector[1] = a54 + a63; |
| vector[7] = a54 - a63; |
| vector[3] = a74 - a43; |
| } |
| |
| public static void forwardDCT8x8(float[] matrix) |
| { |
| float a00, a10, a20, a30, a40, a50, a60, a70; |
| float a01, a11, a21, a31, neg_a41, a51, a61; |
| float a22, a23, mul5, a43, a53, a63; |
| float a54, a74; |
| |
| for (int i = 0; i < 8; i++) |
| { |
| a00 = matrix[8*i] + matrix[8*i + 7]; |
| a10 = matrix[8*i + 1] + matrix[8*i + 6]; |
| a20 = matrix[8*i + 2] + matrix[8*i + 5]; |
| a30 = matrix[8*i + 3] + matrix[8*i + 4]; |
| a40 = matrix[8*i + 3] - matrix[8*i + 4]; |
| a50 = matrix[8*i + 2] - matrix[8*i + 5]; |
| a60 = matrix[8*i + 1] - matrix[8*i + 6]; |
| a70 = matrix[8*i] - matrix[8*i + 7]; |
| a01 = a00 + a30; |
| a11 = a10 + a20; |
| a21 = a10 - a20; |
| a31 = a00 - a30; |
| neg_a41 = a40 + a50; |
| a51 = a50 + a60; |
| a61 = a60 + a70; |
| a22 = a21 + a31; |
| a23 = a22*A1; |
| mul5 = (a61 - neg_a41)*A5; |
| a43 = neg_a41*A2 - mul5; |
| a53 = a51*A3; |
| a63 = a61*A4 - mul5; |
| a54 = a70 + a53; |
| a74 = a70 - a53; |
| matrix[8*i] = a01 + a11; |
| matrix[8*i + 4] = a01 - a11; |
| matrix[8*i + 2] = a31 + a23; |
| matrix[8*i + 6] = a31 - a23; |
| matrix[8*i + 5] = a74 + a43; |
| matrix[8*i + 1] = a54 + a63; |
| matrix[8*i + 7] = a54 - a63; |
| matrix[8*i + 3] = a74 - a43; |
| } |
| |
| for (int i = 0; i < 8; i++) |
| { |
| a00 = matrix[i] + matrix[56 + i]; |
| a10 = matrix[8 + i] + matrix[48 + i]; |
| a20 = matrix[16 + i] + matrix[40 + i]; |
| a30 = matrix[24 + i] + matrix[32 + i]; |
| a40 = matrix[24 + i] - matrix[32 + i]; |
| a50 = matrix[16 + i] - matrix[40 + i]; |
| a60 = matrix[8 + i] - matrix[48 + i]; |
| a70 = matrix[i] - matrix[56 + i]; |
| a01 = a00 + a30; |
| a11 = a10 + a20; |
| a21 = a10 - a20; |
| a31 = a00 - a30; |
| neg_a41 = a40 + a50; |
| a51 = a50 + a60; |
| a61 = a60 + a70; |
| a22 = a21 + a31; |
| a23 = a22*A1; |
| mul5 = (a61 - neg_a41)*A5; |
| a43 = neg_a41*A2 - mul5; |
| a53 = a51*A3; |
| a63 = a61*A4 - mul5; |
| a54 = a70 + a53; |
| a74 = a70 - a53; |
| matrix[i] = a01 + a11; |
| matrix[32 + i] = a01 - a11; |
| matrix[16 + i] = a31 + a23; |
| matrix[48 + i] = a31 - a23; |
| matrix[40 + i] = a74 + a43; |
| matrix[8 + i] = a54 + a63; |
| matrix[56 + i] = a54 - a63; |
| matrix[24 + i] = a74 - a43; |
| } |
| } |
| |
| /** |
| * Fast inverse Dct using AA&N. This is taken from the beautiful |
| * http://vsr.informatik.tu-chemnitz.de/~jan/MPEG/HTML/IDCT.html |
| * which gives easy equations and properly explains constants and |
| * scaling factors. Terms have been inlined and the negation |
| * optimized out of existence. |
| */ |
| public static void inverseDCT8(float[] vector) |
| { |
| // B1 |
| float a2 = vector[2] - vector[6]; |
| float a3 = vector[2] + vector[6]; |
| float a4 = vector[5] - vector[3]; |
| float tmp1 = vector[1] + vector[7]; |
| float tmp2 = vector[3] + vector[5]; |
| float a5 = tmp1 - tmp2; |
| float a6 = vector[1] - vector[7]; |
| float a7 = tmp1 + tmp2; |
| |
| // M |
| float tmp4 = C6*(a4 + a6); |
| // Eliminate the negative: |
| // float b4 = -Q*a4 - tmp4; |
| float neg_b4 = Q*a4 + tmp4; |
| float b6 = R*a6 - tmp4; |
| float b2 = a2*C4; |
| float b5 = a5*C4; |
| |
| // A1 |
| float tmp3 = b6 - a7; |
| float n0 = tmp3 - b5; |
| float n1 = vector[0] - vector[4]; |
| float n2 = b2 - a3; |
| float n3 = vector[0] + vector[4]; |
| float neg_n5 = neg_b4; |
| |
| // A2 |
| float m3 = n1 + n2; |
| float m4 = n3 + a3; |
| float m5 = n1 - n2; |
| float m6 = n3 - a3; |
| // float m7 = n5 - n0; |
| float neg_m7 = neg_n5 + n0; |
| |
| // A3 |
| vector[0] = m4 + a7; |
| vector[1] = m3 + tmp3; |
| vector[2] = m5 - n0; |
| vector[3] = m6 + neg_m7; |
| vector[4] = m6 - neg_m7; |
| vector[5] = m5 + n0; |
| vector[6] = m3 - tmp3; |
| vector[7] = m4 - a7; |
| } |
| |
| public static void inverseDCT8x8(float[] matrix) |
| { |
| float a2, a3, a4, tmp1, tmp2, a5, a6, a7; |
| float tmp4, neg_b4, b6, b2, b5; |
| float tmp3, n0, n1, n2, n3, neg_n5; |
| float m3, m4, m5, m6, neg_m7; |
| |
| for (int i = 0; i < 8; i++) |
| { |
| a2 = matrix[8*i + 2] - matrix[8*i + 6]; |
| a3 = matrix[8*i + 2] + matrix[8*i + 6]; |
| a4 = matrix[8*i + 5] - matrix[8*i + 3]; |
| tmp1 = matrix[8*i + 1] + matrix[8*i + 7]; |
| tmp2 = matrix[8*i + 3] + matrix[8*i + 5]; |
| a5 = tmp1 - tmp2; |
| a6 = matrix[8*i + 1] - matrix[8*i + 7]; |
| a7 = tmp1 + tmp2; |
| tmp4 = C6*(a4 + a6); |
| neg_b4 = Q*a4 + tmp4; |
| b6 = R*a6 - tmp4; |
| b2 = a2*C4; |
| b5 = a5*C4; |
| tmp3 = b6 - a7; |
| n0 = tmp3 - b5; |
| n1 = matrix[8*i] - matrix[8*i + 4]; |
| n2 = b2 - a3; |
| n3 = matrix[8*i] + matrix[8*i + 4]; |
| neg_n5 = neg_b4; |
| m3 = n1 + n2; |
| m4 = n3 + a3; |
| m5 = n1 - n2; |
| m6 = n3 - a3; |
| neg_m7 = neg_n5 + n0; |
| matrix[8*i] = m4 + a7; |
| matrix[8*i + 1] = m3 + tmp3; |
| matrix[8*i + 2] = m5 - n0; |
| matrix[8*i + 3] = m6 + neg_m7; |
| matrix[8*i + 4] = m6 - neg_m7; |
| matrix[8*i + 5] = m5 + n0; |
| matrix[8*i + 6] = m3 - tmp3; |
| matrix[8*i + 7] = m4 - a7; |
| } |
| |
| for (int i = 0; i < 8; i++) |
| { |
| a2 = matrix[16 + i] - matrix[48 + i]; |
| a3 = matrix[16 + i] + matrix[48 + i]; |
| a4 = matrix[40 + i] - matrix[24 + i]; |
| tmp1 = matrix[8 + i] + matrix[56 + i]; |
| tmp2 = matrix[24 + i] + matrix[40 + i]; |
| a5 = tmp1 - tmp2; |
| a6 = matrix[8 + i] - matrix[56 + i]; |
| a7 = tmp1 + tmp2; |
| tmp4 = C6*(a4 + a6); |
| neg_b4 = Q*a4 + tmp4; |
| b6 = R*a6 - tmp4; |
| b2 = a2*C4; |
| b5 = a5*C4; |
| tmp3 = b6 - a7; |
| n0 = tmp3 - b5; |
| n1 = matrix[i] - matrix[32 + i]; |
| n2 = b2 - a3; |
| n3 = matrix[i] + matrix[32 + i]; |
| neg_n5 = neg_b4; |
| m3 = n1 + n2; |
| m4 = n3 + a3; |
| m5 = n1 - n2; |
| m6 = n3 - a3; |
| neg_m7 = neg_n5 + n0; |
| matrix[i] = m4 + a7; |
| matrix[8 + i] = m3 + tmp3; |
| matrix[16 + i] = m5 - n0; |
| matrix[24 + i] = m6 + neg_m7; |
| matrix[32 + i] = m6 - neg_m7; |
| matrix[40 + i] = m5 + n0; |
| matrix[48 + i] = m3 - tmp3; |
| matrix[56 + i] = m4 - a7; |
| } |
| } |
| } |