헤비 프래그먼트 셰이더의 성능 최적화

다음 쉐이더 세트를 최적화하는 데 도움이 필요합니다.

정점 셰이더 (Vertex):

    precision mediump float;

// Vertex stage of the scaler: a pure pass-through. The incoming position is
// written to gl_Position unchanged and the texture coordinate is forwarded to
// the fragment stage, where all neighbourhood sampling takes place.
//
// rubyTextureSize is intentionally not declared here: this stage never reads
// it. The fragment shader declares and uses it, so the uniform remains part
// of the linked program.

attribute vec4 vPosition;        // position, assigned to gl_Position as-is
attribute vec2 a_TexCoordinate;  // texture coordinate of this vertex

// Interpolated texture coordinate consumed by the fragment shader.
varying vec2 tc;

void main() {
    gl_Position = vPosition;

    tc = a_TexCoordinate;
}

프래그먼트 셰이더 (Fragment):

// Default float precision for the declarations and expressions below.
precision mediump float;

/*
 Uniforms
 - rubyTexture: texture sampler
 - rubyTextureSize: size of the texture before rendering; main() multiplies
   it with tc to find the position inside the current source texel
 - rubyTextureFract: per-axis step that main() adds to tc to reach the
   neighbouring texels (presumably 1.0 / rubyTextureSize -- TODO confirm
   against the host code)
 */

uniform sampler2D rubyTexture;
uniform vec2 rubyTextureSize;
uniform vec2 rubyTextureFract;

/*
 Varyings
 - tc: coordinate of the texel being processed
 NOTE: the packed xyp_[]_[]_[] neighbour coordinates are not varyings in this
 shader; main() computes them as locals from tc and rubyTextureFract.
 */

varying vec2 tc;

/*
 Constants
 */
/*
 Inequation coefficients for interpolation
 Equations are in the form: Ay + Bx = C
 The suffixes 45, 30 and 60 denote the angle (from the x axis) of the line
 that each coefficient set describes.
 main() evaluates all components of these vec4s at once, one per texel corner.
 */
const vec4 Ai = vec4(1.0, -1.0, -1.0, 1.0);
const vec4 B45 = vec4(1.0, 1.0, -1.0, -1.0);
const vec4 C45 = vec4(1.5, 0.5, -0.5, 0.5);
const vec4 B30 = vec4(0.5, 2.0, -0.5, -2.0);
const vec4 C30 = vec4(1.0, 1.0, -0.5, 0.0);
const vec4 B60 = vec4(2.0, 0.5, -2.0, -0.5);
const vec4 C60 = vec4(2.0, 0.0, -1.0, 0.5);

// Half-widths of the smoothstep windows: main() blends between C - M and
// C + M for the matching angle.
const vec4 M45 = vec4(0.4, 0.4, 0.4, 0.4);
const vec4 M30 = vec4(0.2, 0.4, 0.2, 0.4);
const vec4 M60 = M30.yxwz;
// Offset added to both edges of the 45-degree window when main() computes
// the "marn" blend amount.
const vec4 Mshift = vec4(0.2);

// Coefficient for weighted edge detection (scales e30 / e60 before they are
// compared against each other in main())
const float coef = 2.0;
// Threshold below which two luminance values are treated as "equal"
const vec4 threshold = vec4(0.32);

// Conversion from RGB to Luminance (from GIMP)
const vec3 lum = vec3(0.21, 0.72, 0.07);

// Component-wise logical AND of two boolean vectors.
// Both operands are converted to 0.0 / 1.0 floats; a product is non-zero only
// when both inputs are true, and converting back maps non-zero to true.
bvec4 _and_(bvec4 A, bvec4 B) {
    return bvec4(vec4(A) * vec4(B));
}

// Component-wise logical OR of two boolean vectors.
// Both operands are converted to 0.0 / 1.0 floats; a sum is non-zero as soon
// as either input is true, and converting back maps non-zero to true.
bvec4 _or_(bvec4 A, bvec4 B) {
    return bvec4(vec4(A) + vec4(B));
}

// Packs the luminance of four RGB colours into a single vec4
// (x from v0, y from v1, z from v2, w from v3).
// The colours are laid out as matrix columns per channel, so one
// matrix-vector product with the luminance weights yields all four weighted
// sums at once; the fourth column and the fourth weight are zero.
vec4 lum_to(vec3 v0, vec3 v1, vec3 v2, vec3 v3) {
    vec4 reds = vec4(v0.x, v1.x, v2.x, v3.x);
    vec4 greens = vec4(v0.y, v1.y, v2.y, v3.y);
    vec4 blues = vec4(v0.z, v1.z, v2.z, v3.z);

    return mat4(reds, greens, blues, vec4(0.0)) * vec4(lum, 0.0);
}

// Absolute per-component difference between two packed luminance vectors.
vec4 lum_df(vec4 A, vec4 B) {
    vec4 delta = A - B;
    return abs(delta);
}

// Per-component "equality" test for packed luminance values: a component is
// true when the absolute difference is strictly below the threshold constant.
bvec4 lum_eq(vec4 A, vec4 B) {
    vec4 distance = abs(A - B);
    return lessThan(distance, threshold);
}

// Weighted sum of luminance differences, evaluated per component:
// |a-b| + |a-c| + |d-e| + |d-f| + 4 * |g-h|.
// main() uses the result as an edge weight.
vec4 lum_wd(vec4 a, vec4 b, vec4 c, vec4 d, vec4 e, vec4 f, vec4 g, vec4 h) {
    vec4 weight = lum_df(a, b);
    weight += lum_df(a, c);
    weight += lum_df(d, e);
    weight += lum_df(d, f);
    // The g/h pair counts four times as much as each of the other pairs.
    weight += 4.0 * lum_df(g, h);
    return weight;
}

// Distance between two RGB colours: the sum of the absolute differences of
// the three channels.
float c_df(vec3 c1, vec3 c2) {
    vec3 delta = abs(c1 - c2);
    return delta.x + delta.y + delta.z;
}

// Fragment entry point of the scaler.
// Samples the 21-texel neighbourhood around tc, evaluates the 45/30/60-degree
// edge rules for all four corners of the centre texel at once (one vec4
// component per corner) and blends the centre colour with its neighbours
// according to the position of the fragment inside the texel.
void main() {

    /*
     Mask for algorithm (P12 is the texel being processed)
     +-----+-----+-----+-----+-----+
     |     |  1  |  2  |  3  |     |
     +-----+-----+-----+-----+-----+
     |  5  |  6  |  7  |  8  |  9  |
     +-----+-----+-----+-----+-----+
     | 10  | 11  | 12  | 13  | 14  |
     +-----+-----+-----+-----+-----+
     | 15  | 16  | 17  | 18  | 19  |
     +-----+-----+-----+-----+-----+
     |     | 21  | 22  | 23  |     |
     +-----+-----+-----+-----+-----+
     */

    // Step to the neighbouring texel along each axis
    float x = rubyTextureFract.x;
    float y = rubyTextureFract.y;

    // Packed neighbour coordinates.
    // Row packs: .xyz hold the three x coordinates, .w the shared y.
    vec4 xyp_1_2_3 = tc.xxxy + vec4(-x, 0.0, x, -2.0 * y);
    vec4 xyp_6_7_8 = tc.xxxy + vec4(-x, 0.0, x, -y);
    vec4 xyp_11_12_13 = tc.xxxy + vec4(-x, 0.0, x, 0.0);
    vec4 xyp_16_17_18 = tc.xxxy + vec4(-x, 0.0, x, y);
    vec4 xyp_21_22_23 = tc.xxxy + vec4(-x, 0.0, x, 2.0 * y);
    // Column packs: .x holds the shared x, .yzw the three y coordinates.
    vec4 xyp_5_10_15 = tc.xyyy + vec4(-2.0 * x, -y, 0.0, y);
    vec4 xyp_9_14_19 = tc.xyyy + vec4(2.0 * x, -y, 0.0, y);

    // Get mask values by performing texture lookup with the uniform sampler
    vec3 P1 = texture2D(rubyTexture, xyp_1_2_3.xw).rgb;
    vec3 P2 = texture2D(rubyTexture, xyp_1_2_3.yw).rgb;
    vec3 P3 = texture2D(rubyTexture, xyp_1_2_3.zw).rgb;

    vec3 P6 = texture2D(rubyTexture, xyp_6_7_8.xw).rgb;
    vec3 P7 = texture2D(rubyTexture, xyp_6_7_8.yw).rgb;
    vec3 P8 = texture2D(rubyTexture, xyp_6_7_8.zw).rgb;

    vec3 P11 = texture2D(rubyTexture, xyp_11_12_13.xw).rgb;
    vec3 P12 = texture2D(rubyTexture, xyp_11_12_13.yw).rgb;
    vec3 P13 = texture2D(rubyTexture, xyp_11_12_13.zw).rgb;

    vec3 P16 = texture2D(rubyTexture, xyp_16_17_18.xw).rgb;
    vec3 P17 = texture2D(rubyTexture, xyp_16_17_18.yw).rgb;
    vec3 P18 = texture2D(rubyTexture, xyp_16_17_18.zw).rgb;

    vec3 P21 = texture2D(rubyTexture, xyp_21_22_23.xw).rgb;
    vec3 P22 = texture2D(rubyTexture, xyp_21_22_23.yw).rgb;
    vec3 P23 = texture2D(rubyTexture, xyp_21_22_23.zw).rgb;

    vec3 P5 = texture2D(rubyTexture, xyp_5_10_15.xy).rgb;
    vec3 P10 = texture2D(rubyTexture, xyp_5_10_15.xz).rgb;
    vec3 P15 = texture2D(rubyTexture, xyp_5_10_15.xw).rgb;

    vec3 P9 = texture2D(rubyTexture, xyp_9_14_19.xy).rgb;
    vec3 P14 = texture2D(rubyTexture, xyp_9_14_19.xz).rgb;
    vec3 P19 = texture2D(rubyTexture, xyp_9_14_19.xw).rgb;

    // Store luminance values of each point in groups of 4
    // so that we may operate on all four corners at once.
    // The swizzled copies reuse an already computed group in rotated order.
    vec4 p7 = lum_to(P7, P11, P17, P13);
    vec4 p8 = lum_to(P8, P6, P16, P18);
    vec4 p11 = p7.yzwx; // P11, P17, P13, P7
    vec4 p12 = lum_to(P12, P12, P12, P12);
    vec4 p13 = p7.wxyz; // P13, P7,  P11, P17
    vec4 p14 = lum_to(P14, P2, P10, P22);
    vec4 p16 = p8.zwxy; // P16, P18, P8,  P6
    vec4 p17 = p7.zwxy; // P17, P13, P7,  P11
    vec4 p18 = p8.wxyz; // P18, P8,  P6,  P16
    vec4 p19 = lum_to(P19, P3, P5, P21);
    vec4 p22 = p14.wxyz; // P22, P14, P2,  P10
    vec4 p23 = lum_to(P23, P9, P1, P15);

    // Position of this fragment inside the current source texel, in [0..1)
    vec2 fp = fract(tc * rubyTextureSize);

    // Determine amount of "smoothing" or mixing that could be done on texel corners
    vec4 AiMulFpy = Ai * fp.y;
    vec4 B45MulFpx = B45 * fp.x;
    vec4 ma45 = smoothstep(C45 - M45, C45 + M45, AiMulFpy + B45MulFpx);
    vec4 ma30 = smoothstep(C30 - M30, C30 + M30, AiMulFpy + B30 * fp.x);
    vec4 ma60 = smoothstep(C60 - M60, C60 + M60, AiMulFpy + B60 * fp.x);
    vec4 marn = smoothstep(C45 - M45 + Mshift, C45 + M45 + Mshift,
            AiMulFpy + B45MulFpx);

    // Perform edge weight calculations
    vec4 e45 = lum_wd(p12, p8, p16, p18, p22, p14, p17, p13);
    vec4 econt = lum_wd(p17, p11, p23, p13, p7, p19, p12, p18);
    vec4 e30 = lum_df(p13, p16);
    vec4 e60 = lum_df(p8, p17);

    // Calculate rule results for interpolation
    bvec4 r45_1 = _and_(notEqual(p12, p13), notEqual(p12, p17));
    bvec4 r45_2 = _and_(not (lum_eq(p13, p7)), not (lum_eq(p13, p8)));
    bvec4 r45_3 = _and_(not (lum_eq(p17, p11)), not (lum_eq(p17, p16)));
    bvec4 r45_4_1 = _and_(not (lum_eq(p13, p14)), not (lum_eq(p13, p19)));
    bvec4 r45_4_2 = _and_(not (lum_eq(p17, p22)), not (lum_eq(p17, p23)));
    bvec4 r45_4 = _and_(lum_eq(p12, p18), _or_(r45_4_1, r45_4_2));
    bvec4 r45_5 = _or_(lum_eq(p12, p16), lum_eq(p12, p8));
    bvec4 r45 = _and_(r45_1, _or_(_or_(_or_(r45_2, r45_3), r45_4), r45_5));
    bvec4 r30 = _and_(notEqual(p12, p16), notEqual(p11, p16));
    bvec4 r60 = _and_(notEqual(p12, p8), notEqual(p7, p8));

    // Combine rules with edge weights
    bvec4 edr45 = _and_(lessThan(e45, econt), r45);
    bvec4 edrrn = lessThanEqual(e45, econt);
    bvec4 edr30 = _and_(lessThanEqual(coef * e30, e60), r30);
    bvec4 edr60 = _and_(lessThanEqual(coef * e60, e30), r60);

    // Finalize interpolation rules and cast to float (0.0 for false, 1.0 for true)
    vec4 final45 = vec4(_and_(_and_(not (edr30), not (edr60)), edr45));
    vec4 final30 = vec4(_and_(_and_(edr45, not (edr60)), edr30));
    vec4 final60 = vec4(_and_(_and_(edr45, not (edr30)), edr60));
    vec4 final36 = vec4(_and_(_and_(edr60, edr30), edr45));
    vec4 finalrn = vec4(_and_(not (edr45), edrrn));

    // Determine the color to mix with for each corner: 1.0 selects the second
    // candidate of the corner (the one whose luminance is at least as close
    // to the centre texel), 0.0 selects the first
    vec4 px = step(lum_df(p12, p17), lum_df(p12, p13));

    // Determine the mix amounts by combining the final rule result and corresponding
    // mix amount for the rule in each corner
    vec4 mac = final36 * max(ma30, ma60) + final30 * ma30 + final60 * ma60
            + final45 * ma45 + finalrn * marn;

    // Blend colour of each corner. Both traversals below use the same four
    // colours, so they are computed once here.
    vec3 corner0 = mix(P13, P17, px.x);
    vec3 corner1 = mix(P7, P13, px.y);
    vec3 corner2 = mix(P11, P7, px.z);
    vec3 corner3 = mix(P17, P11, px.w);

    /*
     Calculate the resulting color by traversing clockwise and counter-clockwise around
     the corners of the texel

     Finally choose the result that has the largest difference from the texel's original
     color (res2 wins ties)
     */
    vec3 res1 = P12;
    res1 = mix(res1, corner0, mac.x);
    res1 = mix(res1, corner1, mac.y);
    res1 = mix(res1, corner2, mac.z);
    res1 = mix(res1, corner3, mac.w);

    vec3 res2 = P12;
    res2 = mix(res2, corner3, mac.w);
    res2 = mix(res2, corner2, mac.z);
    res2 = mix(res2, corner1, mac.y);
    res2 = mix(res2, corner0, mac.x);

    gl_FragColor = vec4(mix(res1, res2, step(c_df(P12, res1), c_df(P12, res2))),
            1.0);
}

이 셰이더는 2D 텍스처를 입력받아 고해상도 2D 표면(기기 화면)에 맞게 보기 좋게 확대하는 역할을 합니다. 참고로, 이것은 SABR 스케일링 알고리즘을 최적화한 구현입니다.

LG Nexus 4와 같은 고급 Android 기기에서는 이미 충분히 빠르게 동작하지만, 성능이 낮은 기기에서는 매우 느립니다.

저에게 중요한 Android 기기는 Mali 400MP GPU를 탑재한 Samsung Galaxy S2/S3인데, 이 GPU에서는 이 셰이더의 성능이 매우 나쁩니다.

지금까지 나는 시도했다 :

varying 제거 (ARM Mali 가이드의 조언) - 약간의 개선이 있었습니다.
mix() 함수를 직접 구현한 함수로 대체 - 효과가 없었습니다.
float 정밀도를 lowp로 낮추기 - 아무런 변화가 없었습니다.

렌더링 시간을 계산하여 성능을 측정합니다 (eglSwapBuffers 전후). 이것은 매우 선형적이고 일관된 성능 측정을 제공합니다.

그 외에도, 나는 어디에서 볼 것인지 또는 여기서 무엇을 최적화 할 수 있는지 모르겠습니다 ...

나는 이것이 무거운 알고리즘이라는 것을 알고 있으며, 어떤 대체 스케일링 방법을 사용할 지에 대한 조언을 요구하지 않습니다. 동일한 알고리즘을 최적화 된 방식으로 사용하고 싶습니다.

업데이트

모든 텍스처 페치를 종속 좌표 벡터 대신 상수 벡터로 수행하면 성능이 크게 향상된다는 것을 알았습니다. 따라서 이 부분이 큰 병목인 것이 분명하며, 아마도 캐시 때문일 것입니다. 하지만 그 페치들은 여전히 필요합니다. 적어도 일부 페치를 (스위즐링 없이) vec2 varying으로 수행해 보았지만 아무것도 개선되지 않았습니다. 21개의 텍셀을 효율적으로 샘플링하는 좋은 방법이 무엇인지 궁금합니다.
출력의 크기가 적어도 x2만큼 조정되고 GL_NEAREST로 폴링하기 때문에 계산의 주요 부분이 정확히 동일한 텍셀 세트로 여러 번 수행되고 있음을 알았습니다. 정확히 동일한 텍셀에 해당하는 조각이 4 개 이상 있습니다. 스케일링이 고해상도 장치에서 x4 인 경우 동일한 텍셀에 16 개의 조각이 있으며 이는 큰 낭비입니다. 여러 조각에서 변경되지 않는 모든 값을 계산하는 추가 셰이더 패스를 수행하는 방법이 있습니까? 추가 오프 스크린 텍스처로 렌더링하는 것에 대해 생각했지만 하나가 아닌 텍셀 당 여러 값을 저장해야합니다.

업데이트

또한 GPU가 큰 병목 현상을 일으키는 동안 CPU가 거의 사용되지 않는 것으로 나타났습니다. 이 상황에서 CPU 성능을 활용하고 GPU에서 CPU로 로직을 전송하는 방법에 대한 조언이 있습니까?

— SirKnigget
소스

프래그먼트 셰이더에서 텍스처 좌표를 계산해서 룩업하면 안 됩니다. 픽셀 셰이더가 텍스처를 미리 가져올 시간을 가질 수 있도록 정점 셰이더에서 uv를 전달하십시오.

— Tordin

설명해 주시겠습니까? uv은 무슨 뜻인가요?

— SirKnigget

"SABR 스케일링 알고리즘"에 대한 설명으로 링크 할 수 있습니까? Google은 이에 대한 유용한 정보를 찾지 못했습니다. 그건 그렇고, 모바일 GPU의 21 텍셀 필터 (그리고 수학적으로 무겁습니다)는 문제를 요구하고 있습니다. 어딘가에서 품질을 저하시키지 않으면 서 현실적으로 잘 작동 할 것으로 기대할 수 있다고 생각하지 않습니다.

— Nathan Reed

이것은 내가 찾은 정확한 구현은 아니지만 일반적인 아이디어를 제공합니다 : board.byuu.org/viewtopic.php?f=10&t=2248

— SirKnigget

현실적인 기대와 관련하여 고급 장치에서 효과적입니다. 나는 약 5 배 정도의 요소로 미세 조정하고 약한 장치에서 작동하도록 기대할 수 있습니다.

— SirKnigget

21 텍셀을 효율적으로 폴링하는 좋은 방법이 무엇인지 궁금합니다.

답은, 21개의 텍셀을 샘플링하지 않는 것이 효율적인 방법이라는 것입니다. 직설적으로 말해 죄송하지만, 모바일 기기에는 이러한 커널을 지원하는 데 필요한 버스 폭이 없을 수 있습니다. 샘플러에 바인딩된 텍스처의 크기를 줄여서, 캐시가 더 큰 커널 반경을 커버하도록 최적화해야 합니다.

또한 디스크 커널 대신, 수직 커널을 사용하는 패스와 수평 커널만 사용하는 패스로 이루어진 2패스 알고리즘을 사용할 수 있습니다. 이렇게 하면 "2D"에서 "1D"로 바뀌므로 샘플(탭) 수가 크게 줄어들고, 선형 액세스 덕분에 캐시 성능도 향상됩니다.

텍스처는 GPU 메모리에 Z-order(Z 순서 곡선) 방식으로 배치되어 있을 것이므로, 수직 방향 페치가 캐시 성능에 영향을 주지는 않을 것입니다. 참고: http://en.wikipedia.org/wiki/Z-order_curve

— v.oddou
소스