Snap for 11841552 from 8298efe1a467070e13ad1b5c22dd51c520998df4 to sdk-releasesdk-release

Change-Id: I7f5f673d201ea05400b33da2af2fa0fcba4dd3d3
author: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2024-05-14 23:05:05 +0000
committer: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2024-05-14 23:05:05 +0000
commit: 8cd273c385df63e4ecf5caf000eb079f6032d255 (patch)
tree: b2fa22f7ab749029ed8757fbbceb2498d02d992b
parent: 228f7659121d25e44213779443a5ffc660449dd6 (diff)
parent: 8298efe1a467070e13ad1b5c22dd51c520998df4 (diff)
download: libaom-sdk-release.tar.gz
3 files changed, 136 insertions, 40 deletions
diff --git a/README.android b/README.android
index 02616e0ff..54d2961a6 100644
--- a/README.android
+++ b/README.android
@@ -46,3 +46,6 @@ Tools needed to build libaom:
 
 Generate config files that contain the source list for each platform.
 A list of prerequisites is at the top of generate_config.sh.
+
+Cherry-picks:
+f1b43b5c0d {,highbd_}intrapred_neon.c: Avoid over-reads in z1 and z3 preds
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index dc47974c6..e66f523e3 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -1293,6 +1293,33 @@ static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
       highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
 }
 
+// clang-format off
+static const uint8_t kLoadMaxShuffles[] = {
+  14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
+  12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
+  10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
+   8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
+   6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15,
+   4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15,
+   2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+};
+// clang-format on
+
+static INLINE uint16x8_t zn_load_masked_neon(const uint16_t *ptr,
+                                             int shuffle_idx) {
+  uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
+  uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr));
+#if AOM_ARCH_AARCH64
+  return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle));
+#else
+  uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
+  uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
+  uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
+  return vreinterpretq_u16_u8(vcombine_u8(lo, hi));
+#endif
+}
+
 static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
                                                    ptrdiff_t stride, int bw,
                                                    int bh,
@@ -1336,13 +1363,26 @@ static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
     } else {
       int c = 0;
       do {
-        const uint16x8_t a0 = vld1q_u16(&above[base + c]);
-        const uint16x8_t a1 = vld1q_u16(&above[base + c + 1]);
-        const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a0, a1, shift);
-        const uint16x8_t cmp =
-            vcgtq_s16(vdupq_n_s16(max_base_x - base - c), iota1x8);
-        const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
-        vst1q_u16(dst + c, res);
+        uint16x8_t a0;
+        uint16x8_t a1;
+        if (base + c >= max_base_x) {
+          a0 = a1 = vdupq_n_u16(above_max);
+        } else {
+          if (base + c + 7 >= max_base_x) {
+            int shuffle_idx = max_base_x - base - c;
+            a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
+          } else {
+            a0 = vld1q_u16(above + base + c);
+          }
+          if (base + c + 8 >= max_base_x) {
+            int shuffle_idx = max_base_x - base - c - 1;
+            a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
+          } else {
+            a1 = vld1q_u16(above + base + c + 1);
+          }
+        }
+
+        vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift));
         c += 8;
       } while (c < bw);
     }
@@ -2456,13 +2496,29 @@ void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
     val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane));     \
     uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
     val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane));    \
-    const uint16x8_t cmp = vaddq_u16((iota), vdupq_n_u16(base));          \
-    const uint16x8_t res = vcombine_u16(vrshrn_n_u32(val_lo, (shift)),    \
-                                        vrshrn_n_u32(val_hi, (shift)));   \
-    *(out) = vbslq_u16(vcltq_u16(cmp, vdupq_n_u16(max_base_y)), res,      \
-                       vdupq_n_u16(left_max));                            \
+    *(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)),                  \
+                          vrshrn_n_u32(val_hi, (shift)));                 \
   } while (0)
 
+static INLINE uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs,
+                                             int max_ofs) {
+  uint16x8_t r0;
+  uint16x8_t r1;
+  if (ofs + 7 >= max_ofs) {
+    int shuffle_idx = max_ofs - ofs;
+    r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
+  } else {
+    r0 = vld1q_u16(left0 + ofs);
+  }
+  if (ofs + 8 >= max_ofs) {
+    int shuffle_idx = max_ofs - ofs - 1;
+    r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
+  } else {
+    r1 = vld1q_u16(left0 + ofs + 1);
+  }
+  return (uint16x8x2_t){ { r0, r1 } };
+}
+
 static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
                                                    ptrdiff_t stride, int bw,
                                                    int bh, const uint16_t *left,
@@ -2561,34 +2617,30 @@ static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
         if (base0 >= max_base_y) {
           out[0] = vdupq_n_u16(left_max);
         } else {
-          const uint16x8_t l00 = vld1q_u16(left + base0);
-          const uint16x8_t l01 = vld1q_u16(left1 + base0);
-          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l00, l01,
-                                         shifts0, shifts1, 0, 6);
+          const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y);
+          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0],
+                                         l0.val[1], shifts0, shifts1, 0, 6);
         }
         if (base1 >= max_base_y) {
           out[1] = vdupq_n_u16(left_max);
         } else {
-          const uint16x8_t l10 = vld1q_u16(left + base1);
-          const uint16x8_t l11 = vld1q_u16(left1 + base1);
-          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l10, l11,
-                                         shifts0, shifts1, 1, 6);
+          const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y);
+          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0],
+                                         l1.val[1], shifts0, shifts1, 1, 6);
         }
         if (base2 >= max_base_y) {
           out[2] = vdupq_n_u16(left_max);
         } else {
-          const uint16x8_t l20 = vld1q_u16(left + base2);
-          const uint16x8_t l21 = vld1q_u16(left1 + base2);
-          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l20, l21,
-                                         shifts0, shifts1, 2, 6);
+          const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y);
+          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0],
+                                         l2.val[1], shifts0, shifts1, 2, 6);
         }
         if (base3 >= max_base_y) {
           out[3] = vdupq_n_u16(left_max);
         } else {
-          const uint16x8_t l30 = vld1q_u16(left + base3);
-          const uint16x8_t l31 = vld1q_u16(left1 + base3);
-          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l30, l31,
-                                         shifts0, shifts1, 3, 6);
+          const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y);
+          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0],
+                                         l3.val[1], shifts0, shifts1, 3, 6);
         }
         transpose_array_inplace_u16_4x8(out);
         for (int r2 = 0; r2 < 4; ++r2) {
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index c3716b3a7..2c99154fd 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -1356,6 +1356,41 @@ static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+// clang-format off
+static const uint8_t kLoadMaxShuffles[] = {
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+   9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+   8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+   7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15,
+   6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15,
+   5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
+   4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15,
+   3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15,
+   2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15,
+   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+};
+// clang-format on
+
+static INLINE uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
+                                             int shuffle_idx) {
+  uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
+  uint8x16_t src = vld1q_u8(ptr);
+#if AOM_ARCH_AARCH64
+  return vqtbl1q_u8(src, shuffle);
+#else
+  uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
+  uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
+  uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
+  return vcombine_u8(lo, hi);
+#endif
+}
+
 static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above, int dx) {
   const int frac_bits = 6;
@@ -1369,7 +1404,6 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
 
   const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
-  const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x);
 
   int x = dx;
   for (int r = 0; r < N; r++, dst += stride) {
@@ -1391,12 +1425,24 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                                vcreate_u8(0x0F0E0D0C0B0A0908)));
 
     for (int j = 0; j < 64; j += 16) {
-      int mdif = max_base_x - (base + j);
-      if (mdif <= 0) {
+      if (base + j >= max_base_x) {
         vst1q_u8(dst + j, a_mbase_x);
       } else {
-        uint8x16_t a0_128 = vld1q_u8(above + base + j);
-        uint8x16_t a1_128 = vld1q_u8(above + base + 1 + j);
+        uint8x16_t a0_128;
+        uint8x16_t a1_128;
+        if (base + j + 15 >= max_base_x) {
+          int shuffle_idx = max_base_x - base - j;
+          a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
+        } else {
+          a0_128 = vld1q_u8(above + base + j);
+        }
+        if (base + j + 16 >= max_base_x) {
+          int shuffle_idx = max_base_x - base - j - 1;
+          a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
+        } else {
+          a1_128 = vld1q_u8(above + base + j + 1);
+        }
+
         uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
         uint16x8_t diff_hi =
             vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
@@ -1406,13 +1452,8 @@ static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
             vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
         uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
         uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
-        uint8x16_t v_temp =
-            vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
-
-        uint8x16_t mask128 =
-            vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), vdupq_n_u8(0));
-        uint8x16_t res128 = vbslq_u8(mask128, v_temp, a_mbase_x);
-        vst1q_u8(dst + j, res128);
+        vst1q_u8(dst + j,
+                 vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)));
 
         base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
       }
author	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2024-05-14 23:05:05 +0000
committer	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2024-05-14 23:05:05 +0000
commit	8cd273c385df63e4ecf5caf000eb079f6032d255 (patch)
tree	b2fa22f7ab749029ed8757fbbceb2498d02d992b
parent	228f7659121d25e44213779443a5ffc660449dd6 (diff)
parent	8298efe1a467070e13ad1b5c22dd51c520998df4 (diff)
download	libaom-sdk-release.tar.gz