diff options
author | Yongqin Liu <yongqin.liu@linaro.org> | 2015-07-22 19:33:43 +0800 |
---|---|---|
committer | Yongqin Liu <yongqin.liu@linaro.org> | 2015-07-22 19:33:43 +0800 |
commit | 94006fea0784791772b70a46961c3e99459fbf19 (patch) | |
tree | 170a04c88111287f2d103cec86fda25f6cfe74cc | |
parent | 5c731541fe1352827cb36e0954035537a2d80726 (diff) | |
download | blitVH-test-94006fea0784791772b70a46961c3e99459fbf19.tar.gz |
Signed-off-by: Yongqin Liu <yongqin.liu@linaro.org>
-rw-r--r-- | Android.mk | 16 | ||||
-rw-r--r-- | blit.c | 169 | ||||
-rw-r--r-- | blit.h | 5 | ||||
-rwxr-xr-x | run.sh | 14 | ||||
-rw-r--r-- | test_blit.c | 219 | ||||
-rw-r--r-- | timer.c | 38 | ||||
-rw-r--r-- | timer.h | 10 |
7 files changed, 471 insertions, 0 deletions
diff --git a/Android.mk b/Android.mk new file mode 100644 index 0000000..f897ede --- /dev/null +++ b/Android.mk @@ -0,0 +1,16 @@ +LOCAL_PATH := $(call my-dir) + +###################################################### +### blit_test ### +###################################################### +include $(CLEAR_VARS) +LOCAL_SRC_FILES := blit.c test_blit.c timer.c + +LOCAL_MODULE_TAGS := debug +LOCAL_MODULE := blit_test + +LOCAL_MULTILIB := both +LOCAL_MODULE_STEM_64 := $(LOCAL_MODULE)64 +LOCAL_MODULE_STEM_32 := $(LOCAL_MODULE)32 + +include $(BUILD_EXECUTABLE) @@ -0,0 +1,169 @@ +#include<stdio.h> +#include<stddef.h> +#include<stdint.h> +#include<arm_neon.h> + +#define SK_B16_BITS 5 +#define SK_G16_BITS 6 +#define SK_R16_BITS 5 + +#define SK_R16_SHIFT (SK_B16_BITS + SK_G16_BITS) +#define SK_G16_SHIFT (SK_B16_BITS) + +#define SK_G16_MASK ((1 << SK_G16_BITS) - 1) +#define SK_B16_MASK ((1 << SK_B16_BITS) - 1) + +#define SK_G16_MASK_IN_PLACE (SK_G16_MASK << SK_G16_SHIFT) + + +static inline uint32_t SkExpand_rgb_16(unsigned c) { + return ((c & SK_G16_MASK_IN_PLACE) << 16) | (c & ~SK_G16_MASK_IN_PLACE); +} + +static inline unsigned SkCompact_rgb_16(uint32_t c) { + return ((c >> 16) & SK_G16_MASK_IN_PLACE) | (c & ~SK_G16_MASK_IN_PLACE); +} + +void SkRGB16_Opaque_Blitter_blitV_c(uint16_t* device, size_t deviceRB, int height, uint8_t alpha) { + unsigned scale5 = alpha >> 3; + uint32_t src32 = 256 * scale5; + scale5 = 32 - scale5; + do { + uint32_t dst32 = SkExpand_rgb_16(*device) * scale5; + *device = SkCompact_rgb_16((src32 + dst32) >> 5); + device = (uint16_t*)((char*)device + deviceRB); + } while (--height != 0); +} + +#define LOAD_LANE_16(reg, n) \ + reg = vld1q_lane_u16(device, reg, n); \ + device = (uint16_t*)((char*)device + deviceRB); + +#define STORE_LANE_16(reg, n) \ + vst1_lane_u16(dst, reg, n); \ + dst = (uint16_t*)((char*)dst + deviceRB); + +void SkRGB16_Opaque_Blitter_blitV_neon(uint16_t* device, size_t deviceRB, int height, uint8_t alpha) +{ + unsigned scale = alpha >> 3; + uint32_t src32 = 256 * scale; + scale = 32 - scale; + if (height >= 8) + { + uint16_t* dst = device; + + // prepare constants + uint16x8_t vdev = vdupq_n_u16(0); + uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE); + uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE); + uint32x4_t vsrc32 = vdupq_n_u32(src32); + uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale); + + while (height >= 8){ + LOAD_LANE_16(vdev, 0) + LOAD_LANE_16(vdev, 1) + LOAD_LANE_16(vdev, 2) + LOAD_LANE_16(vdev, 3) + LOAD_LANE_16(vdev, 4) + LOAD_LANE_16(vdev, 5) + LOAD_LANE_16(vdev, 6) + LOAD_LANE_16(vdev, 7) + + // Expand_rgb_16 + uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16)); + uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5); + uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5); + + // Compact_rgb_16 + vdst32_lo = vaddq_u32(vdst32_lo, vsrc32); + vdst32_hi = vaddq_u32(vdst32_hi, vsrc32); + vdst32_lo = vshrq_n_u32(vdst32_lo, 5); + vdst32_hi = vshrq_n_u32(vdst32_hi, 5); + + uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16); + uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16); + uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi); + vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16); + vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16); + uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi); + + STORE_LANE_16(vdst16_lo, 0) + STORE_LANE_16(vdst16_lo, 1) + STORE_LANE_16(vdst16_lo, 2) + STORE_LANE_16(vdst16_lo, 3) + STORE_LANE_16(vdst16_hi, 0) + STORE_LANE_16(vdst16_hi, 1) + STORE_LANE_16(vdst16_hi, 2) + STORE_LANE_16(vdst16_hi, 3) + height -= 8; + } + } + while (height != 0){ + uint32_t dst32 = SkExpand_rgb_16(*device) * scale; + *device = SkCompact_rgb_16((src32 + dst32) >> 5); + device = (uint16_t*)((char*)device + deviceRB); + height--; + } +} +#undef LOAD_LANE_16 +#undef STORE_LANE_16 + +void SkRGB16_Opaque_Blitter_blitH_c(uint16_t* device, int height, uint8_t alpha) { + unsigned scale5 = alpha >> 3; + uint32_t src32 = 256 * scale5; + scale5 = 32 - scale5; + do { + uint32_t dst32 = SkExpand_rgb_16(*device) * scale5; + *device++ = SkCompact_rgb_16((src32 + dst32) >> 5); + } while (--height != 0); +} + +void SkRGB16_Opaque_Blitter_blitH_neon(uint16_t* device, int height, uint8_t alpha) +{ + unsigned scale = alpha >> 3; + uint32_t src32 = 256 * scale; + scale = 32 - scale; + if (height >= 8) + { + uint16_t* dst = device; + + // prepare constants + uint16x8_t vdev = vdupq_n_u16(0); + uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE); + uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE); + uint32x4_t vsrc32 = vdupq_n_u32(src32); + uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale); + + while (height >= 8){ + vdev = vld1q_u16(device); + + // Expand_rgb_16 + uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16)); + uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5); + uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5); + + // Compact_rgb_16 + vdst32_lo = vaddq_u32(vdst32_lo, vsrc32); + vdst32_hi = vaddq_u32(vdst32_hi, vsrc32); + vdst32_lo = vshrq_n_u32(vdst32_lo, 5); + vdst32_hi = vshrq_n_u32(vdst32_hi, 5); + + uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16); + uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16); + uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi); + vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16); + vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16); + uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi); + + vst1q_u16(device, vcombine_u16(vdst16_lo, vdst16_hi)); + device += 8; + height -= 8; + } + } + while (height != 0){ + uint32_t dst32 = SkExpand_rgb_16(*device) * scale; + *device++ = SkCompact_rgb_16((src32 + dst32) >> 5); + height--; + } +} + @@ -0,0 +1,5 @@ + +extern void SkRGB16_Opaque_Blitter_blitV_c(uint16_t* device, size_t deviceRB, int height, uint8_t alpha); +extern void SkRGB16_Opaque_Blitter_blitV_neon(uint16_t* device, size_t deviceRB, int height, uint8_t alpha); +extern void SkRGB16_Opaque_Blitter_blitH_c(uint16_t* device, int height, uint8_t alpha); +extern void SkRGB16_Opaque_Blitter_blitH_neon(uint16_t* device, int height, uint8_t alpha); @@ -0,0 +1,14 @@ +#/bin/bash +#gcc_path="/home/yang01/software/google/ndk/android-ndk-r10e/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-gcc" +gcc_path="/data/armv8/lollipop/prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin/aarch64-linux-android-gcc" +#sysroot_path="/home/yang01/software/google/ndk/android-ndk-r10e//platforms/android-21/arch-arm64/" +sysroot_path="/data/armv8/lollipop/prebuilts/ndk/9/platforms/android-21/arch-arm64/" + +${gcc_path} -O2 --sysroot=${sysroot_path} -pie -o blit.c.o -c blit.c +#${gcc_path} -O2 --sysroot=${sysroot_path} -pie -o blit_neon.c.o -c blit_neon.c +${gcc_path} -O2 --sysroot=${sysroot_path} -pie -o timer.c.o -c timer.c +${gcc_path} -O2 --sysroot=${sysroot_path} -pie -o test_blit.c.o -c test_blit.c +${gcc_path} -O2 --sysroot=${sysroot_path} -pie blit.c.o timer.c.o test_blit.c.o -o test_blit + +adb push test_blit /data/local/tmp/ +adb shell /data/local/tmp/test_blit diff --git a/test_blit.c b/test_blit.c new file mode 100644 index 0000000..cb63d3e --- /dev/null +++ b/test_blit.c @@ -0,0 +1,219 @@ +#include<stdio.h> +#include<stdlib.h> +#include<stddef.h> +#include<stdint.h> +#include<arm_neon.h> + +#include "timer.h" +#include "blit.h" + +#define TEST_LENGTH_SAMPLES (10000) + +#define TEST_COUNT 500000 + +static uint16_t testInput_u16[TEST_LENGTH_SAMPLES]; + +//input and output +static uint16_t* in0 = NULL; +static uint16_t* in1 = NULL; +static uint16_t* in2 = NULL; +static uint16_t* in3 = NULL; + +static double time0 = 0; +static double time1 = 0; + +void test_blitV_conformance() +{ + + int i = 0; + int height = 0; + unsigned int sse0, sse1; + + + /* init input memory */ + in0 = (uint16_t*) malloc ( (TEST_LENGTH_SAMPLES) * sizeof (*in0)); + in1 = (uint16_t*) malloc ( (TEST_LENGTH_SAMPLES) * sizeof (*in1)); + + for (i = 0; i < TEST_LENGTH_SAMPLES; i++) + { + testInput_u16[i] = (uint16_t) i; //(rand()&0xffff) ; + } + fprintf (stdout, "-----------------------------------------------------------\n"); + fprintf (stdout, "-------------------------------------------CONF TEST\n"); + fprintf (stdout, "-----------------------------------------------------------\n"); + + for (height = 1; height <= 128; height += 1) + { + printf ("height %d\n", height); + memcpy (in0, testInput_u16, TEST_LENGTH_SAMPLES * sizeof (*in0)); + memcpy (in1, testInput_u16, TEST_LENGTH_SAMPLES * sizeof (*in1)); + for (i = 0; i < TEST_LENGTH_SAMPLES; i++) + { + if (in0[i]!=in1[i]) + printf ("pre i %6d: %10d %10d \n", i, in0[i], in1[i]); + } + SkRGB16_Opaque_Blitter_blitV_c(in0, 4, height, 25); + SkRGB16_Opaque_Blitter_blitV_neon(in1, 4, height, 25); + + for (i = 0; i < TEST_LENGTH_SAMPLES; i++) + { + if (in0[i]!=in1[i]) + printf ("pos i %6d: %10d %10d \n", i, in0[i], in1[i]); + } + } + free (in0); + free (in1); +} + + +void test_blitV_performance() +{ + + int i = 0; + int config[] = {1, 8, 18, 32, 76, 85, 120, 128, 512}; + int num = 0; + int height = 0; + int loop = 0; + unsigned int sse0, sse1; + + /* init input memory */ + in0 = (uint16_t*) malloc ( (TEST_LENGTH_SAMPLES) * sizeof (*in0)); + in1 = (uint16_t*) malloc ( (TEST_LENGTH_SAMPLES) * sizeof (*in1)); + + for (i = 0; i < TEST_LENGTH_SAMPLES; i++) + { + testInput_u16[i] = (uint16_t) i; //(rand()&0xffff) ; + } + fprintf (stdout, "-----------------------------------------------------------\n"); + fprintf (stdout, "-------------------------------------blitV---------PERF TEST\n"); + fprintf (stdout, "-----------------------------------------------------------\n"); + + for (num = 0; num < sizeof(config)/sizeof(*config); num++) + { + loop = TEST_COUNT; + height = config[num]; + + /* C */ + memcpy (in0, testInput_u16, TEST_LENGTH_SAMPLES * sizeof (*in0)); + reset_timer(); + for (i = 0; i < loop ; i++) + SkRGB16_Opaque_Blitter_blitV_c(in0, 4, height, 25); + time0 = get_time(); + + /* NEON */ + memcpy (in1, testInput_u16, TEST_LENGTH_SAMPLES * sizeof (*in0)); + reset_timer(); + for (i = 0; i < loop ; i++) + SkRGB16_Opaque_Blitter_blitV_neon(in1, 4, height, 25); + time1 = get_time(); + + printf ("height %5d: %20lf %20lf\n", height, time0, time1); + printf ("ratio: %lf \n", time0/time1); + + } + + free (in0); + free (in1); +} +void test_blitH_conformance() +{ + + int i = 0; + int height = 0; + unsigned int sse0, sse1; + + + /* init input memory */ + in0 = (uint16_t*) malloc ( (TEST_LENGTH_SAMPLES) * sizeof (*in0)); + in1 = (uint16_t*) malloc ( (TEST_LENGTH_SAMPLES) * sizeof (*in1)); + + for (i = 0; i < TEST_LENGTH_SAMPLES; i++) + { + testInput_u16[i] = (uint16_t) i; //(rand()&0xffff) ; + } + fprintf (stdout, "-----------------------------------------------------------\n"); + fprintf (stdout, "-------------------------------------------CONF TEST\n"); + fprintf (stdout, "-----------------------------------------------------------\n"); + + for (height = 1; height <= 128; height += 1) + { + printf ("height %d\n", height); + memcpy (in0, testInput_u16, TEST_LENGTH_SAMPLES * sizeof (*in0)); + memcpy (in1, testInput_u16, TEST_LENGTH_SAMPLES * sizeof (*in1)); + for (i = 0; i < TEST_LENGTH_SAMPLES; i++) + { + if (in0[i]!=in1[i]) + printf ("pre i %6d: %10d %10d \n", i, in0[i], in1[i]); + } + SkRGB16_Opaque_Blitter_blitH_c(in0, height, 25); + SkRGB16_Opaque_Blitter_blitH_neon(in1, height, 25); + + for (i = 0; i < TEST_LENGTH_SAMPLES; i++) + { + if (in0[i]!=in1[i]) + printf ("pos i %6d: %10d %10d \n", i, in0[i], in1[i]); + } + } + free (in0); + free (in1); +} + + +void test_blitH_performance() +{ + + int i = 0; + int config[] = {1, 8, 18, 32, 76, 85, 120, 128, 512}; + int num = 0; + int height = 0; + int loop = 0; + unsigned int sse0, sse1; + + /* init input memory */ + in0 = (uint16_t*) malloc ( (TEST_LENGTH_SAMPLES) * sizeof (*in0)); + in1 = (uint16_t*) malloc ( (TEST_LENGTH_SAMPLES) * sizeof (*in1)); + + for (i = 0; i < TEST_LENGTH_SAMPLES; i++) + { + testInput_u16[i] = (uint16_t) i; //(rand()&0xffff) ; + } + fprintf (stdout, "-----------------------------------------------------------\n"); + fprintf (stdout, "--------------------------------------blitH--------PERF TEST\n"); + fprintf (stdout, "-----------------------------------------------------------\n"); + + for (num = 0; num < sizeof(config)/sizeof(*config); num++) + { + loop = TEST_COUNT; + height = config[num]; + + /* C */ + memcpy (in0, testInput_u16, TEST_LENGTH_SAMPLES * sizeof (*in0)); + reset_timer(); + for (i = 0; i < loop ; i++) + SkRGB16_Opaque_Blitter_blitH_c(in0, height, 25); + time0 = get_time(); + + /* NEON */ + memcpy (in1, testInput_u16, TEST_LENGTH_SAMPLES * sizeof (*in0)); + reset_timer(); + for (i = 0; i < loop ; i++) + SkRGB16_Opaque_Blitter_blitH_neon(in1, height, 25); + time1 = get_time(); + + printf ("height %5d: %20lf %20lf\n", height, time0, time1); + printf ("ratio: %lf \n", time0/time1); + + } + + free (in0); + free (in1); +} + + +void main (void) +{ + test_blitV_conformance(); // run tests + test_blitV_performance(); // run tests + test_blitH_conformance(); // run tests + test_blitH_performance(); // run tests +} @@ -0,0 +1,38 @@ +#include <sys/resource.h> +//#include <sys/time.h> +#include <time.h> +#include "timer.h" + +#define TIMER_TYPE CLOCK_PROCESS_CPUTIME_ID +struct timespec start_time; + +void reset_timer() +{ + clock_gettime(TIMER_TYPE, &start_time); +} + +double get_time() +{ + struct timespec t; + clock_gettime(TIMER_TYPE, &t); + return ((double)(t.tv_sec - start_time.tv_sec) + (double)(t.tv_nsec - start_time.tv_nsec) * 1.e-9); +} + +#if 0 +void GetUserTime(struct timeval* time) { + struct rusage usage; + getrusage(RUSAGE_SELF, &usage); + memcpy(time, &usage.ru_utime, sizeof(*time)); +} + +double TimeDifference(const struct timeval * start, + const struct timeval * end) { + double start_time; + double end_time; + start_time = start->tv_sec + start->tv_usec * 1e-6; + end_time = end->tv_sec + end->tv_usec * 1e-6; + + return end_time - start_time; +} + +#endif @@ -0,0 +1,10 @@ +#ifndef TIMER_H +#define TIMER_H + +//#define TIMER_TYPE CLOCK_REALTIME +#define TIMER_TYPE CLOCK_PROCESS_CPUTIME_ID + +void reset_timer(); +double get_time(); + +#endif |