aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Huber <andih@google.com>2010-11-08 11:13:04 -0800
committerAndreas Huber <andih@google.com>2010-11-30 11:33:25 -0800
commit76e0247ec867fcc232fc79f21e9bf85d3c3a5a3f (patch)
tree7e5b3862358c88041828ddc70e63cbad9ff1a77d
parentacb586e0536bb0ffc5b3f41dcdedf77fed3f02c5 (diff)
downloadlibvpx-gingerbread-mr4-release.tar.gz
latest commit pulled from git://review.webmproject.org/libvpx.git: 4d1b0d2a2dff335baedd52bd7de09d55ec10b253 Change-Id: I51c08b1b7f80c5071f33ca2c715b3ad03c693113 related-to-bug: 3175974
-rw-r--r--AUTHORS2
-rw-r--r--Android.mk19
-rw-r--r--CHANGELOG108
-rw-r--r--README2
-rw-r--r--args.c8
-rw-r--r--build/.gitattributes2
-rw-r--r--build/.gitignore1
-rwxr-xr-xbuild/make/Makefile2
-rwxr-xr-xbuild/make/configure.sh79
-rwxr-xr-xconfigure8
-rw-r--r--examples.mk47
-rw-r--r--examples/decoder_tmpl.c4
-rw-r--r--examples/decoder_tmpl.txt2
-rw-r--r--examples/encoder_tmpl.c4
-rw-r--r--examples/encoder_tmpl.txt4
-rw-r--r--examples/includes/HTML-Toc-0.91/.gitattributes1
-rw-r--r--ivfdec.c640
-rw-r--r--libmkv/EbmlBufferWriter.c60
-rw-r--r--libmkv/EbmlBufferWriter.h21
-rw-r--r--libmkv/EbmlIDs.h231
-rw-r--r--libmkv/EbmlWriter.c166
-rw-r--r--libmkv/EbmlWriter.h38
-rw-r--r--libmkv/Makefile25
-rw-r--r--libmkv/WebMElement.c220
-rw-r--r--libmkv/WebMElement.h35
-rw-r--r--libmkv/testlibmkv.c63
-rw-r--r--libs.mk2
-rw-r--r--nestegg/AUTHORS1
-rw-r--r--nestegg/INSTALL8
-rw-r--r--nestegg/LICENSE13
-rw-r--r--nestegg/Makefile.am51
-rw-r--r--nestegg/README6
-rw-r--r--nestegg/TODO21
-rw-r--r--nestegg/configure.ac124
-rw-r--r--nestegg/docs/Doxyfile.in1551
-rw-r--r--nestegg/docs/Makefile.am38
-rw-r--r--nestegg/halloc/README45
-rw-r--r--nestegg/halloc/halloc.h43
-rw-r--r--nestegg/halloc/src/align.h36
-rw-r--r--nestegg/halloc/src/halloc.c254
-rw-r--r--nestegg/halloc/src/hlist.h136
-rw-r--r--nestegg/halloc/src/macros.h36
-rw-r--r--nestegg/include/nestegg/nestegg.h292
-rw-r--r--nestegg/m4/as-ac-expand.m443
-rw-r--r--nestegg/m4/ax_create_stdint_h.m4695
-rw-r--r--nestegg/m4/pkg.m4157
-rw-r--r--nestegg/nestegg-uninstalled.pc.in13
-rw-r--r--nestegg/nestegg.pc.in13
-rw-r--r--nestegg/src/nestegg.c1938
-rw-r--r--nestegg/test/test.c248
-rwxr-xr-xrelease.sh210
-rw-r--r--solution.mk2
-rw-r--r--tools_common.c24
-rw-r--r--tools_common.h16
-rw-r--r--vp8/common/alloccommon.c8
-rw-r--r--vp8/common/arm/arm_systemdependent.c136
-rw-r--r--vp8/common/arm/armv6/filter_v6.asm59
-rw-r--r--vp8/common/arm/armv6/sixtappredict8x4_v6.asm37
-rw-r--r--vp8/common/arm/bilinearfilter_arm.c22
-rw-r--r--vp8/common/arm/filter_arm.c121
-rw-r--r--vp8/common/arm/idct_arm.h4
-rw-r--r--vp8/common/arm/loopfilter_arm.c20
-rw-r--r--vp8/common/arm/loopfilter_arm.h4
-rw-r--r--vp8/common/arm/neon/loopfilter_neon.asm409
-rw-r--r--vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm206
-rw-r--r--vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm189
-rw-r--r--vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm232
-rw-r--r--vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm236
-rw-r--r--vp8/common/arm/neon/mbloopfilter_neon.asm519
-rw-r--r--vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm258
-rw-r--r--vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm237
-rw-r--r--vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm297
-rw-r--r--vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm304
-rw-r--r--vp8/common/arm/neon/recon_neon.c29
-rw-r--r--vp8/common/arm/recon_arm.c109
-rw-r--r--vp8/common/arm/recon_arm.h9
-rw-r--r--vp8/common/arm/reconintra4x4_arm.c409
-rw-r--r--vp8/common/arm/subpixel_arm.h4
-rw-r--r--vp8/common/arm/systemdependent.c149
-rw-r--r--vp8/common/blockd.h82
-rw-r--r--vp8/common/debugmodes.c10
-rw-r--r--vp8/common/defaultcoefcounts.h72
-rw-r--r--vp8/common/entropy.h26
-rw-r--r--vp8/common/entropymv.c16
-rw-r--r--vp8/common/extend.c31
-rw-r--r--vp8/common/filter_c.c76
-rw-r--r--vp8/common/findnearmv.c4
-rw-r--r--vp8/common/generic/systemdependent.c24
-rw-r--r--vp8/common/invtrans.c4
-rw-r--r--vp8/common/loopfilter.c132
-rw-r--r--vp8/common/loopfilter.h12
-rw-r--r--vp8/common/loopfilter_filters.c80
-rw-r--r--vp8/common/mbpitch.c8
-rw-r--r--vp8/common/modecont.c12
-rw-r--r--vp8/common/modecontext.c220
-rw-r--r--vp8/common/onyxc_int.h45
-rw-r--r--vp8/common/postproc.c520
-rw-r--r--vp8/common/postproc.h40
-rw-r--r--vp8/common/ppflags.h19
-rw-r--r--vp8/common/recon.c47
-rw-r--r--vp8/common/recon.h27
-rw-r--r--vp8/common/reconinter.c57
-rw-r--r--vp8/common/reconintra.c46
-rw-r--r--vp8/common/reconintra4x4.c76
-rw-r--r--vp8/common/setupintrarecon.c2
-rw-r--r--vp8/common/textblit.c79
-rw-r--r--vp8/common/threading.h6
-rw-r--r--vp8/common/treecoder.h2
-rw-r--r--vp8/common/type_aliases.h14
-rw-r--r--vp8/common/x86/idctllm_mmx.asm24
-rw-r--r--vp8/common/x86/idctllm_sse2.asm44
-rw-r--r--vp8/common/x86/iwalsh_mmx.asm2
-rw-r--r--vp8/common/x86/loopfilter_mmx.asm208
-rw-r--r--vp8/common/x86/loopfilter_sse2.asm1281
-rw-r--r--vp8/common/x86/loopfilter_x86.c16
-rw-r--r--vp8/common/x86/postproc_mmx.asm10
-rw-r--r--vp8/common/x86/postproc_sse2.asm10
-rw-r--r--vp8/common/x86/subpixel_mmx.asm42
-rw-r--r--vp8/common/x86/subpixel_sse2.asm48
-rw-r--r--vp8/common/x86/subpixel_ssse3.asm374
-rw-r--r--vp8/common/x86/vp8_asm_stubs.c36
-rw-r--r--vp8/common/x86/x86_systemdependent.c2
-rw-r--r--vp8/decoder/arm/arm_dsystemdependent.c66
-rw-r--r--vp8/decoder/arm/dboolhuff_arm.h10
-rw-r--r--vp8/decoder/arm/dequantize_arm.c4
-rw-r--r--vp8/decoder/arm/dequantize_arm.h4
-rw-r--r--vp8/decoder/arm/dsystemdependent.c39
-rw-r--r--vp8/decoder/arm/neon/dequant_dc_idct_neon.asm136
-rw-r--r--vp8/decoder/arm/neon/idct_blk_neon.c150
-rw-r--r--vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm79
-rw-r--r--vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm69
-rw-r--r--vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm206
-rw-r--r--vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm198
-rw-r--r--vp8/decoder/dboolhuff.h19
-rw-r--r--[-rwxr-xr-x]vp8/decoder/decodemv.c46
-rw-r--r--vp8/decoder/decoderthreading.h12
-rw-r--r--vp8/decoder/decodframe.c190
-rw-r--r--vp8/decoder/dequantize.c6
-rw-r--r--vp8/decoder/detokenize.c39
-rw-r--r--vp8/decoder/generic/dsystemdependent.c9
-rw-r--r--vp8/decoder/idct_blk.c8
-rw-r--r--vp8/decoder/onyxd_if.c184
-rw-r--r--vp8/decoder/onyxd_int.h31
-rw-r--r--vp8/decoder/reconintra_mt.c982
-rw-r--r--vp8/decoder/reconintra_mt.h26
-rw-r--r--vp8/decoder/threading.c937
-rw-r--r--vp8/decoder/x86/dequantize_mmx.asm42
-rw-r--r--vp8/decoder/xprintf.c164
-rw-r--r--vp8/decoder/xprintf.h33
-rw-r--r--vp8/encoder/arm/arm_csystemdependent.c139
-rw-r--r--vp8/encoder/arm/armv5te/boolhuff_armv5te.asm (renamed from vp8/encoder/arm/neon/boolhuff_armv7.asm)11
-rw-r--r--vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm (renamed from vp8/encoder/arm/neon/vp8_packtokens_armv7.asm)20
-rw-r--r--vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm (renamed from vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm)20
-rw-r--r--vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm (renamed from vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm)20
-rw-r--r--vp8/encoder/arm/csystemdependent.c164
-rw-r--r--vp8/encoder/arm/dct_arm.h4
-rw-r--r--vp8/encoder/arm/encodemb_arm.h2
-rw-r--r--vp8/encoder/arm/mcomp_arm.c1663
-rw-r--r--vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm18
-rw-r--r--vp8/encoder/arm/quantize_arm.c2
-rw-r--r--vp8/encoder/arm/variance_arm.h14
-rw-r--r--vp8/encoder/bitstream.c14
-rw-r--r--vp8/encoder/bitstream.h14
-rw-r--r--vp8/encoder/block.h10
-rw-r--r--vp8/encoder/encodeframe.c196
-rw-r--r--vp8/encoder/encodeintra.c16
-rw-r--r--vp8/encoder/encodemb.c84
-rw-r--r--vp8/encoder/firstpass.c405
-rw-r--r--vp8/encoder/firstpass.h1
-rw-r--r--vp8/encoder/generic/csystemdependent.c14
-rw-r--r--vp8/encoder/mcomp.c411
-rw-r--r--vp8/encoder/mcomp.h8
-rw-r--r--vp8/encoder/onyx_if.c790
-rw-r--r--vp8/encoder/onyx_int.h56
-rw-r--r--vp8/encoder/pickinter.c29
-rw-r--r--vp8/encoder/picklpf.c71
-rw-r--r--vp8/encoder/quantize.c219
-rw-r--r--vp8/encoder/ratectrl.c2
-rw-r--r--vp8/encoder/rdopt.c132
-rw-r--r--vp8/encoder/sad_c.c144
-rw-r--r--vp8/encoder/temporal_filter.c651
-rw-r--r--vp8/encoder/temporal_filter.h19
-rw-r--r--vp8/encoder/tokenize.c58
-rw-r--r--vp8/encoder/tokenize.h8
-rw-r--r--vp8/encoder/variance.h99
-rw-r--r--vp8/encoder/variance_c.c107
-rw-r--r--vp8/encoder/x86/csystemdependent.c310
-rw-r--r--vp8/encoder/x86/dct_mmx.asm4
-rw-r--r--vp8/encoder/x86/dct_sse2.asm30
-rw-r--r--vp8/encoder/x86/encodemb_x86.h13
-rw-r--r--vp8/encoder/x86/encodeopt.asm12
-rw-r--r--vp8/encoder/x86/fwalsh_sse2.asm220
-rw-r--r--vp8/encoder/x86/mcomp_x86.h9
-rw-r--r--vp8/encoder/x86/quantize_mmx.asm157
-rw-r--r--vp8/encoder/x86/quantize_sse2.asm134
-rwxr-xr-xvp8/encoder/x86/quantize_ssse3.asm114
-rw-r--r--vp8/encoder/x86/sad_mmx.asm28
-rw-r--r--vp8/encoder/x86/sad_sse2.asm34
-rw-r--r--vp8/encoder/x86/sad_sse3.asm168
-rw-r--r--vp8/encoder/x86/sad_sse4.asm353
-rw-r--r--vp8/encoder/x86/sad_ssse3.asm44
-rw-r--r--vp8/encoder/x86/subtract_mmx.asm2
-rw-r--r--vp8/encoder/x86/subtract_sse2.asm356
-rw-r--r--vp8/encoder/x86/variance_impl_mmx.asm20
-rw-r--r--vp8/encoder/x86/variance_impl_sse2.asm10
-rw-r--r--vp8/encoder/x86/variance_mmx.c128
-rw-r--r--vp8/encoder/x86/variance_sse2.c207
-rw-r--r--vp8/encoder/x86/variance_x86.h51
-rw-r--r--vp8/encoder/x86/x86_csystemdependent.c126
-rw-r--r--vp8/exports_dec1
-rw-r--r--vp8/exports_enc1
-rw-r--r--vp8/vp8_common.mk22
-rw-r--r--vp8/vp8_cx_iface.c76
-rw-r--r--vp8/vp8_dx_iface.c38
-rw-r--r--vp8/vp8cx.mk5
-rw-r--r--vp8/vp8cx_arm.mk20
-rw-r--r--vp8/vp8dx.mk2
-rw-r--r--vp8/vp8dx_arm.mk11
-rw-r--r--vpx/internal/vpx_codec_internal.h14
-rw-r--r--vpx/src/vpx_encoder.c2
-rw-r--r--vpx/vp8.h2
-rw-r--r--vpx/vp8cx.h3
-rw-r--r--vpx/vp8dx.h3
-rw-r--r--vpx/vpx_codec.h4
-rw-r--r--vpx/vpx_decoder_compat.h2
-rw-r--r--vpx/vpx_encoder.h2
-rw-r--r--vpx/vpx_image.h2
-rw-r--r--vpx_mem/include/vpx_mem_intrnl.h46
-rw-r--r--vpx_mem/vpx_mem.c32
-rw-r--r--vpx_mem/vpx_mem.h12
-rw-r--r--vpx_ports/arm.h27
-rw-r--r--vpx_ports/arm_cpudetect.c190
-rw-r--r--vpx_ports/emms.asm2
-rw-r--r--vpx_ports/x86.h35
-rw-r--r--vpx_ports/x86_abi_support.asm45
-rw-r--r--vpx_ports/x86_cpuid.c53
-rw-r--r--vpx_scale/arm/scalesystemdependant.c22
-rw-r--r--vpx_scale/generic/gen_scalers.c12
-rw-r--r--vpx_scale/generic/vpxscale.c148
-rw-r--r--vpx_scale/generic/yv12config.c9
-rw-r--r--vpx_scale/generic/yv12extend.c16
-rw-r--r--vpx_scale/yv12config.h10
-rw-r--r--vpxdec.c1043
-rw-r--r--vpxenc.c (renamed from ivfenc.c)653
-rw-r--r--y4minput.c2
245 files changed, 19312 insertions, 10810 deletions
diff --git a/AUTHORS b/AUTHORS
index 6708d5aa9..110e5e143 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,6 +1,7 @@
# This file is automatically generated from the git commit history
# by tools/gen_authors.sh.
+Aaron Watry <awatry@gmail.com>
Adrian Grange <agrange@google.com>
Alex Converse <alex.converse@gmail.com>
Andres Mejia <mcitadel@gmail.com>
@@ -20,6 +21,7 @@ Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
+Martin Ettl <ettl.martin78@googlemail.com>
Michael Kohler <michaelkohler@live.com>
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
diff --git a/Android.mk b/Android.mk
index bf8e58d71..40209bdec 100644
--- a/Android.mk
+++ b/Android.mk
@@ -12,6 +12,8 @@ LOCAL_SRC_FILES = \
vpx_scale/generic/scalesystemdependant.c \
vpx_scale/generic/gen_scalers.c \
vp8/common/alloccommon.c \
+ vp8/common/arm/arm_systemdependent.c \
+ vp8/common/arm/reconintra_arm.c \
vp8/common/blockd.c \
vp8/common/debugmodes.c \
vp8/common/entropy.c \
@@ -41,6 +43,7 @@ LOCAL_SRC_FILES = \
vp8/common/postproc.c \
vp8/vp8_cx_iface.c \
vp8/vp8_dx_iface.c \
+ vp8/decoder/arm/arm_dsystemdependent.c \
vp8/decoder/dboolhuff.c \
vp8/decoder/decodemv.c \
vp8/decoder/decodframe.c \
@@ -48,6 +51,7 @@ LOCAL_SRC_FILES = \
vp8/decoder/detokenize.c \
vp8/decoder/generic/dsystemdependent.c \
vp8/decoder/onyxd_if.c \
+ vp8/decoder/reconintra_mt.c \
vp8/decoder/threading.c \
vpx_config.c \
vp8/decoder/arm/neon/idct_blk_neon.c
@@ -78,16 +82,8 @@ ASM_FILES = \
vp8/common/arm/neon/copymem8x4_neon.s \
vp8/common/arm/neon/copymem8x8_neon.s \
vp8/common/arm/neon/iwalsh_neon.s \
- vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.s \
- vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.s \
vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.s \
vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.s \
- vp8/common/arm/neon/loopfilterverticaledge_uv_neon.s \
- vp8/common/arm/neon/loopfilterverticaledge_y_neon.s \
- vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.s \
- vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.s \
- vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.s \
- vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.s \
vp8/common/arm/neon/recon16x16mb_neon.s \
vp8/common/arm/neon/recon2b_neon.s \
vp8/common/arm/neon/recon4b_neon.s \
@@ -101,8 +97,13 @@ ASM_FILES = \
vp8/common/arm/neon/sixtappredict8x8_neon.s \
vp8/common/arm/neon/dc_only_idct_add_neon.s \
vp8/decoder/arm/neon/dequantizeb_neon.s \
- vp8/decoder/arm/neon/dequant_dc_idct_neon.s \
vp8/decoder/arm/neon/dequant_idct_neon.s \
+ vp8/decoder/arm/neon/idct_dequant_0_2x_neon.s \
+ vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.s \
+ vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.s \
+ vp8/decoder/arm/neon/idct_dequant_full_2x_neon.s \
+ vp8/common/arm/neon/loopfilter_neon.s \
+ vp8/common/arm/neon/mbloopfilter_neon.s \
# All the assembly sources must be converted from ADS to GAS compatible format
VPX_GEN := $(addprefix $(intermediates)/, $(ASM_FILES))
diff --git a/CHANGELOG b/CHANGELOG
index 2b2803740..b8da8f8e3 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,111 @@
+2010-10-28 v0.9.5 "Aylesbury"
+ Our first named release, focused on a faster decoder, and a better encoder.
+
+ - Upgrading:
+ This release incorporates backwards-incompatible changes to the
+ ivfenc and ivfdec tools. These tools are now called vpxenc and vpxdec.
+
+ vpxdec
+ * the -q (quiet) option has been removed, and replaced with
+ -v (verbose). the output is quiet by default. Use -v to see
+ the version number of the binary.
+
+ * The default behavior is now to write output to a single file
+ instead of individual frames. The -y option has been removed.
+ Y4M output is the default.
+
+ * For raw I420/YV12 output instead of Y4M, the --i420 or --yv12
+ options must be specified.
+
+ $ ivfdec -o OUTPUT INPUT
+ $ vpxdec --i420 -o OUTPUT INPUT
+
+ * If an output file is not specified, the default is to write
+ Y4M to stdout. This makes piping more natural.
+
+ $ ivfdec -y -o - INPUT | ...
+ $ vpxdec INPUT | ...
+
+ * The output file has additional flexibility for formatting the
+ filename. It supports escape characters for constructing a
+ filename from the width, height, and sequence number. This
+ replaces the -p option. To get the equivalent:
+
+ $ ivfdec -p frame INPUT
+ $ vpxdec --i420 -o frame-%wx%h-%4.i420 INPUT
+
+ vpxenc
+ * The output file must be specified with -o, rather than as the
+ last argument.
+
+ $ ivfenc <options> INPUT OUTPUT
+ $ vpxenc <options> -o OUTPUT INPUT
+
+ * The output defaults to webm. To get IVF output, use the --ivf
+ option.
+
+ $ ivfenc <options> INPUT OUTPUT.ivf
+ $ vpxenc <options> -o OUTPUT.ivf --ivf INPUT
+
+
+ - Enhancements:
+ ivfenc and ivfdec have been renamed to vpxenc, vpxdec.
+ vpxdec supports .webm input
+ vpxdec writes .y4m by default
+ vpxenc writes .webm output by default
+ vpxenc --psnr now shows the average/overall PSNR at the end
+ ARM platforms now support runtime cpu detection
+ vpxdec visualizations added for motion vectors, block modes, references
+ vpxdec now silent by default
+ vpxdec --progress shows frame-by-frame timing information
+ vpxenc supports the distinction between --fps and --timebase
+ NASM is now a supported assembler
+ configure: enable PIC for shared libs by default
+ configure: add --enable-small
+ configure: support for ppc32-linux-gcc
+ configure: support for sparc-solaris-gcc
+
+ - Bugs:
+ Improve handling of invalid frames
+ Fix valgrind errors in the NEON loop filters.
+ Fix loopfilter delta zero transitions
+ Fix valgrind errors in vp8_sixtap_predict8x4_armv6().
+ Build fixes for darwin-icc
+
+ - Speed:
+ 20-40% (average 28%) improvement in libvpx decoder speed,
+ including:
+ Rewrite vp8_short_walsh4x4_sse2()
+ Optimizations on the loopfilters.
+ Miscellaneous improvements for Atom
+ Add 4-tap version of 2nd-pass ARMv6 MC filter.
+ Improved multithread utilization
+ Better instruction choices on x86
+ reorder data to use wider instructions
+ Update NEON wide idcts
+ Make block access to frame buffer sequential
+ Improved subset block search
+ Bilinear subpixel optimizations for ssse3.
+ Decrease memory footprint
+
+ Encoder speed improvements (percentage gain not measured):
+ Skip unnecessary search of identical frames
+ Add SSE2 subtract functions
+ Improve bounds checking in vp8_diamond_search_sadx4()
+ Added vp8_fast_quantize_b_sse2
+
+ - Quality:
+ Over 7% overall PSNR improvement (6.3% SSIM) in "best" quality
+ encoding mode, and up to 60% improvement on very noisy, still
+ or slow moving source video
+
+ Motion compensated temporal filter for Alt-Ref Noise Reduction
+ Improved use of trellis quantization on 2nd order Y blocks
+ Tune effect of motion on KF/GF boost in two pass
+ Allow coefficient optimization for good quality speed 0.
+ Improved control of active min quantizer for two pass.
+ Enable ARFs for non-lagged compress
+
2010-09-02 v0.9.2
- Enhancements:
Disable frame dropping by default
diff --git a/README b/README
index f0625d3d7..c1a76687f 100644
--- a/README
+++ b/README
@@ -89,7 +89,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
toolchain, the following command could be used (note, POSIX SH syntax, adapt
to your shell as necessary):
- $ CROSS=mipsel-linux-uclibc- ../libvpx/src/configure
+ $ CROSS=mipsel-linux-uclibc- ../libvpx/configure
In addition, the executables to be invoked can be overridden by specifying the
environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be
diff --git a/args.c b/args.c
index 5365e9120..782929022 100644
--- a/args.c
+++ b/args.c
@@ -120,9 +120,13 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs)
char *long_val = def->has_val ? "=<arg>" : "";
if (def->short_name && def->long_name)
- snprintf(option_text, 37, "-%s%s, --%s%s",
- def->short_name, short_val,
+ {
+ char *comma = def->has_val ? "," : ", ";
+
+ snprintf(option_text, 37, "-%s%s%s --%s%6s",
+ def->short_name, short_val, comma,
def->long_name, long_val);
+ }
else if (def->short_name)
snprintf(option_text, 37, "-%s%s",
def->short_name, short_val);
diff --git a/build/.gitattributes b/build/.gitattributes
deleted file mode 100644
index 03db79bc0..000000000
--- a/build/.gitattributes
+++ /dev/null
@@ -1,2 +0,0 @@
-*-vs8/*.rules -crlf
-*-msvs/*.rules -crlf
diff --git a/build/.gitignore b/build/.gitignore
deleted file mode 100644
index 1350fcb5e..000000000
--- a/build/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-x86*-win32-vs*
diff --git a/build/make/Makefile b/build/make/Makefile
index 1ca747a26..40fa6d50c 100755
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -65,7 +65,7 @@ endif
BUILD_ROOT?=.
VPATH=$(SRC_PATH_BARE)
CFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT) -I$(SRC_PATH)
-ASFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT) -I$(SRC_PATH)
+ASFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT)/ -I$(SRC_PATH)/
DIST_DIR?=dist
HOSTCC?=gcc
TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN)))
diff --git a/build/make/configure.sh b/build/make/configure.sh
index e9f8a2b9c..d25d6400e 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -255,9 +255,10 @@ TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h"
TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c"
TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o"
TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x"
+TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm"
clean_temp_files() {
- rm -f ${TMP_C} ${TMP_H} ${TMP_O} ${TMP_X}
+ rm -f ${TMP_C} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
}
#
@@ -322,6 +323,21 @@ check_add_ldflags() {
add_ldflags "$@"
}
+check_asm_align() {
+ log check_asm_align "$@"
+ cat >${TMP_ASM} <<EOF
+section .rodata
+align 16
+EOF
+ log_file ${TMP_ASM}
+ check_cmd ${AS} ${ASFLAGS} -o ${TMP_O} ${TMP_ASM}
+ readelf -WS ${TMP_O} >${TMP_X}
+ log_file ${TMP_X}
+ if ! grep -q '\.rodata .* 16$' ${TMP_X}; then
+ die "${AS} ${ASFLAGS} does not support section alignment (nasm <=2.08?)"
+ fi
+}
+
write_common_config_banner() {
echo '# This file automatically generated by configure. Do not edit!' > config.mk
echo "TOOLCHAIN := ${toolchain}" >> config.mk
@@ -440,13 +456,18 @@ process_common_cmdline() {
disable builtin_libc
alt_libc="${optval}"
;;
+ --as=*)
+ [ "${optval}" = yasm -o "${optval}" = nasm -o "${optval}" = auto ] \
+ || die "Must be yasm, nasm or auto: ${optval}"
+ alt_as="${optval}"
+ ;;
--prefix=*)
prefix="${optval}"
;;
--libdir=*)
libdir="${optval}"
;;
- --libc|--prefix|--libdir)
+ --libc|--as|--prefix|--libdir)
die "Option ${opt} requires argument"
;;
--help|-h) show_help
@@ -505,6 +526,15 @@ process_common_toolchain() {
*i[3456]86*)
tgt_isa=x86
;;
+ *powerpc64*)
+ tgt_isa=ppc64
+ ;;
+ *powerpc*)
+ tgt_isa=ppc32
+ ;;
+ *sparc*)
+ tgt_isa=sparc
+ ;;
esac
# detect tgt_os
@@ -524,6 +554,9 @@ process_common_toolchain() {
*linux*|*bsd*)
tgt_os=linux
;;
+ *solaris2.10)
+ tgt_os=solaris
+ ;;
esac
if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
@@ -556,15 +589,18 @@ process_common_toolchain() {
mips*) enable mips;;
esac
+ # PIC is probably what we want when building shared libs
+ enabled shared && soft_enable pic
+
# Handle darwin variants
case ${toolchain} in
- *-darwin8-gcc)
+ *-darwin8-*)
add_cflags "-isysroot /Developer/SDKs/MacOSX10.4u.sdk"
add_cflags "-mmacosx-version-min=10.4"
add_ldflags "-isysroot /Developer/SDKs/MacOSX10.4u.sdk"
add_ldflags "-mmacosx-version-min=10.4"
;;
- *-darwin9-gcc)
+ *-darwin9-*)
add_cflags "-isysroot /Developer/SDKs/MacOSX10.5.sdk"
add_cflags "-mmacosx-version-min=10.5"
add_ldflags "-isysroot /Developer/SDKs/MacOSX10.5.sdk"
@@ -572,6 +608,13 @@ process_common_toolchain() {
;;
esac
+ # Handle Solaris variants. Solaris 10 needs -lposix4
+ case ${toolchain} in
+ *-solaris-*)
+ add_extralibs -lposix4
+ ;;
+ esac
+
# Process ARM architecture variants
case ${toolchain} in
arm*|iwmmxt*)
@@ -755,8 +798,8 @@ process_common_toolchain() {
link_with_cc=gcc
setup_gnu_toolchain
add_asflags -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
- add_cflags -maltivec -faltivec
soft_enable altivec
+ enabled altivec && add_cflags -maltivec
case "$tgt_os" in
linux*)
@@ -768,6 +811,7 @@ process_common_toolchain() {
add_cflags ${darwin_arch} -m${bits} -fasm-blocks
add_asflags ${darwin_arch} -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
add_ldflags ${darwin_arch} -m${bits}
+ enabled altivec && add_cflags -faltivec
;;
esac
;;
@@ -780,6 +824,7 @@ process_common_toolchain() {
soft_enable sse2
soft_enable sse3
soft_enable ssse3
+ soft_enable sse4_1
case ${tgt_os} in
win*)
@@ -792,6 +837,7 @@ process_common_toolchain() {
;;
esac
+ AS="${alt_as:-${AS:-auto}}"
case ${tgt_cc} in
icc*)
CC=${CC:-icc}
@@ -820,7 +866,16 @@ process_common_toolchain() {
;;
esac
- AS=yasm
+ case "${AS}" in
+ auto|"")
+ which nasm >/dev/null 2>&1 && AS=nasm
+ which yasm >/dev/null 2>&1 && AS=yasm
+ [ "${AS}" = auto -o -z "${AS}" ] \
+ && die "Neither yasm nor nasm have been found"
+ ;;
+ esac
+ log_echo " using $AS"
+ [ "${AS##*/}" = nasm ] && add_asflags -Ox
AS_SFX=.asm
case ${tgt_os} in
win*)
@@ -829,7 +884,9 @@ process_common_toolchain() {
;;
linux*|solaris*)
add_asflags -f elf${bits}
- enabled debug && add_asflags -g dwarf2
+ enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2
+ enabled debug && [ "${AS}" = nasm ] && add_asflags -g
+ [ "${AS##*/}" = nasm ] && check_asm_align
;;
darwin*)
add_asflags -f macho${bits}
@@ -842,7 +899,7 @@ process_common_toolchain() {
# enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
enabled icc && ! enabled pic && add_cflags -fno-pic
;;
- *) log "Warning: Unknown os $tgt_os while setting up yasm flags"
+ *) log "Warning: Unknown os $tgt_os while setting up $AS flags"
;;
esac
;;
@@ -873,9 +930,9 @@ process_common_toolchain() {
enabled gcov &&
check_add_cflags -fprofile-arcs -ftest-coverage &&
check_add_ldflags -fprofile-arcs -ftest-coverage
- enabled optimizations && check_add_cflags -O3
- if enabled rvct; then
- enabled optimizations && check_add_cflags -Otime
+ if enabled optimizations; then
+ enabled rvct && check_add_cflags -Otime
+ enabled small && check_add_cflags -O2 || check_add_cflags -O3
fi
# Position Independant Code (PIC) support, for building relocatable
diff --git a/configure b/configure
index 5d6964e09..11e086e9c 100755
--- a/configure
+++ b/configure
@@ -23,6 +23,7 @@ Advanced options:
${toggle_libs} don't build libraries
${toggle_examples} don't build examples
--libc=PATH path to alternate libc
+ --as={yasm|nasm|auto} use specified assembler [auto, yasm preferred]
${toggle_fast_unaligned} don't use unaligned accesses, even when
supported by hardware [auto]
${toggle_codec_srcs} in/exclude codec library source code
@@ -38,6 +39,7 @@ Advanced options:
${toggle_realtime_only} enable this option while building for real-time encoding
${toggle_runtime_cpu_detect} runtime cpu detection
${toggle_shared} shared library support
+ ${toggle_small} favor smaller size over speed
${toggle_arm_asm_detok} assembly version of the detokenizer (ARM platforms only)
Codecs:
@@ -95,9 +97,11 @@ all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} ppc32-darwin8-gcc"
all_platforms="${all_platforms} ppc32-darwin9-gcc"
+all_platforms="${all_platforms} ppc32-linux-gcc"
all_platforms="${all_platforms} ppc64-darwin8-gcc"
all_platforms="${all_platforms} ppc64-darwin9-gcc"
all_platforms="${all_platforms} ppc64-linux-gcc"
+all_platforms="${all_platforms} sparc-solaris-gcc"
all_platforms="${all_platforms} x86-darwin8-gcc"
all_platforms="${all_platforms} x86-darwin8-icc"
all_platforms="${all_platforms} x86-darwin9-gcc"
@@ -195,6 +199,7 @@ ARCH_EXT_LIST="
sse2
sse3
ssse3
+ sse4_1
altivec
"
@@ -243,6 +248,7 @@ CONFIG_LIST="
spatial_resampling
realtime_only
shared
+ small
arm_asm_detok
"
CMDLINE_SELECT="
@@ -263,6 +269,7 @@ CMDLINE_SELECT="
libs
examples
libc
+ as
fast_unaligned
codec_srcs
debug_libs
@@ -280,6 +287,7 @@ CMDLINE_SELECT="
spatial_resampling
realtime_only
shared
+ small
arm_asm_detok
"
diff --git a/examples.mk b/examples.mk
index 00ffc7037..a30205d31 100644
--- a/examples.mk
+++ b/examples.mk
@@ -12,19 +12,40 @@
# List of examples to build. UTILS are files that are taken from the source
# tree directly, and GEN_EXAMPLES are files that are created from the
# examples folder.
-UTILS-$(CONFIG_DECODERS) += ivfdec.c
-ivfdec.SRCS += md5_utils.c md5_utils.h
-ivfdec.SRCS += vpx_ports/vpx_timer.h
-ivfdec.SRCS += vpx/vpx_integer.h
-ivfdec.SRCS += args.c args.h vpx_ports/config.h
-ivfdec.GUID = BA5FE66F-38DD-E034-F542-B1578C5FB950
-ivfdec.DESCRIPTION = Full featured decoder
-UTILS-$(CONFIG_ENCODERS) += ivfenc.c
-ivfenc.SRCS += args.c args.h y4minput.c y4minput.h
-ivfenc.SRCS += vpx_ports/config.h vpx_ports/mem_ops.h
-ivfenc.SRCS += vpx_ports/mem_ops_aligned.h
-ivfenc.GUID = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
-ivfenc.DESCRIPTION = Full featured encoder
+UTILS-$(CONFIG_DECODERS) += vpxdec.c
+vpxdec.SRCS += md5_utils.c md5_utils.h
+vpxdec.SRCS += vpx_ports/vpx_timer.h
+vpxdec.SRCS += vpx/vpx_integer.h
+vpxdec.SRCS += args.c args.h vpx_ports/config.h
+vpxdec.SRCS += tools_common.c tools_common.h
+vpxdec.SRCS += nestegg/halloc/halloc.h
+vpxdec.SRCS += nestegg/halloc/src/align.h
+vpxdec.SRCS += nestegg/halloc/src/halloc.c
+vpxdec.SRCS += nestegg/halloc/src/hlist.h
+vpxdec.SRCS += nestegg/halloc/src/macros.h
+vpxdec.SRCS += nestegg/include/nestegg/nestegg.h
+vpxdec.SRCS += nestegg/src/nestegg.c
+vpxdec.GUID = BA5FE66F-38DD-E034-F542-B1578C5FB950
+vpxdec.DESCRIPTION = Full featured decoder
+UTILS-$(CONFIG_ENCODERS) += vpxenc.c
+vpxenc.SRCS += args.c args.h y4minput.c y4minput.h
+vpxenc.SRCS += tools_common.c tools_common.h
+vpxenc.SRCS += vpx_ports/config.h vpx_ports/mem_ops.h
+vpxenc.SRCS += vpx_ports/mem_ops_aligned.h
+vpxenc.SRCS += libmkv/EbmlIDs.h
+vpxenc.SRCS += libmkv/EbmlWriter.c
+vpxenc.SRCS += libmkv/EbmlWriter.h
+vpxenc.GUID = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
+vpxenc.DESCRIPTION = Full featured encoder
+
+# Clean up old ivfenc, ivfdec binaries.
+ifeq ($(CONFIG_MSVS),yes)
+CLEAN-OBJS += $(foreach p,$(VS_PLATFORMS),$(p)/Release/ivfenc.exe)
+CLEAN-OBJS += $(foreach p,$(VS_PLATFORMS),$(p)/Release/ivfdec.exe)
+else
+CLEAN-OBJS += ivfenc{.c.o,.c.d,.dox,.exe,}
+CLEAN-OBJS += ivfdec{.c.o,.c.d,.dox,.exe,}
+endif
# XMA example disabled for now, not used in VP8
#UTILS-$(CONFIG_DECODERS) += example_xma.c
diff --git a/examples/decoder_tmpl.c b/examples/decoder_tmpl.c
index ba3ac987f..26b745d34 100644
--- a/examples/decoder_tmpl.c
+++ b/examples/decoder_tmpl.c
@@ -61,8 +61,8 @@ int main(int argc, char **argv) {
die("Failed to open %s for writing", argv[2]);
/* Read file header */
- fread(file_hdr, 1, IVF_FILE_HDR_SZ, infile);
- if(!(file_hdr[0]=='D' && file_hdr[1]=='K' && file_hdr[2]=='I'
+ if(!(fread(file_hdr, 1, IVF_FILE_HDR_SZ, infile) == IVF_FILE_HDR_SZ
+ && file_hdr[0]=='D' && file_hdr[1]=='K' && file_hdr[2]=='I'
&& file_hdr[3]=='F'))
die("%s is not an IVF file.", argv[1]);
diff --git a/examples/decoder_tmpl.txt b/examples/decoder_tmpl.txt
index 6da38c2a8..310c66d54 100644
--- a/examples/decoder_tmpl.txt
+++ b/examples/decoder_tmpl.txt
@@ -48,7 +48,7 @@ for(plane=0; plane < 3; plane++) {
unsigned char *buf =img->planes[plane];
for(y=0; y<img->d_h >> (plane?1:0); y++) {
- fwrite(buf, 1, img->d_w >> (plane?1:0), outfile);
+ if(fwrite(buf, 1, img->d_w >> (plane?1:0), outfile));
buf += img->stride[plane];
}
}
diff --git a/examples/encoder_tmpl.c b/examples/encoder_tmpl.c
index fdfc3af8f..d9e4d0317 100644
--- a/examples/encoder_tmpl.c
+++ b/examples/encoder_tmpl.c
@@ -85,7 +85,7 @@ static void write_ivf_file_header(FILE *outfile,
mem_put_le32(header+24, frame_cnt); /* length */
mem_put_le32(header+28, 0); /* unused */
- fwrite(header, 1, 32, outfile);
+ if(fwrite(header, 1, 32, outfile));
}
@@ -103,7 +103,7 @@ static void write_ivf_frame_header(FILE *outfile,
mem_put_le32(header+4, pts&0xFFFFFFFF);
mem_put_le32(header+8, pts >> 32);
- fwrite(header, 1, 12, outfile);
+ if(fwrite(header, 1, 12, outfile));
}
int main(int argc, char **argv) {
diff --git a/examples/encoder_tmpl.txt b/examples/encoder_tmpl.txt
index 87055ca13..3273164da 100644
--- a/examples/encoder_tmpl.txt
+++ b/examples/encoder_tmpl.txt
@@ -61,8 +61,8 @@ if(vpx_codec_encode(&codec, frame_avail? &raw : NULL, frame_cnt,
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS_FRAME
case VPX_CODEC_CX_FRAME_PKT:
write_ivf_frame_header(outfile, pkt);
- fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
- outfile);
+ if(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+ outfile));
break;
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS_FRAME
diff --git a/examples/includes/HTML-Toc-0.91/.gitattributes b/examples/includes/HTML-Toc-0.91/.gitattributes
deleted file mode 100644
index aecf25037..000000000
--- a/examples/includes/HTML-Toc-0.91/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-* -crlf
diff --git a/ivfdec.c b/ivfdec.c
deleted file mode 100644
index 3919d6bb2..000000000
--- a/ivfdec.c
+++ /dev/null
@@ -1,640 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This is a simple program that reads ivf files and decodes them
- * using the new interface. Decoded frames are output as YV12 raw.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx_config.h"
-#include "vpx/vpx_decoder.h"
-#include "vpx_ports/vpx_timer.h"
-#if CONFIG_VP8_DECODER
-#include "vpx/vp8dx.h"
-#endif
-#if CONFIG_MD5
-#include "md5_utils.h"
-#endif
-
-static const char *exec_name;
-
-static const struct
-{
- char const *name;
- const vpx_codec_iface_t *iface;
- unsigned int fourcc;
- unsigned int fourcc_mask;
-} ifaces[] =
-{
-#if CONFIG_VP8_DECODER
- {"vp8", &vpx_codec_vp8_dx_algo, 0x00385056, 0x00FFFFFF},
-#endif
-};
-
-#include "args.h"
-static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1,
- "Codec to use");
-static const arg_def_t prefixarg = ARG_DEF("p", "prefix", 1,
- "Prefix to use when saving frames");
-static const arg_def_t use_yv12 = ARG_DEF(NULL, "yv12", 0,
- "Output file is YV12 ");
-static const arg_def_t use_i420 = ARG_DEF(NULL, "i420", 0,
- "Output file is I420 (default)");
-static const arg_def_t flipuvarg = ARG_DEF(NULL, "flipuv", 0,
- "Synonym for --yv12");
-static const arg_def_t noblitarg = ARG_DEF(NULL, "noblit", 0,
- "Don't process the decoded frames");
-static const arg_def_t progressarg = ARG_DEF(NULL, "progress", 0,
- "Show progress after each frame decodes");
-static const arg_def_t limitarg = ARG_DEF(NULL, "limit", 1,
- "Stop decoding after n frames");
-static const arg_def_t postprocarg = ARG_DEF(NULL, "postproc", 0,
- "Postprocess decoded frames");
-static const arg_def_t summaryarg = ARG_DEF(NULL, "summary", 0,
- "Show timing summary");
-static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
- "Output raw yv12 file instead of images");
-static const arg_def_t usey4marg = ARG_DEF("y", "y4m", 0,
- "Output file is YUV4MPEG2");
-static const arg_def_t threadsarg = ARG_DEF("t", "threads", 1,
- "Max threads to use");
-static const arg_def_t quietarg = ARG_DEF("q", "quiet", 0,
- "Suppress version string");
-
-#if CONFIG_MD5
-static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
- "Compute the MD5 sum of the decoded frame");
-#endif
-static const arg_def_t *all_args[] =
-{
- &codecarg, &prefixarg, &use_yv12, &use_i420, &flipuvarg, &noblitarg,
- &progressarg, &limitarg, &postprocarg, &summaryarg, &outputfile,
- &usey4marg, &threadsarg, &quietarg,
-#if CONFIG_MD5
- &md5arg,
-#endif
- NULL
-};
-
-#if CONFIG_VP8_DECODER
-static const arg_def_t addnoise_level = ARG_DEF(NULL, "noise-level", 1,
- "Enable VP8 postproc add noise");
-static const arg_def_t deblock = ARG_DEF(NULL, "deblock", 0,
- "Enable VP8 deblocking");
-static const arg_def_t demacroblock_level = ARG_DEF(NULL, "demacroblock-level", 1,
- "Enable VP8 demacroblocking, w/ level");
-static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1,
- "Enable VP8 visible debug info");
-
-
-static const arg_def_t *vp8_pp_args[] =
-{
- &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
- NULL
-};
-#endif
-
-static void usage_exit()
-{
- int i;
-
- fprintf(stderr, "Usage: %s <options> filename\n\n"
- "Options:\n", exec_name);
- arg_show_usage(stderr, all_args);
-#if CONFIG_VP8_DECODER
- fprintf(stderr, "\nvp8 Postprocessing Options:\n");
- arg_show_usage(stderr, vp8_pp_args);
-#endif
- fprintf(stderr, "\nIncluded decoders:\n\n");
-
- for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
- fprintf(stderr, " %-6s - %s\n",
- ifaces[i].name,
- vpx_codec_iface_name(ifaces[i].iface));
-
- exit(EXIT_FAILURE);
-}
-
-void die(const char *fmt, ...)
-{
- va_list ap;
- va_start(ap, fmt);
- vfprintf(stderr, fmt, ap);
- fprintf(stderr, "\n");
- usage_exit();
-}
-
-static unsigned int mem_get_le16(const void *vmem)
-{
- unsigned int val;
- const unsigned char *mem = (const unsigned char *)vmem;
-
- val = mem[1] << 8;
- val |= mem[0];
- return val;
-}
-
-static unsigned int mem_get_le32(const void *vmem)
-{
- unsigned int val;
- const unsigned char *mem = (const unsigned char *)vmem;
-
- val = mem[3] << 24;
- val |= mem[2] << 16;
- val |= mem[1] << 8;
- val |= mem[0];
- return val;
-}
-
-#define IVF_FRAME_HDR_SZ (sizeof(uint32_t) + sizeof(uint64_t))
-#define RAW_FRAME_HDR_SZ (sizeof(uint32_t))
-static int read_frame(FILE *infile,
- uint8_t **buf,
- uint32_t *buf_sz,
- uint32_t *buf_alloc_sz,
- int is_ivf)
-{
- char raw_hdr[IVF_FRAME_HDR_SZ];
- uint32_t new_buf_sz;
-
- /* For both the raw and ivf formats, the frame size is the first 4 bytes
- * of the frame header. We just need to special case on the header
- * size.
- */
- if (fread(raw_hdr, is_ivf ? IVF_FRAME_HDR_SZ : RAW_FRAME_HDR_SZ, 1,
- infile) != 1)
- {
- if (!feof(infile))
- fprintf(stderr, "Failed to read frame size\n");
-
- new_buf_sz = 0;
- }
- else
- {
- new_buf_sz = mem_get_le32(raw_hdr);
-
- if (new_buf_sz > 256 * 1024 * 1024)
- {
- fprintf(stderr, "Error: Read invalid frame size (%u)\n",
- new_buf_sz);
- new_buf_sz = 0;
- }
-
- if (!is_ivf && new_buf_sz > 256 * 1024)
- fprintf(stderr, "Warning: Read invalid frame size (%u)"
- " - not a raw file?\n", new_buf_sz);
-
- if (new_buf_sz > *buf_alloc_sz)
- {
- uint8_t *new_buf = realloc(*buf, 2 * new_buf_sz);
-
- if (new_buf)
- {
- *buf = new_buf;
- *buf_alloc_sz = 2 * new_buf_sz;
- }
- else
- {
- fprintf(stderr, "Failed to allocate compressed data buffer\n");
- new_buf_sz = 0;
- }
- }
- }
-
- *buf_sz = new_buf_sz;
-
- if (*buf_sz)
- {
- if (fread(*buf, 1, *buf_sz, infile) != *buf_sz)
- {
- fprintf(stderr, "Failed to read full frame\n");
- return 1;
- }
-
- return 0;
- }
-
- return 1;
-}
-
-void *out_open(const char *out_fn, int do_md5)
-{
- void *out = NULL;
-
- if (do_md5)
- {
-#if CONFIG_MD5
- MD5Context *md5_ctx = out = malloc(sizeof(MD5Context));
- (void)out_fn;
- MD5Init(md5_ctx);
-#endif
- }
- else
- {
- FILE *outfile = out = strcmp("-", out_fn) ? fopen(out_fn, "wb") : stdout;
-
- if (!outfile)
- {
- fprintf(stderr, "Failed to output file");
- exit(EXIT_FAILURE);
- }
- }
-
- return out;
-}
-
-void out_put(void *out, const uint8_t *buf, unsigned int len, int do_md5)
-{
- if (do_md5)
- {
-#if CONFIG_MD5
- MD5Update(out, buf, len);
-#endif
- }
- else
- {
- fwrite(buf, 1, len, out);
- }
-}
-
-void out_close(void *out, const char *out_fn, int do_md5)
-{
- if (do_md5)
- {
-#if CONFIG_MD5
- uint8_t md5[16];
- int i;
-
- MD5Final(md5, out);
- free(out);
-
- for (i = 0; i < 16; i++)
- printf("%02x", md5[i]);
-
- printf(" %s\n", out_fn);
-#endif
- }
- else
- {
- fclose(out);
- }
-}
-
-unsigned int file_is_ivf(FILE *infile,
- unsigned int *fourcc,
- unsigned int *width,
- unsigned int *height,
- unsigned int *timebase_num,
- unsigned int *timebase_den)
-{
- char raw_hdr[32];
- int is_ivf = 0;
-
- if (fread(raw_hdr, 1, 32, infile) == 32)
- {
- if (raw_hdr[0] == 'D' && raw_hdr[1] == 'K'
- && raw_hdr[2] == 'I' && raw_hdr[3] == 'F')
- {
- is_ivf = 1;
-
- if (mem_get_le16(raw_hdr + 4) != 0)
- fprintf(stderr, "Error: Unrecognized IVF version! This file may not"
- " decode properly.");
-
- *fourcc = mem_get_le32(raw_hdr + 8);
- *width = mem_get_le16(raw_hdr + 12);
- *height = mem_get_le16(raw_hdr + 14);
- *timebase_den = mem_get_le32(raw_hdr + 16);
- *timebase_num = mem_get_le32(raw_hdr + 20);
- }
- }
-
- if (!is_ivf)
- rewind(infile);
-
- return is_ivf;
-}
-
-int main(int argc, const char **argv_)
-{
- vpx_codec_ctx_t decoder;
- char *prefix = NULL, *fn = NULL;
- int i;
- uint8_t *buf = NULL;
- uint32_t buf_sz = 0, buf_alloc_sz = 0;
- FILE *infile;
- int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0, do_md5 = 0, progress = 0;
- int stop_after = 0, postproc = 0, summary = 0, quiet = 0;
- vpx_codec_iface_t *iface = NULL;
- unsigned int is_ivf, fourcc;
- unsigned long dx_time = 0;
- struct arg arg;
- char **argv, **argi, **argj;
- const char *fn2 = 0;
- int use_y4m = 0;
- unsigned int width;
- unsigned int height;
- unsigned int timebase_num;
- unsigned int timebase_den;
- void *out = NULL;
- vpx_codec_dec_cfg_t cfg = {0};
-#if CONFIG_VP8_DECODER
- vp8_postproc_cfg_t vp8_pp_cfg = {0};
-#endif
-
- /* Parse command line */
- exec_name = argv_[0];
- argv = argv_dup(argc - 1, argv_ + 1);
-
- for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step)
- {
- memset(&arg, 0, sizeof(arg));
- arg.argv_step = 1;
-
- if (arg_match(&arg, &codecarg, argi))
- {
- int j, k = -1;
-
- for (j = 0; j < sizeof(ifaces) / sizeof(ifaces[0]); j++)
- if (!strcmp(ifaces[j].name, arg.val))
- k = j;
-
- if (k >= 0)
- iface = ifaces[k].iface;
- else
- die("Error: Unrecognized argument (%s) to --codec\n",
- arg.val);
- }
- else if (arg_match(&arg, &outputfile, argi))
- fn2 = arg.val;
- else if (arg_match(&arg, &usey4marg, argi))
- use_y4m = 1;
- else if (arg_match(&arg, &prefixarg, argi))
- prefix = strdup(arg.val);
- else if (arg_match(&arg, &use_yv12, argi))
- flipuv = 1;
- else if (arg_match(&arg, &use_i420, argi))
- flipuv = 0;
- else if (arg_match(&arg, &flipuvarg, argi))
- flipuv = 1;
- else if (arg_match(&arg, &noblitarg, argi))
- noblit = 1;
- else if (arg_match(&arg, &progressarg, argi))
- progress = 1;
- else if (arg_match(&arg, &limitarg, argi))
- stop_after = arg_parse_uint(&arg);
- else if (arg_match(&arg, &postprocarg, argi))
- postproc = 1;
- else if (arg_match(&arg, &md5arg, argi))
- do_md5 = 1;
- else if (arg_match(&arg, &summaryarg, argi))
- summary = 1;
- else if (arg_match(&arg, &threadsarg, argi))
- cfg.threads = arg_parse_uint(&arg);
- else if (arg_match(&arg, &quietarg, argi))
- quiet = 1;
-
-#if CONFIG_VP8_DECODER
- else if (arg_match(&arg, &addnoise_level, argi))
- {
- postproc = 1;
- vp8_pp_cfg.post_proc_flag |= VP8_ADDNOISE;
- vp8_pp_cfg.noise_level = arg_parse_uint(&arg);
- }
- else if (arg_match(&arg, &demacroblock_level, argi))
- {
- postproc = 1;
- vp8_pp_cfg.post_proc_flag |= VP8_DEMACROBLOCK;
- vp8_pp_cfg.deblocking_level = arg_parse_uint(&arg);
- }
- else if (arg_match(&arg, &deblock, argi))
- {
- postproc = 1;
- vp8_pp_cfg.post_proc_flag |= VP8_DEBLOCK;
- }
- else if (arg_match(&arg, &pp_debug_info, argi))
- {
- unsigned int level = arg_parse_uint(&arg);
-
- postproc = 1;
- vp8_pp_cfg.post_proc_flag &= ~0x7;
-
- if (level)
- vp8_pp_cfg.post_proc_flag |= 8 << (level - 1);
- }
-
-#endif
- else
- argj++;
- }
-
- /* Check for unrecognized options */
- for (argi = argv; *argi; argi++)
- if (argi[0][0] == '-' && strlen(argi[0]) > 1)
- die("Error: Unrecognized option %s\n", *argi);
-
- /* Handle non-option arguments */
- fn = argv[0];
-
- if (!fn)
- usage_exit();
-
- if (!prefix)
- prefix = strdup("img");
-
- /* Open file */
- infile = strcmp(fn, "-") ? fopen(fn, "rb") : stdin;
-
- if (!infile)
- {
- fprintf(stderr, "Failed to open file");
- return EXIT_FAILURE;
- }
-
- if (fn2)
- out = out_open(fn2, do_md5);
-
- is_ivf = file_is_ivf(infile, &fourcc, &width, &height,
- &timebase_num, &timebase_den);
-
- if (is_ivf)
- {
- if (use_y4m)
- {
- char buffer[128];
- if (!fn2)
- {
- fprintf(stderr, "YUV4MPEG2 output only supported with -o.\n");
- return EXIT_FAILURE;
- }
- /*Correct for the factor of 2 applied to the timebase in the
- encoder.*/
- if(timebase_den&1)timebase_num<<=1;
- else timebase_den>>=1;
- /*Note: We can't output an aspect ratio here because IVF doesn't
- store one, and neither does VP8.
- That will have to wait until these tools support WebM natively.*/
- sprintf(buffer, "YUV4MPEG2 C%s W%u H%u F%u:%u I%c\n",
- "420jpeg", width, height, timebase_den, timebase_num, 'p');
- out_put(out, (unsigned char *)buffer, strlen(buffer), do_md5);
- }
-
- /* Try to determine the codec from the fourcc. */
- for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
- if ((fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc)
- {
- vpx_codec_iface_t *ivf_iface = ifaces[i].iface;
-
- if (iface && iface != ivf_iface)
- fprintf(stderr, "Notice -- IVF header indicates codec: %s\n",
- ifaces[i].name);
- else
- iface = ivf_iface;
-
- break;
- }
- }
- else if(use_y4m)
- {
- fprintf(stderr, "YUV4MPEG2 output only supported from IVF input.\n");
- return EXIT_FAILURE;
- }
-
- if (vpx_codec_dec_init(&decoder, iface ? iface : ifaces[0].iface, &cfg,
- postproc ? VPX_CODEC_USE_POSTPROC : 0))
- {
- fprintf(stderr, "Failed to initialize decoder: %s\n", vpx_codec_error(&decoder));
- return EXIT_FAILURE;
- }
-
- if (!quiet)
- fprintf(stderr, "%s\n", decoder.name);
-
-#if CONFIG_VP8_DECODER
-
- if (vp8_pp_cfg.post_proc_flag
- && vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg))
- {
- fprintf(stderr, "Failed to configure postproc: %s\n", vpx_codec_error(&decoder));
- return EXIT_FAILURE;
- }
-
-#endif
-
- /* Decode file */
- while (!read_frame(infile, &buf, &buf_sz, &buf_alloc_sz, is_ivf))
- {
- vpx_codec_iter_t iter = NULL;
- vpx_image_t *img;
- struct vpx_usec_timer timer;
-
- vpx_usec_timer_start(&timer);
-
- if (vpx_codec_decode(&decoder, buf, buf_sz, NULL, 0))
- {
- const char *detail = vpx_codec_error_detail(&decoder);
- fprintf(stderr, "Failed to decode frame: %s\n", vpx_codec_error(&decoder));
-
- if (detail)
- fprintf(stderr, " Additional information: %s\n", detail);
-
- goto fail;
- }
-
- vpx_usec_timer_mark(&timer);
- dx_time += vpx_usec_timer_elapsed(&timer);
-
- ++frame_in;
-
- if (progress)
- fprintf(stderr, "decoded frame %d.\n", frame_in);
-
- if ((img = vpx_codec_get_frame(&decoder, &iter)))
- ++frame_out;
-
- if (!noblit)
- {
- if (img)
- {
- unsigned int y;
- char out_fn[128+24];
- uint8_t *buf;
- const char *sfx = flipuv ? "yv12" : "i420";
-
- if (!fn2)
- {
- sprintf(out_fn, "%s-%dx%d-%04d.%s",
- prefix, img->d_w, img->d_h, frame_in, sfx);
- out = out_open(out_fn, do_md5);
- }
- else if(use_y4m)
- out_put(out, (unsigned char *)"FRAME\n", 6, do_md5);
-
- buf = img->planes[VPX_PLANE_Y];
-
- for (y = 0; y < img->d_h; y++)
- {
- out_put(out, buf, img->d_w, do_md5);
- buf += img->stride[VPX_PLANE_Y];
- }
-
- buf = img->planes[flipuv?VPX_PLANE_V:VPX_PLANE_U];
-
- for (y = 0; y < (1 + img->d_h) / 2; y++)
- {
- out_put(out, buf, (1 + img->d_w) / 2, do_md5);
- buf += img->stride[VPX_PLANE_U];
- }
-
- buf = img->planes[flipuv?VPX_PLANE_U:VPX_PLANE_V];
-
- for (y = 0; y < (1 + img->d_h) / 2; y++)
- {
- out_put(out, buf, (1 + img->d_w) / 2, do_md5);
- buf += img->stride[VPX_PLANE_V];
- }
-
- if (!fn2)
- out_close(out, out_fn, do_md5);
- }
- }
-
- if (stop_after && frame_in >= stop_after)
- break;
- }
-
- if (summary)
- {
- fprintf(stderr, "%d decoded frames/%d showed frames in %lu us (%.2f fps)\n",
- frame_in, frame_out, dx_time, (float)frame_out * 1000000.0 / (float)dx_time);
- }
-
-fail:
-
- if (vpx_codec_destroy(&decoder))
- {
- fprintf(stderr, "Failed to destroy decoder: %s\n", vpx_codec_error(&decoder));
- return EXIT_FAILURE;
- }
-
- if (fn2)
- out_close(out, fn2, do_md5);
-
- free(buf);
- fclose(infile);
- free(prefix);
- free(argv);
-
- return EXIT_SUCCESS;
-}
diff --git a/libmkv/EbmlBufferWriter.c b/libmkv/EbmlBufferWriter.c
new file mode 100644
index 000000000..d9b04a81a
--- /dev/null
+++ b/libmkv/EbmlBufferWriter.c
@@ -0,0 +1,60 @@
+//#include <strmif.h>
+#include "EbmlBufferWriter.h"
+#include "EbmlWriter.h"
+//#include <cassert>
+//#include <limits>
+//#include <malloc.h> //_alloca
+#include <stdlib.h>
+#include <wchar.h>
+#include <string.h>
+
+void Ebml_Write(EbmlGlobal *glob, const void *buffer_in, unsigned long len)
+{
+ unsigned char *src = glob->buf;
+ src += glob->offset;
+ memcpy(src, buffer_in, len);
+ glob->offset += len;
+}
+
+static void _Serialize(EbmlGlobal *glob, const unsigned char *p, const unsigned char *q)
+{
+ while (q != p)
+ {
+ --q;
+
+ unsigned long cbWritten;
+ memcpy(&(glob->buf[glob->offset]), q, 1);
+ glob->offset ++;
+ }
+}
+
+void Ebml_Serialize(EbmlGlobal *glob, const void *buffer_in, unsigned long len)
+{
+ //assert(buf);
+
+ const unsigned char *const p = (const unsigned char *)(buffer_in);
+ const unsigned char *const q = p + len;
+
+ _Serialize(glob, p, q);
+}
+
+
+void Ebml_StartSubElement(EbmlGlobal *glob, EbmlLoc *ebmlLoc, unsigned long class_id)
+{
+ Ebml_WriteID(glob, class_id);
+ ebmlLoc->offset = glob->offset;
+ //todo this is always taking 8 bytes, this may need later optimization
+ unsigned long long unknownLen = 0x01FFFFFFFFFFFFFFLLU;
+ Ebml_Serialize(glob, (void *)&unknownLen, 8); //this is a key that says lenght unknown
+}
+
+void Ebml_EndSubElement(EbmlGlobal *glob, EbmlLoc *ebmlLoc)
+{
+ unsigned long long size = glob->offset - ebmlLoc->offset - 8;
+ unsigned long long curOffset = glob->offset;
+ glob->offset = ebmlLoc->offset;
+ size |= 0x0100000000000000LLU;
+ Ebml_Serialize(glob, &size, 8);
+ glob->offset = curOffset;
+}
+
diff --git a/libmkv/EbmlBufferWriter.h b/libmkv/EbmlBufferWriter.h
new file mode 100644
index 000000000..ba0a9b3ab
--- /dev/null
+++ b/libmkv/EbmlBufferWriter.h
@@ -0,0 +1,21 @@
+#ifndef EBMLBUFFERWRITER_HPP
+#define EBMLBUFFERWRITER_HPP
+
+typedef struct
+{
+ unsigned long long offset;
+} EbmlLoc;
+
+typedef struct
+{
+ unsigned char *buf;
+ unsigned int length;
+ unsigned int offset;
+} EbmlGlobal;
+
+
+void Ebml_StartSubElement(EbmlGlobal *glob, EbmlLoc *ebmlLoc, unsigned long class_id);
+void Ebml_EndSubElement(EbmlGlobal *glob, EbmlLoc *ebmlLoc);
+
+
+#endif
diff --git a/libmkv/EbmlIDs.h b/libmkv/EbmlIDs.h
new file mode 100644
index 000000000..429747063
--- /dev/null
+++ b/libmkv/EbmlIDs.h
@@ -0,0 +1,231 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+
+#ifndef MKV_DEFS_HPP
+#define MKV_DEFS_HPP 1
+
+//Commenting out values not available in webm, but available in matroska
+
+enum mkv
+{
+ EBML = 0x1A45DFA3,
+ EBMLVersion = 0x4286,
+ EBMLReadVersion = 0x42F7,
+ EBMLMaxIDLength = 0x42F2,
+ EBMLMaxSizeLength = 0x42F3,
+ DocType = 0x4282,
+ DocTypeVersion = 0x4287,
+ DocTypeReadVersion = 0x4285,
+// CRC_32 = 0xBF,
+ Void = 0xEC,
+ SignatureSlot = 0x1B538667,
+ SignatureAlgo = 0x7E8A,
+ SignatureHash = 0x7E9A,
+ SignaturePublicKey = 0x7EA5,
+ Signature = 0x7EB5,
+ SignatureElements = 0x7E5B,
+ SignatureElementList = 0x7E7B,
+ SignedElement = 0x6532,
+ //segment
+ Segment = 0x18538067,
+ //Meta Seek Information
+ SeekHead = 0x114D9B74,
+ Seek = 0x4DBB,
+ SeekID = 0x53AB,
+ SeekPosition = 0x53AC,
+ //Segment Information
+ Info = 0x1549A966,
+// SegmentUID = 0x73A4,
+// SegmentFilename = 0x7384,
+// PrevUID = 0x3CB923,
+// PrevFilename = 0x3C83AB,
+// NextUID = 0x3EB923,
+// NextFilename = 0x3E83BB,
+// SegmentFamily = 0x4444,
+// ChapterTranslate = 0x6924,
+// ChapterTranslateEditionUID = 0x69FC,
+// ChapterTranslateCodec = 0x69BF,
+// ChapterTranslateID = 0x69A5,
+ TimecodeScale = 0x2AD7B1,
+ Segment_Duration = 0x4489,
+ DateUTC = 0x4461,
+// Title = 0x7BA9,
+ MuxingApp = 0x4D80,
+ WritingApp = 0x5741,
+ //Cluster
+ Cluster = 0x1F43B675,
+ Timecode = 0xE7,
+// SilentTracks = 0x5854,
+// SilentTrackNumber = 0x58D7,
+// Position = 0xA7,
+ PrevSize = 0xAB,
+ BlockGroup = 0xA0,
+ Block = 0xA1,
+// BlockVirtual = 0xA2,
+// BlockAdditions = 0x75A1,
+// BlockMore = 0xA6,
+// BlockAddID = 0xEE,
+// BlockAdditional = 0xA5,
+ BlockDuration = 0x9B,
+// ReferencePriority = 0xFA,
+ ReferenceBlock = 0xFB,
+// ReferenceVirtual = 0xFD,
+// CodecState = 0xA4,
+// Slices = 0x8E,
+// TimeSlice = 0xE8,
+ LaceNumber = 0xCC,
+// FrameNumber = 0xCD,
+// BlockAdditionID = 0xCB,
+// MkvDelay = 0xCE,
+// Cluster_Duration = 0xCF,
+ SimpleBlock = 0xA3,
+// EncryptedBlock = 0xAF,
+ //Track
+ Tracks = 0x1654AE6B,
+ TrackEntry = 0xAE,
+ TrackNumber = 0xD7,
+ TrackUID = 0x73C5,
+ TrackType = 0x83,
+ FlagEnabled = 0xB9,
+ FlagDefault = 0x88,
+ FlagForced = 0x55AA,
+ FlagLacing = 0x9C,
+// MinCache = 0x6DE7,
+// MaxCache = 0x6DF8,
+ DefaultDuration = 0x23E383,
+// TrackTimecodeScale = 0x23314F,
+// TrackOffset = 0x537F,
+// MaxBlockAdditionID = 0x55EE,
+ Name = 0x536E,
+ Language = 0x22B59C,
+ CodecID = 0x86,
+ CodecPrivate = 0x63A2,
+ CodecName = 0x258688,
+// AttachmentLink = 0x7446,
+// CodecSettings = 0x3A9697,
+// CodecInfoURL = 0x3B4040,
+// CodecDownloadURL = 0x26B240,
+// CodecDecodeAll = 0xAA,
+// TrackOverlay = 0x6FAB,
+// TrackTranslate = 0x6624,
+// TrackTranslateEditionUID = 0x66FC,
+// TrackTranslateCodec = 0x66BF,
+// TrackTranslateTrackID = 0x66A5,
+ //video
+ Video = 0xE0,
+ FlagInterlaced = 0x9A,
+// StereoMode = 0x53B8,
+ PixelWidth = 0xB0,
+ PixelHeight = 0xBA,
+ PixelCropBottom = 0x54AA,
+ PixelCropTop = 0x54BB,
+ PixelCropLeft = 0x54CC,
+ PixelCropRight = 0x54DD,
+ DisplayWidth = 0x54B0,
+ DisplayHeight = 0x54BA,
+ DisplayUnit = 0x54B2,
+ AspectRatioType = 0x54B3,
+// ColourSpace = 0x2EB524,
+// GammaValue = 0x2FB523,
+ FrameRate = 0x2383E3,
+ //end video
+ //audio
+ Audio = 0xE1,
+ SamplingFrequency = 0xB5,
+ OutputSamplingFrequency = 0x78B5,
+ Channels = 0x9F,
+// ChannelPositions = 0x7D7B,
+ BitDepth = 0x6264,
+ //end audio
+ //content encoding
+// ContentEncodings = 0x6d80,
+// ContentEncoding = 0x6240,
+// ContentEncodingOrder = 0x5031,
+// ContentEncodingScope = 0x5032,
+// ContentEncodingType = 0x5033,
+// ContentCompression = 0x5034,
+// ContentCompAlgo = 0x4254,
+// ContentCompSettings = 0x4255,
+// ContentEncryption = 0x5035,
+// ContentEncAlgo = 0x47e1,
+// ContentEncKeyID = 0x47e2,
+// ContentSignature = 0x47e3,
+// ContentSigKeyID = 0x47e4,
+// ContentSigAlgo = 0x47e5,
+// ContentSigHashAlgo = 0x47e6,
+ //end content encoding
+ //Cueing Data
+ Cues = 0x1C53BB6B,
+ CuePoint = 0xBB,
+ CueTime = 0xB3,
+ CueTrackPositions = 0xB7,
+ CueTrack = 0xF7,
+ CueClusterPosition = 0xF1,
+ CueBlockNumber = 0x5378,
+// CueCodecState = 0xEA,
+// CueReference = 0xDB,
+// CueRefTime = 0x96,
+// CueRefCluster = 0x97,
+// CueRefNumber = 0x535F,
+// CueRefCodecState = 0xEB,
+ //Attachment
+// Attachments = 0x1941A469,
+// AttachedFile = 0x61A7,
+// FileDescription = 0x467E,
+// FileName = 0x466E,
+// FileMimeType = 0x4660,
+// FileData = 0x465C,
+// FileUID = 0x46AE,
+// FileReferral = 0x4675,
+ //Chapters
+// Chapters = 0x1043A770,
+// EditionEntry = 0x45B9,
+// EditionUID = 0x45BC,
+// EditionFlagHidden = 0x45BD,
+// EditionFlagDefault = 0x45DB,
+// EditionFlagOrdered = 0x45DD,
+// ChapterAtom = 0xB6,
+// ChapterUID = 0x73C4,
+// ChapterTimeStart = 0x91,
+// ChapterTimeEnd = 0x92,
+// ChapterFlagHidden = 0x98,
+// ChapterFlagEnabled = 0x4598,
+// ChapterSegmentUID = 0x6E67,
+// ChapterSegmentEditionUID = 0x6EBC,
+// ChapterPhysicalEquiv = 0x63C3,
+// ChapterTrack = 0x8F,
+// ChapterTrackNumber = 0x89,
+// ChapterDisplay = 0x80,
+// ChapString = 0x85,
+// ChapLanguage = 0x437C,
+// ChapCountry = 0x437E,
+// ChapProcess = 0x6944,
+// ChapProcessCodecID = 0x6955,
+// ChapProcessPrivate = 0x450D,
+// ChapProcessCommand = 0x6911,
+// ChapProcessTime = 0x6922,
+// ChapProcessData = 0x6933,
+ //Tagging
+// Tags = 0x1254C367,
+// Tag = 0x7373,
+// Targets = 0x63C0,
+// TargetTypeValue = 0x68CA,
+// TargetType = 0x63CA,
+// Tagging_TrackUID = 0x63C5,
+// Tagging_EditionUID = 0x63C9,
+// Tagging_ChapterUID = 0x63C4,
+// AttachmentUID = 0x63C6,
+// SimpleTag = 0x67C8,
+// TagName = 0x45A3,
+// TagLanguage = 0x447A,
+// TagDefault = 0x4484,
+// TagString = 0x4487,
+// TagBinary = 0x4485,
+};
+#endif
diff --git a/libmkv/EbmlWriter.c b/libmkv/EbmlWriter.c
new file mode 100644
index 000000000..9d564c177
--- /dev/null
+++ b/libmkv/EbmlWriter.c
@@ -0,0 +1,166 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+
+#include "EbmlWriter.h"
+#include <stdlib.h>
+#include <wchar.h>
+#include <string.h>
+#if defined(_MSC_VER)
+#define LITERALU64(n) n
+#else
+#define LITERALU64(n) n##LLU
+#endif
+
+void Ebml_WriteLen(EbmlGlobal *glob, long long val)
+{
+ //TODO check and make sure we are not > than 0x0100000000000000LLU
+ unsigned char size = 8; //size in bytes to output
+ unsigned long long minVal = LITERALU64(0x00000000000000ff); //mask to compare for byte size
+
+ for (size = 1; size < 8; size ++)
+ {
+ if (val < minVal)
+ break;
+
+ minVal = (minVal << 7);
+ }
+
+ val |= (LITERALU64(0x000000000000080) << ((size - 1) * 7));
+
+ Ebml_Serialize(glob, (void *) &val, size);
+}
+
+void Ebml_WriteString(EbmlGlobal *glob, const char *str)
+{
+ const size_t size_ = strlen(str);
+ const unsigned long long size = size_;
+ Ebml_WriteLen(glob, size);
+ //TODO: it's not clear from the spec whether the nul terminator
+ //should be serialized too. For now we omit the null terminator.
+ Ebml_Write(glob, str, size);
+}
+
+void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr)
+{
+ const size_t strlen = wcslen(wstr);
+
+ //TODO: it's not clear from the spec whether the nul terminator
+ //should be serialized too. For now we include it.
+ const unsigned long long size = strlen;
+
+ Ebml_WriteLen(glob, size);
+ Ebml_Write(glob, wstr, size);
+}
+
+void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id)
+{
+ if (class_id >= 0x01000000)
+ Ebml_Serialize(glob, (void *)&class_id, 4);
+ else if (class_id >= 0x00010000)
+ Ebml_Serialize(glob, (void *)&class_id, 3);
+ else if (class_id >= 0x00000100)
+ Ebml_Serialize(glob, (void *)&class_id, 2);
+ else
+ Ebml_Serialize(glob, (void *)&class_id, 1);
+}
+void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui)
+{
+ unsigned char sizeSerialized = 8 | 0x80;
+ Ebml_WriteID(glob, class_id);
+ Ebml_Serialize(glob, &sizeSerialized, 1);
+ Ebml_Serialize(glob, &ui, 8);
+}
+
+void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui)
+{
+ unsigned char size = 8; //size in bytes to output
+ unsigned char sizeSerialized = 0;
+ unsigned long minVal;
+
+ Ebml_WriteID(glob, class_id);
+ minVal = 0x7fLU; //mask to compare for byte size
+
+ for (size = 1; size < 4; size ++)
+ {
+ if (ui < minVal)
+ {
+ break;
+ }
+
+ minVal <<= 7;
+ }
+
+ sizeSerialized = 0x80 | size;
+ Ebml_Serialize(glob, &sizeSerialized, 1);
+ Ebml_Serialize(glob, &ui, size);
+}
+//TODO: perhaps this is a poor name for this id serializer helper function
+void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin)
+{
+ int size;
+ for (size=4; size > 1; size--)
+ {
+ if (bin & 0x000000ff << ((size-1) * 8))
+ break;
+ }
+ Ebml_WriteID(glob, class_id);
+ Ebml_WriteLen(glob, size);
+ Ebml_WriteID(glob, bin);
+}
+
+void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d)
+{
+ unsigned char len = 0x88;
+
+ Ebml_WriteID(glob, class_id);
+ Ebml_Serialize(glob, &len, 1);
+ Ebml_Serialize(glob, &d, 8);
+}
+
+void Ebml_WriteSigned16(EbmlGlobal *glob, short val)
+{
+ signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8;
+ Ebml_Serialize(glob, &out, 3);
+}
+
+void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s)
+{
+ Ebml_WriteID(glob, class_id);
+ Ebml_WriteString(glob, s);
+}
+
+void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s)
+{
+ Ebml_WriteID(glob, class_id);
+ Ebml_WriteUTF8(glob, s);
+}
+
+void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length)
+{
+ unsigned char size = 4;
+ Ebml_WriteID(glob, class_id);
+ Ebml_WriteLen(glob, data_length);
+ Ebml_Write(glob, data, data_length);
+}
+
+void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize)
+{
+ unsigned char tmp = 0;
+ unsigned long i = 0;
+
+ Ebml_WriteID(glob, 0xEC);
+ Ebml_WriteLen(glob, vSize);
+
+ for (i = 0; i < vSize; i++)
+ {
+ Ebml_Write(glob, &tmp, 1);
+ }
+}
+
+//TODO Serialize Date
diff --git a/libmkv/EbmlWriter.h b/libmkv/EbmlWriter.h
new file mode 100644
index 000000000..8c7fe7c66
--- /dev/null
+++ b/libmkv/EbmlWriter.h
@@ -0,0 +1,38 @@
+#ifndef EBMLWRITER_HPP
+#define EBMLWRITER_HPP
+
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+//note: you must define write and serialize functions as well as your own EBML_GLOBAL
+//These functions MUST be implemented
+#include <stddef.h>
+#include "vpx/vpx_integer.h"
+
+typedef struct EbmlGlobal EbmlGlobal;
+void Ebml_Serialize(EbmlGlobal *glob, const void *, unsigned long);
+void Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
+/////
+
+
+void Ebml_WriteLen(EbmlGlobal *glob, long long val);
+void Ebml_WriteString(EbmlGlobal *glob, const char *str);
+void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr);
+void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id);
+void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui);
+void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
+void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
+void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d);
+//TODO make this more generic to signed
+void Ebml_WriteSigned16(EbmlGlobal *glob, short val);
+void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s);
+void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s);
+void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length);
+void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize);
+//TODO need date function
+#endif
diff --git a/libmkv/Makefile b/libmkv/Makefile
new file mode 100644
index 000000000..b53377b21
--- /dev/null
+++ b/libmkv/Makefile
@@ -0,0 +1,25 @@
+#Variables
+CC=gcc
+LINKER=gcc
+FLAGS=
+
+
+#Build Targets
+EbmlWriter.o: EbmlWriter.c EbmlWriter.h
+ $(CC) $(FLAGS) -c EbmlWriter.c
+
+EbmlBufferWriter.o: EbmlBufferWriter.c EbmlBufferWriter.h
+ $(CC) $(FLAGS) -c EbmlBufferWriter.c
+
+MkvElement.o: MkvElement.c WebMElement.h
+ $(CC) $(FLAGS) -c MkvElement.c
+
+testlibmkv.o: testlibmkv.c
+ $(CC) $(FLAGS) -c testlibmkv.c
+
+testlibmkv: testlibmkv.o MkvElement.o EbmlBufferWriter.o EbmlWriter.o
+ $(LINKER) $(FLAGS) testlibmkv.o MkvElement.o EbmlBufferWriter.o EbmlWriter.o -o testlibmkv
+
+clean:
+ rm -rf *.o testlibmkv
+ \ No newline at end of file
diff --git a/libmkv/WebMElement.c b/libmkv/WebMElement.c
new file mode 100644
index 000000000..25a90249a
--- /dev/null
+++ b/libmkv/WebMElement.c
@@ -0,0 +1,220 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+
+#include "EbmlBufferWriter.h"
+#include "EbmlIDs.h"
+#include "WebMElement.h"
+#include <stdio.h>
+
+#define kVorbisPrivateMaxSize 4000
+
+void writeHeader(EbmlGlobal *glob)
+{
+ EbmlLoc start;
+ Ebml_StartSubElement(glob, &start, EBML);
+ Ebml_SerializeUnsigned(glob, EBMLVersion, 1);
+ Ebml_SerializeUnsigned(glob, EBMLReadVersion, 1); //EBML Read Version
+ Ebml_SerializeUnsigned(glob, EBMLMaxIDLength, 4); //EBML Max ID Length
+ Ebml_SerializeUnsigned(glob, EBMLMaxSizeLength, 8); //EBML Max Size Length
+ Ebml_SerializeString(glob, DocType, "webm"); //Doc Type
+ Ebml_SerializeUnsigned(glob, DocTypeVersion, 2); //Doc Type Version
+ Ebml_SerializeUnsigned(glob, DocTypeReadVersion, 2); //Doc Type Read Version
+ Ebml_EndSubElement(glob, &start);
+}
+
+void writeSimpleBlock(EbmlGlobal *glob, unsigned char trackNumber, short timeCode,
+ int isKeyframe, unsigned char lacingFlag, int discardable,
+ unsigned char *data, unsigned long dataLength)
+{
+ Ebml_WriteID(glob, SimpleBlock);
+ unsigned long blockLength = 4 + dataLength;
+ blockLength |= 0x10000000; //TODO check length < 0x0FFFFFFF
+ Ebml_Serialize(glob, &blockLength, 4);
+ trackNumber |= 0x80; //TODO check track number < 128
+ Ebml_Write(glob, &trackNumber, 1);
+ //Ebml_WriteSigned16(glob, timeCode,2); //this is 3 bytes
+ Ebml_Serialize(glob, &timeCode, 2);
+ unsigned char flags = 0x00 | (isKeyframe ? 0x80 : 0x00) | (lacingFlag << 1) | discardable;
+ Ebml_Write(glob, &flags, 1);
+ Ebml_Write(glob, data, dataLength);
+}
+
+static UInt64 generateTrackID(unsigned int trackNumber)
+{
+ UInt64 t = time(NULL) * trackNumber;
+ UInt64 r = rand();
+ r = r << 32;
+ r += rand();
+ UInt64 rval = t ^ r;
+ return rval;
+}
+
+void writeVideoTrack(EbmlGlobal *glob, unsigned int trackNumber, int flagLacing,
+ char *codecId, unsigned int pixelWidth, unsigned int pixelHeight,
+ double frameRate)
+{
+ EbmlLoc start;
+ Ebml_StartSubElement(glob, &start, TrackEntry);
+ Ebml_SerializeUnsigned(glob, TrackNumber, trackNumber);
+ UInt64 trackID = generateTrackID(trackNumber);
+ Ebml_SerializeUnsigned(glob, TrackUID, trackID);
+ Ebml_SerializeString(glob, CodecName, "VP8"); //TODO shouldn't be fixed
+
+ Ebml_SerializeUnsigned(glob, TrackType, 1); //video is always 1
+ Ebml_SerializeString(glob, CodecID, codecId);
+ {
+ EbmlLoc videoStart;
+ Ebml_StartSubElement(glob, &videoStart, Video);
+ Ebml_SerializeUnsigned(glob, PixelWidth, pixelWidth);
+ Ebml_SerializeUnsigned(glob, PixelHeight, pixelHeight);
+ Ebml_SerializeFloat(glob, FrameRate, frameRate);
+ Ebml_EndSubElement(glob, &videoStart); //Video
+ }
+ Ebml_EndSubElement(glob, &start); //Track Entry
+}
+void writeAudioTrack(EbmlGlobal *glob, unsigned int trackNumber, int flagLacing,
+ char *codecId, double samplingFrequency, unsigned int channels,
+ unsigned char *private, unsigned long privateSize)
+{
+ EbmlLoc start;
+ Ebml_StartSubElement(glob, &start, TrackEntry);
+ Ebml_SerializeUnsigned(glob, TrackNumber, trackNumber);
+ UInt64 trackID = generateTrackID(trackNumber);
+ Ebml_SerializeUnsigned(glob, TrackUID, trackID);
+ Ebml_SerializeUnsigned(glob, TrackType, 2); //audio is always 2
+ //I am using defaults for these required fields
+ /* Ebml_SerializeUnsigned(glob, FlagEnabled, 1);
+ Ebml_SerializeUnsigned(glob, FlagDefault, 1);
+ Ebml_SerializeUnsigned(glob, FlagForced, 1);
+ Ebml_SerializeUnsigned(glob, FlagLacing, flagLacing);*/
+ Ebml_SerializeString(glob, CodecID, codecId);
+ Ebml_SerializeData(glob, CodecPrivate, private, privateSize);
+
+ Ebml_SerializeString(glob, CodecName, "VORBIS"); //fixed for now
+ {
+ EbmlLoc AudioStart;
+ Ebml_StartSubElement(glob, &AudioStart, Audio);
+ Ebml_SerializeFloat(glob, SamplingFrequency, samplingFrequency);
+ Ebml_SerializeUnsigned(glob, Channels, channels);
+ Ebml_EndSubElement(glob, &AudioStart);
+ }
+ Ebml_EndSubElement(glob, &start);
+}
+void writeSegmentInformation(EbmlGlobal *ebml, EbmlLoc* startInfo, unsigned long timeCodeScale, double duration)
+{
+ Ebml_StartSubElement(ebml, startInfo, Info);
+ Ebml_SerializeUnsigned(ebml, TimecodeScale, timeCodeScale);
+ Ebml_SerializeFloat(ebml, Segment_Duration, duration * 1000.0); //Currently fixed to using milliseconds
+ Ebml_SerializeString(ebml, 0x4D80, "QTmuxingAppLibWebM-0.0.1");
+ Ebml_SerializeString(ebml, 0x5741, "QTwritingAppLibWebM-0.0.1");
+ Ebml_EndSubElement(ebml, startInfo);
+}
+
+/*
+void Mkv_InitializeSegment(Ebml& ebml_out, EbmlLoc& ebmlLoc)
+{
+ Ebml_StartSubElement(ebml_out, ebmlLoc, 0x18538067);
+}
+
+void Mkv_InitializeSeek(Ebml& ebml_out, EbmlLoc& ebmlLoc)
+{
+ Ebml_StartSubElement(ebml_out, ebmlLoc, 0x114d9b74);
+}
+void Mkv_WriteSeekInformation(Ebml& ebml_out, SeekStruct& seekInformation)
+{
+ EbmlLoc ebmlLoc;
+ Ebml_StartSubElement(ebml_out, ebmlLoc, 0x4dbb);
+ Ebml_SerializeString(ebml_out, 0x53ab, seekInformation.SeekID);
+ Ebml_SerializeUnsigned(ebml_out, 0x53ac, seekInformation.SeekPosition);
+ Ebml_EndSubElement(ebml_out, ebmlLoc);
+}
+
+void Mkv_WriteSegmentInformation(Ebml& ebml_out, SegmentInformationStruct& segmentInformation)
+{
+ Ebml_SerializeUnsigned(ebml_out, 0x73a4, segmentInformation.segmentUID);
+ if (segmentInformation.filename != 0)
+ Ebml_SerializeString(ebml_out, 0x7384, segmentInformation.filename);
+ Ebml_SerializeUnsigned(ebml_out, 0x2AD7B1, segmentInformation.TimecodeScale);
+ Ebml_SerializeUnsigned(ebml_out, 0x4489, segmentInformation.Duration);
+ //TODO date
+ Ebml_SerializeWString(ebml_out, 0x4D80, L"MKVMUX");
+ Ebml_SerializeWString(ebml_out, 0x5741, segmentInformation.WritingApp);
+}
+
+void Mkv_InitializeTrack(Ebml& ebml_out, EbmlLoc& ebmlLoc)
+{
+ Ebml_StartSubElement(ebml_out, ebmlLoc, 0x1654AE6B);
+}
+
+static void Mkv_WriteGenericTrackData(Ebml& ebml_out, TrackStruct& track)
+{
+ Ebml_SerializeUnsigned(ebml_out, 0xD7, track.TrackNumber);
+ Ebml_SerializeUnsigned(ebml_out, 0x73C5, track.TrackUID);
+ Ebml_SerializeUnsigned(ebml_out, 0x83, track.TrackType);
+ Ebml_SerializeUnsigned(ebml_out, 0xB9, track.FlagEnabled ? 1 :0);
+ Ebml_SerializeUnsigned(ebml_out, 0x88, track.FlagDefault ? 1 :0);
+ Ebml_SerializeUnsigned(ebml_out, 0x55AA, track.FlagForced ? 1 :0);
+ if (track.Language != 0)
+ Ebml_SerializeString(ebml_out, 0x22B59C, track.Language);
+ if (track.CodecID != 0)
+ Ebml_SerializeString(ebml_out, 0x86, track.CodecID);
+ if (track.CodecPrivate != 0)
+ Ebml_SerializeData(ebml_out, 0x63A2, track.CodecPrivate, track.CodecPrivateLength);
+ if (track.CodecName != 0)
+ Ebml_SerializeWString(ebml_out, 0x258688, track.CodecName);
+}
+
+void Mkv_WriteVideoTrack(Ebml& ebml_out, TrackStruct & track, VideoTrackStruct& video)
+{
+ EbmlLoc trackHeadLoc, videoHeadLoc;
+ Ebml_StartSubElement(ebml_out, trackHeadLoc, 0xAE); //start Track
+ Mkv_WriteGenericTrackData(ebml_out, track);
+ Ebml_StartSubElement(ebml_out, videoHeadLoc, 0xE0); //start Video
+ Ebml_SerializeUnsigned(ebml_out, 0x9A, video.FlagInterlaced ? 1 :0);
+ Ebml_SerializeUnsigned(ebml_out, 0xB0, video.PixelWidth);
+ Ebml_SerializeUnsigned(ebml_out, 0xBA, video.PixelHeight);
+ Ebml_SerializeUnsigned(ebml_out, 0x54B0, video.PixelDisplayWidth);
+ Ebml_SerializeUnsigned(ebml_out, 0x54BA, video.PixelDisplayHeight);
+ Ebml_SerializeUnsigned(ebml_out, 0x54B2, video.displayUnit);
+ Ebml_SerializeFloat(ebml_out, 0x2383E3, video.FrameRate);
+ Ebml_EndSubElement(ebml_out, videoHeadLoc);
+ Ebml_EndSubElement(ebml_out, trackHeadLoc);
+
+}
+
+void Mkv_WriteAudioTrack(Ebml& ebml_out, TrackStruct & track, AudioTrackStruct& video)
+{
+ EbmlLoc trackHeadLoc, audioHeadLoc;
+ Ebml_StartSubElement(ebml_out, trackHeadLoc, 0xAE);
+ Mkv_WriteGenericTrackData(ebml_out, track);
+ Ebml_StartSubElement(ebml_out, audioHeadLoc, 0xE0); //start Audio
+ Ebml_SerializeFloat(ebml_out, 0xB5, video.SamplingFrequency);
+ Ebml_SerializeUnsigned(ebml_out, 0x9F, video.Channels);
+ Ebml_SerializeUnsigned(ebml_out, 0x6264, video.BitDepth);
+ Ebml_EndSubElement(ebml_out, audioHeadLoc); // end audio
+ Ebml_EndSubElement(ebml_out, trackHeadLoc);
+}
+
+void Mkv_WriteEbmlClusterHead(Ebml& ebml_out, EbmlLoc& ebmlLoc, ClusterHeadStruct & clusterHead)
+{
+ Ebml_StartSubElement(ebml_out, ebmlLoc, 0x1F43B675);
+ Ebml_SerializeUnsigned(ebml_out, 0x6264, clusterHead.TimeCode);
+}
+
+void Mkv_WriteSimpleBlockHead(Ebml& ebml_out, EbmlLoc& ebmlLoc, SimpleBlockStruct& block)
+{
+ Ebml_StartSubElement(ebml_out, ebmlLoc, 0xA3);
+ Ebml_Write1UInt(ebml_out, block.TrackNumber);
+ Ebml_WriteSigned16(ebml_out,block.TimeCode);
+ unsigned char flags = 0x00 | (block.iskey ? 0x80:0x00) | (block.lacing << 1) | block.discardable;
+ Ebml_Write1UInt(ebml_out, flags); //TODO this may be the wrong function
+ Ebml_Serialize(ebml_out, block.data, block.dataLength);
+ Ebml_EndSubElement(ebml_out,ebmlLoc);
+}
+*/
diff --git a/libmkv/WebMElement.h b/libmkv/WebMElement.h
new file mode 100644
index 000000000..b4208f285
--- /dev/null
+++ b/libmkv/WebMElement.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+
+#ifndef MKV_CONTEXT_HPP
+#define MKV_CONTEXT_HPP 1
+
+void writeSimpleBock(EbmlGlobal *ebml, unsigned char trackNumber, unsigned short timeCode,
+ int isKeyframe, unsigned char lacingFlag, int discardable,
+ unsigned char *data, unsigned long dataLength);
+
+
+// these are helper functions
+void writeHeader(EbmlGlobal *ebml);
+void writeSegmentInformation(EbmlGlobal *ebml, EbmlLoc* startInfo , unsigned long timeCodeScale, double duration);
+//this function is a helper only, it assumes a lot of defaults
+void writeVideoTrack(EbmlGlobal *ebml, unsigned int trackNumber, int flagLacing,
+ char *codecId, unsigned int pixelWidth, unsigned int pixelHeight,
+ double frameRate);
+void writeAudioTrack(EbmlGlobal *glob, unsigned int trackNumber, int flagLacing,
+ char *codecId, double samplingFrequency, unsigned int channels,
+ unsigned char *private, unsigned long privateSize);
+
+void writeSimpleBlock(EbmlGlobal *ebml, unsigned char trackNumber, short timeCode,
+ int isKeyframe, unsigned char lacingFlag, int discardable,
+ unsigned char *data, unsigned long dataLength);
+
+
+
+#endif \ No newline at end of file
diff --git a/libmkv/testlibmkv.c b/libmkv/testlibmkv.c
new file mode 100644
index 000000000..7edfc4347
--- /dev/null
+++ b/libmkv/testlibmkv.c
@@ -0,0 +1,63 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+
+
+#include "EbmlIDs.h"
+#include "EbmlBufferWriter.h"
+#include "WebMElement.h"
+
+#include <stdio.h>
+int main(int argc, char *argv[])
+{
+ //init the datatype we're using for ebml output
+ unsigned char data[8192];
+ EbmlGlobal ebml;
+ ebml.buf = data;
+ ebml.offset = 0;
+ ebml.length = 8192;
+
+ writeHeader(&ebml);
+ {
+ EbmlLoc startSegment;
+ Ebml_StartSubElement(&ebml, &startSegment, Segment); //segment
+ {
+ //segment info
+ EbmlLoc startInfo;
+ Ebml_StartSubElement(&ebml, &startInfo, Info);
+ Ebml_SerializeString(&ebml, 0x4D80, "muxingAppLibMkv");
+ Ebml_SerializeString(&ebml, 0x5741, "writingAppLibMkv");
+ Ebml_EndSubElement(&ebml, &startInfo);
+ }
+
+ {
+ EbmlLoc trackStart;
+ Ebml_StartSubElement(&ebml, &trackStart, Tracks);
+ writeVideoTrack(&ebml, 1, 1, "V_MS/VFW/FOURCC", 320, 240, 29.97);
+ //writeAudioTrack(&ebml,2,1, "A_VORBIS", 32000, 1, NULL, 0);
+ Ebml_EndSubElement(&ebml, &trackStart);
+ }
+
+ {
+ EbmlLoc clusterStart;
+ Ebml_StartSubElement(&ebml, &clusterStart, Cluster); //cluster
+ Ebml_SerializeUnsigned(&ebml, Timecode, 0);
+
+ unsigned char someData[4] = {1, 2, 3, 4};
+ writeSimpleBlock(&ebml, 1, 0, 1, 0, 0, someData, 4);
+ Ebml_EndSubElement(&ebml, &clusterStart);
+ } //end cluster
+ Ebml_EndSubElement(&ebml, &startSegment);
+ }
+
+ //dump ebml stuff to the file
+ FILE *file_out = fopen("test.mkv", "wb");
+ size_t bytesWritten = fwrite(data, 1, ebml.offset, file_out);
+ fclose(file_out);
+ return 0;
+} \ No newline at end of file
diff --git a/libs.mk b/libs.mk
index 45cf9bfdc..9ded3945a 100644
--- a/libs.mk
+++ b/libs.mk
@@ -91,7 +91,9 @@ ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emms.asm
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86.h
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_abi_support.asm
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_cpuid.c
endif
+CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm_cpudetect.c
CODEC_SRCS-$(ARCH_ARM) += $(BUILD_PFX)vpx_config.asm
CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
diff --git a/nestegg/AUTHORS b/nestegg/AUTHORS
new file mode 100644
index 000000000..8204f40f4
--- /dev/null
+++ b/nestegg/AUTHORS
@@ -0,0 +1 @@
+Matthew Gregan <kinetik@flim.org>
diff --git a/nestegg/INSTALL b/nestegg/INSTALL
new file mode 100644
index 000000000..401df4184
--- /dev/null
+++ b/nestegg/INSTALL
@@ -0,0 +1,8 @@
+Build instructions for libnestegg
+=================================
+
+0. Change directory into the source directory.
+1. Run |autoreconf --install| to generate configure.
+2. Run |./configure| to configure the build.
+3. Run |make| to build.
+4. Run |make check| to run the test suite.
diff --git a/nestegg/LICENSE b/nestegg/LICENSE
new file mode 100644
index 000000000..a67984a61
--- /dev/null
+++ b/nestegg/LICENSE
@@ -0,0 +1,13 @@
+Copyright © 2010 Mozilla Foundation
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/nestegg/Makefile.am b/nestegg/Makefile.am
new file mode 100644
index 000000000..500699160
--- /dev/null
+++ b/nestegg/Makefile.am
@@ -0,0 +1,51 @@
+AUTOMAKE_OPTIONS = foreign 1.11 no-dist-gzip dist-bzip2 subdir-objects
+ACLOCAL_AMFLAGS = -I m4
+
+INCLUDES = -I$(top_srcdir)/include -I. -I$(top_srcdir)/halloc
+AM_CFLAGS = -ansi -pedantic -Wall -Wextra -Wno-long-long -O0 -g
+
+SUBDIRS = docs
+
+EXTRA_DIST = \
+ AUTHORS README LICENSE \
+ nestegg-uninstalled.pc.in \
+ m4/as-ac-expand.m4 \
+ m4/pkg.m4 \
+ m4/ax_create_stdint_h.m4 \
+ halloc/src/halloc.c \
+ halloc/halloc.h \
+ halloc/src/align.h \
+ halloc/src/hlist.h \
+ halloc/src/macros.h
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = nestegg.pc
+
+nesteggincludedir = $(includedir)/nestegg
+nestegginclude_HEADERS = include/nestegg/nestegg.h include/nestegg/nestegg-stdint.h
+
+lib_LTLIBRARIES = src/libnestegg.la
+
+src_libnestegg_la_SOURCES = \
+ src/nestegg.c \
+ halloc/src/halloc.c \
+ halloc/halloc.h \
+ halloc/src/align.h \
+ halloc/src/hlist.h \
+ halloc/src/macros.h
+
+check_PROGRAMS = test/test
+
+test_test_SOURCES = test/test.c
+test_test_LDADD = src/libnestegg.la
+
+DISTCLEANFILES = include/nestegg/nestegg-stdint.h
+
+dist-hook:
+ find $(distdir) -type d -name '.git' | xargs rm -rf
+
+debug:
+ $(MAKE) all CFLAGS="@DEBUG@"
+
+profile:
+ $(MAKE) all CFLAGS="@PROFILE@"
diff --git a/nestegg/README b/nestegg/README
new file mode 100644
index 000000000..47c8237d2
--- /dev/null
+++ b/nestegg/README
@@ -0,0 +1,6 @@
+See INSTALL for build instructions.
+
+Licensed under an ISC-style license. See LICENSE for details.
+
+The source under the halloc/ directory is licensed under a BSD license. See
+halloc/halloc.h for details.
diff --git a/nestegg/TODO b/nestegg/TODO
new file mode 100644
index 000000000..bf0cb04c4
--- /dev/null
+++ b/nestegg/TODO
@@ -0,0 +1,21 @@
+- Document when read, seek, tell callbacks are used.
+- Add an automated testsuite.
+- Test (and fix, if necessary) support for unknown sizes.
+- Test (and fix, if necessary) support for large files.
+- Read past unknown elements rather than seeking.
+- Try to handle unknown elements with unknown sizes.
+- Formalize handling of default element values.
+- Try to resynchronize stream when read_block fails so that failure to parse
+ a single block can be treated as non-fatal.
+- Make logging more useful to API users.
+- Avoid reparsing Cues and ignore any SeekHead at end of file.
+- Optionally build a Cue index as Clusters are parsed.
+- Support seeking without Cues.
+- Avoid building a list of Clusters as they are parsed and retain only the
+ last one parsed.
+- Add an asynchronous error code to struct nestegg and ensure that API calls
+ continue to fail safely once a fatal error has been returned.
+- Modify parser/data structures to provide a clean separation. Perhaps the
+ parser should return a generic tree of nodes that a second pass uses to
+ initialize the main data structures.
+- Use pool allocator for all allocations.
diff --git a/nestegg/configure.ac b/nestegg/configure.ac
new file mode 100644
index 000000000..70f6e0d59
--- /dev/null
+++ b/nestegg/configure.ac
@@ -0,0 +1,124 @@
+dnl ------------------------------------------------
+dnl Initialization and Versioning
+dnl ------------------------------------------------
+
+AC_INIT(libnestegg,[0.1git])
+
+AC_CANONICAL_HOST
+AC_CANONICAL_TARGET
+
+AC_CONFIG_MACRO_DIR([m4])
+
+AM_CONFIG_HEADER([config.h])
+AC_CONFIG_SRCDIR([src/nestegg.c])
+AM_INIT_AUTOMAKE
+
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+dnl Library versioning
+dnl CURRENT, REVISION, AGE
+dnl - library source changed -> increment REVISION
+dnl - interfaces added/removed/changed -> increment CURRENT, REVISION = 0
+dnl - interfaces added -> increment AGE
+dnl - interfaces removed -> AGE = 0
+
+NESTEGG_CURRENT=0
+NESTEGG_REVISION=0
+NESTEGG_AGE=1
+AC_SUBST(NESTEGG_CURRENT)
+AC_SUBST(NESTEGG_REVISION)
+AC_SUBST(NESTEGG_AGE)
+
+
+dnl --------------------------------------------------
+dnl Check for programs
+dnl --------------------------------------------------
+
+dnl save $CFLAGS since AC_PROG_CC likes to insert "-g -O2"
+dnl if $CFLAGS is blank
+cflags_save="$CFLAGS"
+AC_PROG_CC
+AC_PROG_CPP
+CFLAGS="$cflags_save"
+
+AM_PROG_CC_C_O
+AC_LIBTOOL_WIN32_DLL
+AM_PROG_LIBTOOL
+
+dnl Check for doxygen
+AC_ARG_ENABLE([doc],
+ AS_HELP_STRING([--enable-doc], [Build API documentation]),
+ [ac_enable_doc=$enableval], [ac_enable_doc=auto])
+
+if test "x$ac_enable_doc" != "xno"; then
+ AC_CHECK_PROG(HAVE_DOXYGEN, doxygen, true, false)
+
+ if test "x$HAVE_DOXYGEN" = "xfalse" -a "x$ac_enable_doc" = "xyes"; then
+ AC_MSG_ERROR([*** API documentation explicitly requested but Doxygen not found])
+ fi
+else
+ HAVE_DOXYGEN=false
+fi
+AM_CONDITIONAL(HAVE_DOXYGEN,$HAVE_DOXYGEN)
+if test $HAVE_DOXYGEN = "false"; then
+ AC_MSG_WARN([*** doxygen not found, API documentation will not be built])
+fi
+
+# Generate portable stdint.h replacement
+AX_CREATE_STDINT_H(include/nestegg/nestegg-stdint.h)
+
+# Test whether ld supports -version-script
+AC_PROG_LD
+AC_PROG_LD_GNU
+AC_MSG_CHECKING([how to control symbol export])
+
+dnl --------------------------------------------------
+dnl Do substitutions
+dnl --------------------------------------------------
+
+AC_SUBST(DEBUG)
+AC_SUBST(PROFILE)
+
+AC_OUTPUT([
+ Makefile
+ docs/Makefile
+ docs/Doxyfile
+ nestegg.pc
+ nestegg-uninstalled.pc
+])
+
+AS_AC_EXPAND(LIBDIR, ${libdir})
+AS_AC_EXPAND(INCLUDEDIR, ${includedir})
+AS_AC_EXPAND(BINDIR, ${bindir})
+AS_AC_EXPAND(DOCDIR, ${docdir})
+
+if test $HAVE_DOXYGEN = "false"; then
+ doc_build="no"
+else
+ doc_build="yes"
+fi
+
+AC_MSG_RESULT([
+------------------------------------------------------------------------
+ $PACKAGE $VERSION: Automatic configuration OK.
+
+ General configuration:
+
+ API Documentation: .......... ${doc_build}
+
+ Installation paths:
+
+ libnestegg: .................. ${LIBDIR}
+ C header files: .............. ${INCLUDEDIR}/nestegg
+ Documentation: ............... ${DOCDIR}
+
+ Building:
+
+ Type 'make' to compile $PACKAGE.
+
+ Type 'make install' to install $PACKAGE.
+
+ Example programs will be built but not installed.
+------------------------------------------------------------------------
+])
+
diff --git a/nestegg/docs/Doxyfile.in b/nestegg/docs/Doxyfile.in
new file mode 100644
index 000000000..e0e9249aa
--- /dev/null
+++ b/nestegg/docs/Doxyfile.in
@@ -0,0 +1,1551 @@
+# Doxyfile 1.6.2
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = @PACKAGE@
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER = @VERSION@
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = .
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = YES
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it parses.
+# With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this tag.
+# The format is ext=language, where ext is a file extension, and language is one of
+# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP,
+# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C. Note that for custom extensions you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespace are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = NO
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the (brief and detailed) documentation of class members so that constructors and destructors are listed first. If set to NO (the default) the constructors will appear in the respective orders defined by SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by
+# doxygen. The layout file controls the global structure of the generated output files
+# in an output format independent way. To create the layout file that represents
+# doxygen's defaults, run doxygen with the -l option. You can optionally specify a
+# file name after the option, if omitted DoxygenLayout.xml will be used as the name
+# of the layout file.
+
+LAYOUT_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = YES
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = @top_srcdir@/include/nestegg
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = NO
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER
+# are set, an additional index file will be generated that can be used as input for
+# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated
+# HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add.
+# For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the custom filter to add.For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be implemented using a PHP enabled web server instead of at the web client using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server based approach is that it scales better to large projects and allows full text search. The disadvantage is that it is more difficult to set up
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode.
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include source code with syntax highlighting in the LaTeX output. Note that which sources are shown also depends on other settings such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = NO
+
+# By default doxygen will write a font called FreeSans.ttf to the output
+# directory and reference it in all dot files that doxygen generates. This
+# font does not include all possible unicode characters however, so when you need
+# these (or just want a differently looking font) you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME = FreeSans
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
diff --git a/nestegg/docs/Makefile.am b/nestegg/docs/Makefile.am
new file mode 100644
index 000000000..42cf8eec4
--- /dev/null
+++ b/nestegg/docs/Makefile.am
@@ -0,0 +1,38 @@
+doc_DATA = doxygen-build.stamp
+
+EXTRA_DIST = Doxyfile.in
+
+if HAVE_DOXYGEN
+doxygen-build.stamp: Doxyfile
+ doxygen
+ touch doxygen-build.stamp
+else
+doxygen-build.stamp:
+ echo "*** Warning: Doxygen not found; documentation will not be built."
+ touch doxygen-build.stamp
+endif
+
+dist_docdir = $(distdir)/libnestegg
+
+dist-hook:
+ if test -d html; then \
+ mkdir $(dist_docdir); \
+ echo -n "copying built documenation..."; \
+ cp -rp html $(dist_docdir)/html; \
+ echo "OK"; \
+ fi
+
+
+install-data-local: doxygen-build.stamp
+ $(mkinstalldirs) $(DESTDIR)$(docdir)
+ if test -d html; then \
+ cp -rp html $(DESTDIR)$(docdir)/html; \
+ fi
+
+uninstall-local:
+ rm -rf $(DESTDIR)$(docdir)
+
+clean-local:
+ if test -d html; then rm -rf html; fi
+ if test -f doxygen-build.stamp; then rm -f doxygen-build.stamp; fi
+
diff --git a/nestegg/halloc/README b/nestegg/halloc/README
new file mode 100644
index 000000000..380fba2b8
--- /dev/null
+++ b/nestegg/halloc/README
@@ -0,0 +1,45 @@
+halloc 1.2.1
+============
+
+ Hierarchical memory heap interface - an extension to standard
+ malloc/free interface that simplifies tasks of memory disposal
+ when allocated structures exhibit hierarchical properties.
+
+ http://swapped.cc/halloc
+=
+ To build libhalloc.a with GNU tools run
+ make
+
+ To install in /usr/include and /usr/lib
+ make install
+
+ To cleanup the build files
+ make clean
+=
+ halloc-1.2.1
+ * fixed a double-free bug in _set_allocator() as per
+ Matthew Gregan comments
+
+ * switched to using NULL instead of 0 where applicable
+
+ halloc-1.2.0
+ * added missing <string.h> include to halloc.c
+
+ * improved standard compliance thanks to the feedback
+ received from Stan Tobias. Two things were fixed -
+
+ - hblock_t structure no longer uses zero-sized 'data'
+ array, which happened to be common, but non-standard
+ extension;
+
+ - secondly, added the code to test the behaviour of
+ realloc(ptr, 0). Standard allows it NOT to act as
+ free(), in which case halloc will use its own version
+ of allocator calling free() when necessary.
+
+ halloc-1.1.0
+ * initial public release (rewrite of hhmalloc library)
+
+=============================================================================
+Copyright (c) 2004-2010, Alex Pankratov (ap@swapped.cc). All rights reserved.
+
diff --git a/nestegg/halloc/halloc.h b/nestegg/halloc/halloc.h
new file mode 100644
index 000000000..10af4e8d8
--- /dev/null
+++ b/nestegg/halloc/halloc.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2004-2010 Alex Pankratov. All rights reserved.
+ *
+ * Hierarchical memory allocator, 1.2.1
+ * http://swapped.cc/halloc
+ */
+
+/*
+ * The program is distributed under terms of BSD license.
+ * You can obtain the copy of the license by visiting:
+ *
+ * http://www.opensource.org/licenses/bsd-license.php
+ */
+
+#ifndef _LIBP_HALLOC_H_
+#define _LIBP_HALLOC_H_
+
+#include <stddef.h> /* size_t */
+
+/*
+ * Core API
+ */
+void * halloc (void * block, size_t len);
+void hattach(void * block, void * parent);
+
+/*
+ * standard malloc/free api
+ */
+void * h_malloc (size_t len);
+void * h_calloc (size_t n, size_t len);
+void * h_realloc(void * p, size_t len);
+void h_free (void * p);
+char * h_strdup (const char * str);
+
+/*
+ * the underlying allocator
+ */
+typedef void * (* realloc_t)(void * ptr, size_t len);
+
+extern realloc_t halloc_allocator;
+
+#endif
+
diff --git a/nestegg/halloc/src/align.h b/nestegg/halloc/src/align.h
new file mode 100644
index 000000000..4c6e1831f
--- /dev/null
+++ b/nestegg/halloc/src/align.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2004-2010 Alex Pankratov. All rights reserved.
+ *
+ * Hierarchical memory allocator, 1.2.1
+ * http://swapped.cc/halloc
+ */
+
+/*
+ * The program is distributed under terms of BSD license.
+ * You can obtain the copy of the license by visiting:
+ *
+ * http://www.opensource.org/licenses/bsd-license.php
+ */
+
+#ifndef _LIBP_ALIGN_H_
+#define _LIBP_ALIGN_H_
+
+/*
+ * a type with the most strict alignment requirements
+ */
+union max_align
+{
+ char c;
+ short s;
+ long l;
+ int i;
+ float f;
+ double d;
+ void * v;
+ void (*q)(void);
+};
+
+typedef union max_align max_align_t;
+
+#endif
+
diff --git a/nestegg/halloc/src/halloc.c b/nestegg/halloc/src/halloc.c
new file mode 100644
index 000000000..38fd6c11a
--- /dev/null
+++ b/nestegg/halloc/src/halloc.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2004-2010 Alex Pankratov. All rights reserved.
+ *
+ * Hierarchical memory allocator, 1.2.1
+ * http://swapped.cc/halloc
+ */
+
+/*
+ * The program is distributed under terms of BSD license.
+ * You can obtain the copy of the license by visiting:
+ *
+ * http://www.opensource.org/licenses/bsd-license.php
+ */
+
+#include <stdlib.h> /* realloc */
+#include <string.h> /* memset & co */
+
+#include "../halloc.h"
+#include "align.h"
+#include "hlist.h"
+
+/*
+ * block control header
+ */
+typedef struct hblock
+{
+#ifndef NDEBUG
+#define HH_MAGIC 0x20040518L
+ long magic;
+#endif
+ hlist_item_t siblings; /* 2 pointers */
+ hlist_head_t children; /* 1 pointer */
+ max_align_t data[1]; /* not allocated, see below */
+
+} hblock_t;
+
+#define sizeof_hblock offsetof(hblock_t, data)
+
+/*
+ *
+ */
+realloc_t halloc_allocator = NULL;
+
+#define allocator halloc_allocator
+
+/*
+ * static methods
+ */
+static void _set_allocator(void);
+static void * _realloc(void * ptr, size_t n);
+
+static int _relate(hblock_t * b, hblock_t * p);
+static void _free_children(hblock_t * p);
+
+/*
+ * Core API
+ */
+void * halloc(void * ptr, size_t len)
+{
+ hblock_t * p;
+
+ /* set up default allocator */
+ if (! allocator)
+ {
+ _set_allocator();
+ assert(allocator);
+ }
+
+ /* calloc */
+ if (! ptr)
+ {
+ if (! len)
+ return NULL;
+
+ p = allocator(0, len + sizeof_hblock);
+ if (! p)
+ return NULL;
+#ifndef NDEBUG
+ p->magic = HH_MAGIC;
+#endif
+ hlist_init(&p->children);
+ hlist_init_item(&p->siblings);
+
+ return p->data;
+ }
+
+ p = structof(ptr, hblock_t, data);
+ assert(p->magic == HH_MAGIC);
+
+ /* realloc */
+ if (len)
+ {
+ p = allocator(p, len + sizeof_hblock);
+ if (! p)
+ return NULL;
+
+ hlist_relink(&p->siblings);
+ hlist_relink_head(&p->children);
+
+ return p->data;
+ }
+
+ /* free */
+ _free_children(p);
+ hlist_del(&p->siblings);
+ allocator(p, 0);
+
+ return NULL;
+}
+
+void hattach(void * block, void * parent)
+{
+ hblock_t * b, * p;
+
+ if (! block)
+ {
+ assert(! parent);
+ return;
+ }
+
+ /* detach */
+ b = structof(block, hblock_t, data);
+ assert(b->magic == HH_MAGIC);
+
+ hlist_del(&b->siblings);
+
+ if (! parent)
+ return;
+
+ /* attach */
+ p = structof(parent, hblock_t, data);
+ assert(p->magic == HH_MAGIC);
+
+ /* sanity checks */
+ assert(b != p); /* trivial */
+ assert(! _relate(p, b)); /* heavy ! */
+
+ hlist_add(&p->children, &b->siblings);
+}
+
+/*
+ * malloc/free api
+ */
+void * h_malloc(size_t len)
+{
+ return halloc(0, len);
+}
+
+void * h_calloc(size_t n, size_t len)
+{
+ void * ptr = halloc(0, len*=n);
+ return ptr ? memset(ptr, 0, len) : NULL;
+}
+
+void * h_realloc(void * ptr, size_t len)
+{
+ return halloc(ptr, len);
+}
+
+void h_free(void * ptr)
+{
+ halloc(ptr, 0);
+}
+
+char * h_strdup(const char * str)
+{
+ size_t len = strlen(str);
+ char * ptr = halloc(0, len + 1);
+ return ptr ? (ptr[len] = 0, memcpy(ptr, str, len)) : NULL;
+}
+
+/*
+ * static stuff
+ */
+static void _set_allocator(void)
+{
+ void * p;
+ assert(! allocator);
+
+ /*
+ * the purpose of the test below is to check the behaviour
+ * of realloc(ptr, 0), which is defined in the standard
+ * as an implementation-specific. if it returns zero,
+ * then it's equivalent to free(). it can however return
+ * non-zero, in which case it cannot be used for freeing
+ * memory blocks and we'll need to supply our own version
+ *
+ * Thanks to Stan Tobias for pointing this tricky part out.
+ */
+ allocator = realloc;
+ if (! (p = malloc(1)))
+ /* hmm */
+ return;
+
+ if ((p = realloc(p, 0)))
+ {
+ /* realloc cannot be used as free() */
+ allocator = _realloc;
+ free(p);
+ }
+}
+
+static void * _realloc(void * ptr, size_t n)
+{
+ /*
+ * free'ing realloc()
+ */
+ if (n)
+ return realloc(ptr, n);
+ free(ptr);
+ return NULL;
+}
+
+static int _relate(hblock_t * b, hblock_t * p)
+{
+ hlist_item_t * i;
+
+ if (!b || !p)
+ return 0;
+
+ /*
+ * since there is no 'parent' pointer, which would've allowed
+ * O(log(n)) upward traversal, the check must use O(n) downward
+ * iteration of the entire hierarchy; and this can be VERY SLOW
+ */
+ hlist_for_each(i, &p->children)
+ {
+ hblock_t * q = structof(i, hblock_t, siblings);
+ if (q == b || _relate(b, q))
+ return 1;
+ }
+ return 0;
+}
+
+static void _free_children(hblock_t * p)
+{
+ hlist_item_t * i, * tmp;
+
+#ifndef NDEBUG
+ /*
+ * this catches loops in hierarchy with almost zero
+ * overhead (compared to _relate() running time)
+ */
+ assert(p && p->magic == HH_MAGIC);
+ p->magic = 0;
+#endif
+ hlist_for_each_safe(i, tmp, &p->children)
+ {
+ hblock_t * q = structof(i, hblock_t, siblings);
+ _free_children(q);
+ allocator(q, 0);
+ }
+}
+
diff --git a/nestegg/halloc/src/hlist.h b/nestegg/halloc/src/hlist.h
new file mode 100644
index 000000000..2791f78c7
--- /dev/null
+++ b/nestegg/halloc/src/hlist.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2004-2010 Alex Pankratov. All rights reserved.
+ *
+ * Hierarchical memory allocator, 1.2.1
+ * http://swapped.cc/halloc
+ */
+
+/*
+ * The program is distributed under terms of BSD license.
+ * You can obtain the copy of the license by visiting:
+ *
+ * http://www.opensource.org/licenses/bsd-license.php
+ */
+
+#ifndef _LIBP_HLIST_H_
+#define _LIBP_HLIST_H_
+
+#include <assert.h>
+#include "macros.h" /* static_inline */
+
+/*
+ * weak double-linked list w/ tail sentinel
+ */
+typedef struct hlist_head hlist_head_t;
+typedef struct hlist_item hlist_item_t;
+
+/*
+ *
+ */
+struct hlist_head
+{
+ hlist_item_t * next;
+};
+
+struct hlist_item
+{
+ hlist_item_t * next;
+ hlist_item_t ** prev;
+};
+
+/*
+ * shared tail sentinel
+ */
+struct hlist_item hlist_null;
+
+/*
+ *
+ */
+#define __hlist_init(h) { &hlist_null }
+#define __hlist_init_item(i) { &hlist_null, &(i).next }
+
+static_inline void hlist_init(hlist_head_t * h);
+static_inline void hlist_init_item(hlist_item_t * i);
+
+/* static_inline void hlist_purge(hlist_head_t * h); */
+
+/* static_inline bool_t hlist_empty(const hlist_head_t * h); */
+
+/* static_inline hlist_item_t * hlist_head(const hlist_head_t * h); */
+
+/* static_inline hlist_item_t * hlist_next(const hlist_item_t * i); */
+/* static_inline hlist_item_t * hlist_prev(const hlist_item_t * i,
+ const hlist_head_t * h); */
+
+static_inline void hlist_add(hlist_head_t * h, hlist_item_t * i);
+
+/* static_inline void hlist_add_prev(hlist_item_t * l, hlist_item_t * i); */
+/* static_inline void hlist_add_next(hlist_item_t * l, hlist_item_t * i); */
+
+static_inline void hlist_del(hlist_item_t * i);
+
+static_inline void hlist_relink(hlist_item_t * i);
+static_inline void hlist_relink_head(hlist_head_t * h);
+
+#define hlist_for_each(i, h) \
+ for (i = (h)->next; i != &hlist_null; i = i->next)
+
+#define hlist_for_each_safe(i, tmp, h) \
+ for (i = (h)->next, tmp = i->next; \
+ i!= &hlist_null; \
+ i = tmp, tmp = i->next)
+
+/*
+ * static
+ */
+static_inline void hlist_init(hlist_head_t * h)
+{
+ assert(h);
+ h->next = &hlist_null;
+}
+
+static_inline void hlist_init_item(hlist_item_t * i)
+{
+ assert(i);
+ i->prev = &i->next;
+ i->next = &hlist_null;
+}
+
+static_inline void hlist_add(hlist_head_t * h, hlist_item_t * i)
+{
+ hlist_item_t * next;
+ assert(h && i);
+
+ next = i->next = h->next;
+ next->prev = &i->next;
+ h->next = i;
+ i->prev = &h->next;
+}
+
+static_inline void hlist_del(hlist_item_t * i)
+{
+ hlist_item_t * next;
+ assert(i);
+
+ next = i->next;
+ next->prev = i->prev;
+ *i->prev = next;
+
+ hlist_init_item(i);
+}
+
+static_inline void hlist_relink(hlist_item_t * i)
+{
+ assert(i);
+ *i->prev = i;
+ i->next->prev = &i->next;
+}
+
+static_inline void hlist_relink_head(hlist_head_t * h)
+{
+ assert(h);
+ h->next->prev = &h->next;
+}
+
+#endif
+
diff --git a/nestegg/halloc/src/macros.h b/nestegg/halloc/src/macros.h
new file mode 100644
index 000000000..c36b516ee
--- /dev/null
+++ b/nestegg/halloc/src/macros.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2004-2010 Alex Pankratov. All rights reserved.
+ *
+ * Hierarchical memory allocator, 1.2.1
+ * http://swapped.cc/halloc
+ */
+
+/*
+ * The program is distributed under terms of BSD license.
+ * You can obtain the copy of the license by visiting:
+ *
+ * http://www.opensource.org/licenses/bsd-license.php
+ */
+
+#ifndef _LIBP_MACROS_H_
+#define _LIBP_MACROS_H_
+
+#include <stddef.h> /* offsetof */
+
+/*
+ restore pointer to the structure by a pointer to its field
+ */
+#define structof(p,t,f) ((t*)(- offsetof(t,f) + (char*)(p)))
+
+/*
+ * redefine for the target compiler
+ */
+#ifdef _WIN32
+#define static_inline static __inline
+#else
+#define static_inline static __inline__
+#endif
+
+
+#endif
+
diff --git a/nestegg/include/nestegg/nestegg.h b/nestegg/include/nestegg/nestegg.h
new file mode 100644
index 000000000..7447d141d
--- /dev/null
+++ b/nestegg/include/nestegg/nestegg.h
@@ -0,0 +1,292 @@
+/*
+ * Copyright © 2010 Mozilla Foundation
+ *
+ * This program is made available under an ISC-style license. See the
+ * accompanying file LICENSE for details.
+ */
+#ifndef NESTEGG_671cac2a_365d_ed69_d7a3_4491d3538d79
+#define NESTEGG_671cac2a_365d_ed69_d7a3_4491d3538d79
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @mainpage
+
+ @section intro Introduction
+
+ This is the documentation for the <tt>libnestegg</tt> C API.
+ <tt>libnestegg</tt> is a demultiplexing library for <a
+ href="http://www.matroska.org/">Matroska</a> and <a
+ href="http://www.webmproject.org/">WebMedia</a> media files.
+
+ @section example Example code
+
+ @code
+ nestegg * demux_ctx;
+ nestegg_init(&demux_ctx, io, NULL);
+
+ nestegg_packet * pkt;
+ while ((r = nestegg_read_packet(demux_ctx, &pkt)) > 0) {
+ unsigned int track;
+
+ nestegg_packet_track(pkt, &track);
+
+ // This example decodes the first track only.
+ if (track == 0) {
+ unsigned int chunk, chunks;
+
+ nestegg_packet_count(pkt, &chunks);
+
+ // Decode each chunk of data.
+ for (chunk = 0; chunk < chunks; ++chunk) {
+ unsigned char * data;
+ size_t data_size;
+
+ nestegg_packet_data(pkt, chunk, &data, &data_size);
+
+ example_codec_decode(codec_ctx, data, data_size);
+ }
+ }
+
+ nestegg_free_packet(pkt);
+ }
+
+ nestegg_destroy(demux_ctx);
+ @endcode
+*/
+
+
+/** @file
+ The <tt>libnestegg</tt> C API. */
+
+#define NESTEGG_TRACK_VIDEO 0 /**< Track is of type video. */
+#define NESTEGG_TRACK_AUDIO 1 /**< Track is of type audio. */
+
+#define NESTEGG_CODEC_VP8 0 /**< Track uses Google On2 VP8 codec. */
+#define NESTEGG_CODEC_VORBIS 1 /**< Track uses Xiph Vorbis codec. */
+
+#define NESTEGG_SEEK_SET 0 /**< Seek offset relative to beginning of stream. */
+#define NESTEGG_SEEK_CUR 1 /**< Seek offset relative to current position in stream. */
+#define NESTEGG_SEEK_END 2 /**< Seek offset relative to end of stream. */
+
+#define NESTEGG_LOG_DEBUG 1 /**< Debug level log message. */
+#define NESTEGG_LOG_INFO 10 /**< Informational level log message. */
+#define NESTEGG_LOG_WARNING 100 /**< Warning level log message. */
+#define NESTEGG_LOG_ERROR 1000 /**< Error level log message. */
+#define NESTEGG_LOG_CRITICAL 10000 /**< Critical level log message. */
+
+typedef struct nestegg nestegg; /**< Opaque handle referencing the stream state. */
+typedef struct nestegg_packet nestegg_packet; /**< Opaque handle referencing a packet of data. */
+
+/** User supplied IO context. */
+typedef struct {
+ /** User supplied read callback.
+ @param buffer Buffer to read data into.
+ @param length Length of supplied buffer in bytes.
+ @param userdata The #userdata supplied by the user.
+ @retval 1 Read succeeded.
+ @retval 0 End of stream.
+ @retval -1 Error. */
+ int (* read)(void * buffer, size_t length, void * userdata);
+
+ /** User supplied seek callback.
+ @param offset Offset within the stream to seek to.
+ @param whence Seek direction. One of #NESTEGG_SEEK_SET,
+ #NESTEGG_SEEK_CUR, or #NESTEGG_SEEK_END.
+ @param userdata The #userdata supplied by the user.
+ @retval 0 Seek succeeded.
+ @retval -1 Error. */
+ int (* seek)(int64_t offset, int whence, void * userdata);
+
+ /** User supplied tell callback.
+ @param userdata The #userdata supplied by the user.
+ @returns Current position within the stream.
+ @retval -1 Error. */
+ int64_t (* tell)(void * userdata);
+
+ /** User supplied pointer to be passed to the IO callbacks. */
+ void * userdata;
+} nestegg_io;
+
+/** Parameters specific to a video track. */
+typedef struct {
+ unsigned int width; /**< Width of the video frame in pixels. */
+ unsigned int height; /**< Height of the video frame in pixels. */
+ unsigned int display_width; /**< Display width of the video frame in pixels. */
+ unsigned int display_height; /**< Display height of the video frame in pixels. */
+ unsigned int crop_bottom; /**< Pixels to crop from the bottom of the frame. */
+ unsigned int crop_top; /**< Pixels to crop from the top of the frame. */
+ unsigned int crop_left; /**< Pixels to crop from the left of the frame. */
+ unsigned int crop_right; /**< Pixels to crop from the right of the frame. */
+} nestegg_video_params;
+
+/** Parameters specific to an audio track. */
+typedef struct {
+ double rate; /**< Sampling rate in Hz. */
+ unsigned int channels; /**< Number of audio channels. */
+ unsigned int depth; /**< Bits per sample. */
+} nestegg_audio_params;
+
+/** Logging callback function pointer. */
+typedef void (* nestegg_log)(nestegg * context, unsigned int severity, char const * format, ...);
+
+/** Initialize a nestegg context. During initialization the parser will
+ read forward in the stream processing all elements until the first
+ block of media is reached. All track metadata has been processed at this point.
+ @param context Storage for the new nestegg context. @see nestegg_destroy
+ @param io User supplied IO context.
+ @param callback Optional logging callback function pointer. May be NULL.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_init(nestegg ** context, nestegg_io io, nestegg_log callback);
+
+/** Destroy a nestegg context and free associated memory.
+ @param context #nestegg context to be freed. @see nestegg_init */
+void nestegg_destroy(nestegg * context);
+
+/** Query the duration of the media stream in nanoseconds.
+ @param context Stream context initialized by #nestegg_init.
+ @param duration Storage for the queried duration.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_duration(nestegg * context, uint64_t * duration);
+
+/** Query the tstamp scale of the media stream in nanoseconds.
+ Timecodes presented by nestegg have been scaled by this value
+ before presentation to the caller.
+ @param context Stream context initialized by #nestegg_init.
+ @param scale Storage for the queried scale factor.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_tstamp_scale(nestegg * context, uint64_t * scale);
+
+/** Query the number of tracks in the media stream.
+ @param context Stream context initialized by #nestegg_init.
+ @param tracks Storage for the queried track count.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_track_count(nestegg * context, unsigned int * tracks);
+
+/** Seek @a track to @a tstamp. Stream seek will terminate at the earliest
+ key point in the stream at or before @a tstamp. Other tracks in the
+ stream will output packets with unspecified but nearby timestamps.
+ @param context Stream context initialized by #nestegg_init.
+ @param track Zero based track number.
+ @param tstamp Absolute timestamp in nanoseconds.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_track_seek(nestegg * context, unsigned int track, uint64_t tstamp);
+
+/** Query the type specified by @a track.
+ @param context Stream context initialized by #nestegg_init.
+ @param track Zero based track number.
+ @retval #NESTEGG_TRACK_VIDEO Track type is video.
+ @retval #NESTEGG_TRACK_AUDIO Track type is audio.
+ @retval -1 Error. */
+int nestegg_track_type(nestegg * context, unsigned int track);
+
+/** Query the codec ID specified by @a track.
+ @param context Stream context initialized by #nestegg_init.
+ @param track Zero based track number.
+ @retval #NESTEGG_CODEC_VP8 Track codec is VP8.
+ @retval #NESTEGG_CODEC_VORBIS Track codec is Vorbis.
+ @retval -1 Error. */
+int nestegg_track_codec_id(nestegg * context, unsigned int track);
+
+/** Query the number of codec initialization chunks for @a track. Each
+ chunk of data should be passed to the codec initialization functions in
+ the order returned.
+ @param context Stream context initialized by #nestegg_init.
+ @param track Zero based track number.
+ @param count Storage for the queried chunk count.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_track_codec_data_count(nestegg * context, unsigned int track,
+ unsigned int * count);
+
+/** Get a pointer to chunk number @a item of codec initialization data for
+ @a track.
+ @param context Stream context initialized by #nestegg_init.
+ @param track Zero based track number.
+ @param item Zero based chunk item number.
+ @param data Storage for the queried data pointer.
+ The data is owned by the #nestegg context.
+ @param length Storage for the queried data size.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_track_codec_data(nestegg * context, unsigned int track, unsigned int item,
+ unsigned char ** data, size_t * length);
+
+/** Query the video parameters specified by @a track.
+ @param context Stream context initialized by #nestegg_init.
+ @param track Zero based track number.
+ @param params Storage for the queried video parameters.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_track_video_params(nestegg * context, unsigned int track,
+ nestegg_video_params * params);
+
+/** Query the audio parameters specified by @a track.
+ @param context Stream context initialized by #nestegg_init.
+ @param track Zero based track number.
+ @param params Storage for the queried audio parameters.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_track_audio_params(nestegg * context, unsigned int track,
+ nestegg_audio_params * params);
+
+/** Read a packet of media data. A packet consists of one or more chunks of
+ data associated with a single track. nestegg_read_packet should be
+ called in a loop while the return value is 1 to drive the stream parser
+ forward. @see nestegg_free_packet
+ @param context Context returned by #nestegg_init.
+ @param packet Storage for the returned nestegg_packet.
+ @retval 1 Additional packets may be read in subsequent calls.
+ @retval 0 End of stream.
+ @retval -1 Error. */
+int nestegg_read_packet(nestegg * context, nestegg_packet ** packet);
+
+/** Destroy a nestegg_packet and free associated memory.
+ @param packet #nestegg_packet to be freed. @see nestegg_read_packet */
+void nestegg_free_packet(nestegg_packet * packet);
+
+/** Query the track number of @a packet.
+ @param packet Packet initialized by #nestegg_read_packet.
+ @param track Storage for the queried zero based track index.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_packet_track(nestegg_packet * packet, unsigned int * track);
+
+/** Query the time stamp in nanoseconds of @a packet.
+ @param packet Packet initialized by #nestegg_read_packet.
+ @param tstamp Storage for the queried timestamp in nanoseconds.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_packet_tstamp(nestegg_packet * packet, uint64_t * tstamp);
+
+/** Query the number of data chunks contained in @a packet.
+ @param packet Packet initialized by #nestegg_read_packet.
+ @param count Storage for the queried chunk count.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_packet_count(nestegg_packet * packet, unsigned int * count);
+
+/** Get a pointer to chunk number @a item of packet data.
+ @param packet Packet initialized by #nestegg_read_packet.
+ @param item Zero based chunk item number.
+ @param data Storage for the queried data pointer.
+ The data is owned by the #nestegg_packet packet.
+ @param length Storage for the queried data size.
+ @retval 0 Success.
+ @retval -1 Error. */
+int nestegg_packet_data(nestegg_packet * packet, unsigned int item,
+ unsigned char ** data, size_t * length);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NESTEGG_671cac2a_365d_ed69_d7a3_4491d3538d79 */
diff --git a/nestegg/m4/as-ac-expand.m4 b/nestegg/m4/as-ac-expand.m4
new file mode 100644
index 000000000..d6c9e3306
--- /dev/null
+++ b/nestegg/m4/as-ac-expand.m4
@@ -0,0 +1,43 @@
+dnl as-ac-expand.m4 0.2.0
+dnl autostars m4 macro for expanding directories using configure's prefix
+dnl thomas@apestaart.org
+
+dnl AS_AC_EXPAND(VAR, CONFIGURE_VAR)
+dnl example
+dnl AS_AC_EXPAND(SYSCONFDIR, $sysconfdir)
+dnl will set SYSCONFDIR to /usr/local/etc if prefix=/usr/local
+
+AC_DEFUN([AS_AC_EXPAND],
+[
+ EXP_VAR=[$1]
+ FROM_VAR=[$2]
+
+ dnl first expand prefix and exec_prefix if necessary
+ prefix_save=$prefix
+ exec_prefix_save=$exec_prefix
+
+ dnl if no prefix given, then use /usr/local, the default prefix
+ if test "x$prefix" = "xNONE"; then
+ prefix="$ac_default_prefix"
+ fi
+ dnl if no exec_prefix given, then use prefix
+ if test "x$exec_prefix" = "xNONE"; then
+ exec_prefix=$prefix
+ fi
+
+ full_var="$FROM_VAR"
+ dnl loop until it doesn't change anymore
+ while true; do
+ new_full_var="`eval echo $full_var`"
+ if test "x$new_full_var" = "x$full_var"; then break; fi
+ full_var=$new_full_var
+ done
+
+ dnl clean up
+ full_var=$new_full_var
+ AC_SUBST([$1], "$full_var")
+
+ dnl restore prefix and exec_prefix
+ prefix=$prefix_save
+ exec_prefix=$exec_prefix_save
+])
diff --git a/nestegg/m4/ax_create_stdint_h.m4 b/nestegg/m4/ax_create_stdint_h.m4
new file mode 100644
index 000000000..228105b11
--- /dev/null
+++ b/nestegg/m4/ax_create_stdint_h.m4
@@ -0,0 +1,695 @@
+dnl @synopsis AX_CREATE_STDINT_H [( HEADER-TO-GENERATE [, HEADERS-TO-CHECK])]
+dnl
+dnl the "ISO C9X: 7.18 Integer types <stdint.h>" section requires the
+dnl existence of an include file <stdint.h> that defines a set of
+dnl typedefs, especially uint8_t,int32_t,uintptr_t. Many older
+dnl installations will not provide this file, but some will have the
+dnl very same definitions in <inttypes.h>. In other environments we can
+dnl use the inet-types in <sys/types.h> which would define the typedefs
+dnl int8_t and u_int8_t respectively.
+dnl
+dnl This macros will create a local "_stdint.h" or the headerfile given
+dnl as an argument. In many cases that file will just "#include
+dnl <stdint.h>" or "#include <inttypes.h>", while in other environments
+dnl it will provide the set of basic 'stdint's definitions/typedefs:
+dnl
+dnl int8_t,uint8_t,int16_t,uint16_t,int32_t,uint32_t,intptr_t,uintptr_t
+dnl int_least32_t.. int_fast32_t.. intmax_t
+dnl
+dnl which may or may not rely on the definitions of other files, or
+dnl using the AC_CHECK_SIZEOF macro to determine the actual sizeof each
+dnl type.
+dnl
+dnl if your header files require the stdint-types you will want to
+dnl create an installable file mylib-int.h that all your other
+dnl installable header may include. So if you have a library package
+dnl named "mylib", just use
+dnl
+dnl AX_CREATE_STDINT_H(mylib-int.h)
+dnl
+dnl in configure.ac and go to install that very header file in
+dnl Makefile.am along with the other headers (mylib.h) - and the
+dnl mylib-specific headers can simply use "#include <mylib-int.h>" to
+dnl obtain the stdint-types.
+dnl
+dnl Remember, if the system already had a valid <stdint.h>, the
+dnl generated file will include it directly. No need for fuzzy
+dnl HAVE_STDINT_H things... (oops, GCC 4.2.x has deliberately disabled
+dnl its stdint.h for non-c99 compilation and the c99-mode is not the
+dnl default. Therefore this macro will not use the compiler's stdint.h
+dnl - please complain to the GCC developers).
+dnl
+dnl @category C
+dnl @author Guido U. Draheim <guidod@gmx.de>
+dnl @version 2006-10-13
+dnl @license GPLWithACException
+
+AC_DEFUN([AX_CHECK_DATA_MODEL],[
+ AC_CHECK_SIZEOF(char)
+ AC_CHECK_SIZEOF(short)
+ AC_CHECK_SIZEOF(int)
+ AC_CHECK_SIZEOF(long)
+ AC_CHECK_SIZEOF(void*)
+ ac_cv_char_data_model=""
+ ac_cv_char_data_model="$ac_cv_char_data_model$ac_cv_sizeof_char"
+ ac_cv_char_data_model="$ac_cv_char_data_model$ac_cv_sizeof_short"
+ ac_cv_char_data_model="$ac_cv_char_data_model$ac_cv_sizeof_int"
+ ac_cv_long_data_model=""
+ ac_cv_long_data_model="$ac_cv_long_data_model$ac_cv_sizeof_int"
+ ac_cv_long_data_model="$ac_cv_long_data_model$ac_cv_sizeof_long"
+ ac_cv_long_data_model="$ac_cv_long_data_model$ac_cv_sizeof_voidp"
+ AC_MSG_CHECKING([data model])
+ case "$ac_cv_char_data_model/$ac_cv_long_data_model" in
+ 122/242) ac_cv_data_model="IP16" ; n="standard 16bit machine" ;;
+ 122/244) ac_cv_data_model="LP32" ; n="standard 32bit machine" ;;
+ 122/*) ac_cv_data_model="i16" ; n="unusual int16 model" ;;
+ 124/444) ac_cv_data_model="ILP32" ; n="standard 32bit unixish" ;;
+ 124/488) ac_cv_data_model="LP64" ; n="standard 64bit unixish" ;;
+ 124/448) ac_cv_data_model="LLP64" ; n="unusual 64bit unixish" ;;
+ 124/*) ac_cv_data_model="i32" ; n="unusual int32 model" ;;
+ 128/888) ac_cv_data_model="ILP64" ; n="unusual 64bit numeric" ;;
+ 128/*) ac_cv_data_model="i64" ; n="unusual int64 model" ;;
+ 222/*2) ac_cv_data_model="DSP16" ; n="strict 16bit dsptype" ;;
+ 333/*3) ac_cv_data_model="DSP24" ; n="strict 24bit dsptype" ;;
+ 444/*4) ac_cv_data_model="DSP32" ; n="strict 32bit dsptype" ;;
+ 666/*6) ac_cv_data_model="DSP48" ; n="strict 48bit dsptype" ;;
+ 888/*8) ac_cv_data_model="DSP64" ; n="strict 64bit dsptype" ;;
+ 222/*|333/*|444/*|666/*|888/*) :
+ ac_cv_data_model="iDSP" ; n="unusual dsptype" ;;
+ *) ac_cv_data_model="none" ; n="very unusual model" ;;
+ esac
+ AC_MSG_RESULT([$ac_cv_data_model ($ac_cv_long_data_model, $n)])
+])
+
+dnl AX_CHECK_HEADER_STDINT_X([HEADERLIST][,ACTION-IF])
+AC_DEFUN([AX_CHECK_HEADER_STDINT_X],[
+AC_CACHE_CHECK([for stdint uintptr_t], [ac_cv_header_stdint_x],[
+ ac_cv_header_stdint_x="" # the 1997 typedefs (inttypes.h)
+ AC_MSG_RESULT([(..)])
+ for i in m4_ifval([$1],[$1],[stdint.h inttypes.h sys/inttypes.h sys/types.h])
+ do
+ unset ac_cv_type_uintptr_t
+ unset ac_cv_type_uint64_t
+ AC_CHECK_TYPE(uintptr_t,[ac_cv_header_stdint_x=$i],continue,[#include <$i>])
+ AC_CHECK_TYPE(uint64_t,[and64="/uint64_t"],[and64=""],[#include<$i>])
+ m4_ifvaln([$1],[$1]) break
+ done
+ AC_MSG_CHECKING([for stdint uintptr_t])
+ ])
+])
+
+AC_DEFUN([AX_CHECK_HEADER_STDINT_O],[
+AC_CACHE_CHECK([for stdint uint32_t], [ac_cv_header_stdint_o],[
+ ac_cv_header_stdint_o="" # the 1995 typedefs (sys/inttypes.h)
+ AC_MSG_RESULT([(..)])
+ for i in m4_ifval([$1],[$1],[inttypes.h sys/inttypes.h sys/types.h stdint.h])
+ do
+ unset ac_cv_type_uint32_t
+ unset ac_cv_type_uint64_t
+ AC_CHECK_TYPE(uint32_t,[ac_cv_header_stdint_o=$i],continue,[#include <$i>])
+ AC_CHECK_TYPE(uint64_t,[and64="/uint64_t"],[and64=""],[#include<$i>])
+ m4_ifvaln([$1],[$1]) break
+ break;
+ done
+ AC_MSG_CHECKING([for stdint uint32_t])
+ ])
+])
+
+AC_DEFUN([AX_CHECK_HEADER_STDINT_U],[
+AC_CACHE_CHECK([for stdint u_int32_t], [ac_cv_header_stdint_u],[
+ ac_cv_header_stdint_u="" # the BSD typedefs (sys/types.h)
+ AC_MSG_RESULT([(..)])
+ for i in m4_ifval([$1],[$1],[sys/types.h inttypes.h sys/inttypes.h]) ; do
+ unset ac_cv_type_u_int32_t
+ unset ac_cv_type_u_int64_t
+ AC_CHECK_TYPE(u_int32_t,[ac_cv_header_stdint_u=$i],continue,[#include <$i>])
+ AC_CHECK_TYPE(u_int64_t,[and64="/u_int64_t"],[and64=""],[#include<$i>])
+ m4_ifvaln([$1],[$1]) break
+ break;
+ done
+ AC_MSG_CHECKING([for stdint u_int32_t])
+ ])
+])
+
+AC_DEFUN([AX_CREATE_STDINT_H],
+[# ------ AX CREATE STDINT H -------------------------------------
+AC_MSG_CHECKING([for stdint types])
+ac_stdint_h=`echo ifelse($1, , _stdint.h, $1)`
+# try to shortcircuit - if the default include path of the compiler
+# can find a "stdint.h" header then we assume that all compilers can.
+AC_CACHE_VAL([ac_cv_header_stdint_t],[
+old_CXXFLAGS="$CXXFLAGS" ; CXXFLAGS=""
+old_CPPFLAGS="$CPPFLAGS" ; CPPFLAGS=""
+old_CFLAGS="$CFLAGS" ; CFLAGS=""
+AC_TRY_COMPILE([#include <stdint.h>],[int_least32_t v = 0;],
+[ac_cv_stdint_result="(assuming C99 compatible system)"
+ ac_cv_header_stdint_t="stdint.h"; ],
+[ac_cv_header_stdint_t=""])
+if test "$GCC" = "yes" && test ".$ac_cv_header_stdint_t" = "."; then
+CFLAGS="-std=c99"
+AC_TRY_COMPILE([#include <stdint.h>],[int_least32_t v = 0;],
+[AC_MSG_WARN(your GCC compiler has a defunct stdint.h for its default-mode)])
+fi
+CXXFLAGS="$old_CXXFLAGS"
+CPPFLAGS="$old_CPPFLAGS"
+CFLAGS="$old_CFLAGS" ])
+
+v="... $ac_cv_header_stdint_h"
+if test "$ac_stdint_h" = "stdint.h" ; then
+ AC_MSG_RESULT([(are you sure you want them in ./stdint.h?)])
+elif test "$ac_stdint_h" = "inttypes.h" ; then
+ AC_MSG_RESULT([(are you sure you want them in ./inttypes.h?)])
+elif test "_$ac_cv_header_stdint_t" = "_" ; then
+ AC_MSG_RESULT([(putting them into $ac_stdint_h)$v])
+else
+ ac_cv_header_stdint="$ac_cv_header_stdint_t"
+ AC_MSG_RESULT([$ac_cv_header_stdint (shortcircuit)])
+fi
+
+if test "_$ac_cv_header_stdint_t" = "_" ; then # can not shortcircuit..
+
+dnl .....intro message done, now do a few system checks.....
+dnl btw, all old CHECK_TYPE macros do automatically "DEFINE" a type,
+dnl therefore we use the autoconf implementation detail CHECK_TYPE_NEW
+dnl instead that is triggered with 3 or more arguments (see types.m4)
+
+inttype_headers=`echo $2 | sed -e 's/,/ /g'`
+
+ac_cv_stdint_result="(no helpful system typedefs seen)"
+AX_CHECK_HEADER_STDINT_X(dnl
+ stdint.h inttypes.h sys/inttypes.h $inttype_headers,
+ ac_cv_stdint_result="(seen uintptr_t$and64 in $i)")
+
+if test "_$ac_cv_header_stdint_x" = "_" ; then
+AX_CHECK_HEADER_STDINT_O(dnl,
+ inttypes.h sys/inttypes.h stdint.h $inttype_headers,
+ ac_cv_stdint_result="(seen uint32_t$and64 in $i)")
+fi
+
+if test "_$ac_cv_header_stdint_x" = "_" ; then
+if test "_$ac_cv_header_stdint_o" = "_" ; then
+AX_CHECK_HEADER_STDINT_U(dnl,
+ sys/types.h inttypes.h sys/inttypes.h $inttype_headers,
+ ac_cv_stdint_result="(seen u_int32_t$and64 in $i)")
+fi fi
+
+dnl if there was no good C99 header file, do some typedef checks...
+if test "_$ac_cv_header_stdint_x" = "_" ; then
+ AC_MSG_CHECKING([for stdint datatype model])
+ AC_MSG_RESULT([(..)])
+ AX_CHECK_DATA_MODEL
+fi
+
+if test "_$ac_cv_header_stdint_x" != "_" ; then
+ ac_cv_header_stdint="$ac_cv_header_stdint_x"
+elif test "_$ac_cv_header_stdint_o" != "_" ; then
+ ac_cv_header_stdint="$ac_cv_header_stdint_o"
+elif test "_$ac_cv_header_stdint_u" != "_" ; then
+ ac_cv_header_stdint="$ac_cv_header_stdint_u"
+else
+ ac_cv_header_stdint="stddef.h"
+fi
+
+AC_MSG_CHECKING([for extra inttypes in chosen header])
+AC_MSG_RESULT([($ac_cv_header_stdint)])
+dnl see if int_least and int_fast types are present in _this_ header.
+unset ac_cv_type_int_least32_t
+unset ac_cv_type_int_fast32_t
+AC_CHECK_TYPE(int_least32_t,,,[#include <$ac_cv_header_stdint>])
+AC_CHECK_TYPE(int_fast32_t,,,[#include<$ac_cv_header_stdint>])
+AC_CHECK_TYPE(intmax_t,,,[#include <$ac_cv_header_stdint>])
+
+fi # shortcircut to system "stdint.h"
+# ------------------ PREPARE VARIABLES ------------------------------
+if test "$GCC" = "yes" ; then
+ac_cv_stdint_message="using gnu compiler "`$CC --version | head -1`
+else
+ac_cv_stdint_message="using $CC"
+fi
+
+AC_MSG_RESULT([make use of $ac_cv_header_stdint in $ac_stdint_h dnl
+$ac_cv_stdint_result])
+
+dnl -----------------------------------------------------------------
+# ----------------- DONE inttypes.h checks START header -------------
+AC_CONFIG_COMMANDS([$ac_stdint_h],[
+AC_MSG_NOTICE(creating $ac_stdint_h : $_ac_stdint_h)
+ac_stdint=$tmp/_stdint.h
+
+echo "#ifndef" $_ac_stdint_h >$ac_stdint
+echo "#define" $_ac_stdint_h "1" >>$ac_stdint
+echo "#ifndef" _GENERATED_STDINT_H >>$ac_stdint
+echo "#define" _GENERATED_STDINT_H '"'$PACKAGE $VERSION'"' >>$ac_stdint
+echo "/* generated $ac_cv_stdint_message */" >>$ac_stdint
+if test "_$ac_cv_header_stdint_t" != "_" ; then
+echo "#define _STDINT_HAVE_STDINT_H" "1" >>$ac_stdint
+echo "#include <stdint.h>" >>$ac_stdint
+echo "#endif" >>$ac_stdint
+echo "#endif" >>$ac_stdint
+else
+
+cat >>$ac_stdint <<STDINT_EOF
+
+/* ................... shortcircuit part ........................... */
+
+#if defined HAVE_STDINT_H || defined _STDINT_HAVE_STDINT_H
+#include <stdint.h>
+#else
+#include <stddef.h>
+
+/* .................... configured part ............................ */
+
+STDINT_EOF
+
+echo "/* whether we have a C99 compatible stdint header file */" >>$ac_stdint
+if test "_$ac_cv_header_stdint_x" != "_" ; then
+ ac_header="$ac_cv_header_stdint_x"
+ echo "#define _STDINT_HEADER_INTPTR" '"'"$ac_header"'"' >>$ac_stdint
+else
+ echo "/* #undef _STDINT_HEADER_INTPTR */" >>$ac_stdint
+fi
+
+echo "/* whether we have a C96 compatible inttypes header file */" >>$ac_stdint
+if test "_$ac_cv_header_stdint_o" != "_" ; then
+ ac_header="$ac_cv_header_stdint_o"
+ echo "#define _STDINT_HEADER_UINT32" '"'"$ac_header"'"' >>$ac_stdint
+else
+ echo "/* #undef _STDINT_HEADER_UINT32 */" >>$ac_stdint
+fi
+
+echo "/* whether we have a BSD compatible inet types header */" >>$ac_stdint
+if test "_$ac_cv_header_stdint_u" != "_" ; then
+ ac_header="$ac_cv_header_stdint_u"
+ echo "#define _STDINT_HEADER_U_INT32" '"'"$ac_header"'"' >>$ac_stdint
+else
+ echo "/* #undef _STDINT_HEADER_U_INT32 */" >>$ac_stdint
+fi
+
+echo "" >>$ac_stdint
+
+if test "_$ac_header" != "_" ; then if test "$ac_header" != "stddef.h" ; then
+ echo "#include <$ac_header>" >>$ac_stdint
+ echo "" >>$ac_stdint
+fi fi
+
+echo "/* which 64bit typedef has been found */" >>$ac_stdint
+if test "$ac_cv_type_uint64_t" = "yes" ; then
+echo "#define _STDINT_HAVE_UINT64_T" "1" >>$ac_stdint
+else
+echo "/* #undef _STDINT_HAVE_UINT64_T */" >>$ac_stdint
+fi
+if test "$ac_cv_type_u_int64_t" = "yes" ; then
+echo "#define _STDINT_HAVE_U_INT64_T" "1" >>$ac_stdint
+else
+echo "/* #undef _STDINT_HAVE_U_INT64_T */" >>$ac_stdint
+fi
+echo "" >>$ac_stdint
+
+echo "/* which type model has been detected */" >>$ac_stdint
+if test "_$ac_cv_char_data_model" != "_" ; then
+echo "#define _STDINT_CHAR_MODEL" "$ac_cv_char_data_model" >>$ac_stdint
+echo "#define _STDINT_LONG_MODEL" "$ac_cv_long_data_model" >>$ac_stdint
+else
+echo "/* #undef _STDINT_CHAR_MODEL // skipped */" >>$ac_stdint
+echo "/* #undef _STDINT_LONG_MODEL // skipped */" >>$ac_stdint
+fi
+echo "" >>$ac_stdint
+
+echo "/* whether int_least types were detected */" >>$ac_stdint
+if test "$ac_cv_type_int_least32_t" = "yes"; then
+echo "#define _STDINT_HAVE_INT_LEAST32_T" "1" >>$ac_stdint
+else
+echo "/* #undef _STDINT_HAVE_INT_LEAST32_T */" >>$ac_stdint
+fi
+echo "/* whether int_fast types were detected */" >>$ac_stdint
+if test "$ac_cv_type_int_fast32_t" = "yes"; then
+echo "#define _STDINT_HAVE_INT_FAST32_T" "1" >>$ac_stdint
+else
+echo "/* #undef _STDINT_HAVE_INT_FAST32_T */" >>$ac_stdint
+fi
+echo "/* whether intmax_t type was detected */" >>$ac_stdint
+if test "$ac_cv_type_intmax_t" = "yes"; then
+echo "#define _STDINT_HAVE_INTMAX_T" "1" >>$ac_stdint
+else
+echo "/* #undef _STDINT_HAVE_INTMAX_T */" >>$ac_stdint
+fi
+echo "" >>$ac_stdint
+
+ cat >>$ac_stdint <<STDINT_EOF
+/* .................... detections part ............................ */
+
+/* whether we need to define bitspecific types from compiler base types */
+#ifndef _STDINT_HEADER_INTPTR
+#ifndef _STDINT_HEADER_UINT32
+#ifndef _STDINT_HEADER_U_INT32
+#define _STDINT_NEED_INT_MODEL_T
+#else
+#define _STDINT_HAVE_U_INT_TYPES
+#endif
+#endif
+#endif
+
+#ifdef _STDINT_HAVE_U_INT_TYPES
+#undef _STDINT_NEED_INT_MODEL_T
+#endif
+
+#ifdef _STDINT_CHAR_MODEL
+#if _STDINT_CHAR_MODEL+0 == 122 || _STDINT_CHAR_MODEL+0 == 124
+#ifndef _STDINT_BYTE_MODEL
+#define _STDINT_BYTE_MODEL 12
+#endif
+#endif
+#endif
+
+#ifndef _STDINT_HAVE_INT_LEAST32_T
+#define _STDINT_NEED_INT_LEAST_T
+#endif
+
+#ifndef _STDINT_HAVE_INT_FAST32_T
+#define _STDINT_NEED_INT_FAST_T
+#endif
+
+#ifndef _STDINT_HEADER_INTPTR
+#define _STDINT_NEED_INTPTR_T
+#ifndef _STDINT_HAVE_INTMAX_T
+#define _STDINT_NEED_INTMAX_T
+#endif
+#endif
+
+
+/* .................... definition part ............................ */
+
+/* some system headers have good uint64_t */
+#ifndef _HAVE_UINT64_T
+#if defined _STDINT_HAVE_UINT64_T || defined HAVE_UINT64_T
+#define _HAVE_UINT64_T
+#elif defined _STDINT_HAVE_U_INT64_T || defined HAVE_U_INT64_T
+#define _HAVE_UINT64_T
+typedef u_int64_t uint64_t;
+#endif
+#endif
+
+#ifndef _HAVE_UINT64_T
+/* .. here are some common heuristics using compiler runtime specifics */
+#if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
+#define _HAVE_UINT64_T
+#define _HAVE_LONGLONG_UINT64_T
+typedef long long int64_t;
+typedef unsigned long long uint64_t;
+
+#elif !defined __STRICT_ANSI__
+#if defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__
+#define _HAVE_UINT64_T
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+
+#elif defined __GNUC__ || defined __MWERKS__ || defined __ELF__
+/* note: all ELF-systems seem to have loff-support which needs 64-bit */
+#if !defined _NO_LONGLONG
+#define _HAVE_UINT64_T
+#define _HAVE_LONGLONG_UINT64_T
+typedef long long int64_t;
+typedef unsigned long long uint64_t;
+#endif
+
+#elif defined __alpha || (defined __mips && defined _ABIN32)
+#if !defined _NO_LONGLONG
+typedef long int64_t;
+typedef unsigned long uint64_t;
+#endif
+ /* compiler/cpu type to define int64_t */
+#endif
+#endif
+#endif
+
+#if defined _STDINT_HAVE_U_INT_TYPES
+/* int8_t int16_t int32_t defined by inet code, redeclare the u_intXX types */
+typedef u_int8_t uint8_t;
+typedef u_int16_t uint16_t;
+typedef u_int32_t uint32_t;
+
+/* glibc compatibility */
+#ifndef __int8_t_defined
+#define __int8_t_defined
+#endif
+#endif
+
+#ifdef _STDINT_NEED_INT_MODEL_T
+/* we must guess all the basic types. Apart from byte-addressable systems, */
+/* there are a few 32-bit-only dsp-systems that we guard with BYTE_MODEL 8-} */
+/* (btw, those nibble-addressable systems are way off, or so we assume) */
+
+dnl /* have a look at "64bit and data size neutrality" at */
+dnl /* http://unix.org/version2/whatsnew/login_64bit.html */
+dnl /* (the shorthand "ILP" types always have a "P" part) */
+
+#if defined _STDINT_BYTE_MODEL
+#if _STDINT_LONG_MODEL+0 == 242
+/* 2:4:2 = IP16 = a normal 16-bit system */
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned long uint32_t;
+#ifndef __int8_t_defined
+#define __int8_t_defined
+typedef char int8_t;
+typedef short int16_t;
+typedef long int32_t;
+#endif
+#elif _STDINT_LONG_MODEL+0 == 244 || _STDINT_LONG_MODEL == 444
+/* 2:4:4 = LP32 = a 32-bit system derived from a 16-bit */
+/* 4:4:4 = ILP32 = a normal 32-bit system */
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+#ifndef __int8_t_defined
+#define __int8_t_defined
+typedef char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+#endif
+#elif _STDINT_LONG_MODEL+0 == 484 || _STDINT_LONG_MODEL+0 == 488
+/* 4:8:4 = IP32 = a 32-bit system prepared for 64-bit */
+/* 4:8:8 = LP64 = a normal 64-bit system */
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+#ifndef __int8_t_defined
+#define __int8_t_defined
+typedef char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+#endif
+/* this system has a "long" of 64bit */
+#ifndef _HAVE_UINT64_T
+#define _HAVE_UINT64_T
+typedef unsigned long uint64_t;
+typedef long int64_t;
+#endif
+#elif _STDINT_LONG_MODEL+0 == 448
+/* LLP64 a 64-bit system derived from a 32-bit system */
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+#ifndef __int8_t_defined
+#define __int8_t_defined
+typedef char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+#endif
+/* assuming the system has a "long long" */
+#ifndef _HAVE_UINT64_T
+#define _HAVE_UINT64_T
+#define _HAVE_LONGLONG_UINT64_T
+typedef unsigned long long uint64_t;
+typedef long long int64_t;
+#endif
+#else
+#define _STDINT_NO_INT32_T
+#endif
+#else
+#define _STDINT_NO_INT8_T
+#define _STDINT_NO_INT32_T
+#endif
+#endif
+
+/*
+ * quote from SunOS-5.8 sys/inttypes.h:
+ * Use at your own risk. As of February 1996, the committee is squarely
+ * behind the fixed sized types; the "least" and "fast" types are still being
+ * discussed. The probability that the "fast" types may be removed before
+ * the standard is finalized is high enough that they are not currently
+ * implemented.
+ */
+
+#if defined _STDINT_NEED_INT_LEAST_T
+typedef int8_t int_least8_t;
+typedef int16_t int_least16_t;
+typedef int32_t int_least32_t;
+#ifdef _HAVE_UINT64_T
+typedef int64_t int_least64_t;
+#endif
+
+typedef uint8_t uint_least8_t;
+typedef uint16_t uint_least16_t;
+typedef uint32_t uint_least32_t;
+#ifdef _HAVE_UINT64_T
+typedef uint64_t uint_least64_t;
+#endif
+ /* least types */
+#endif
+
+#if defined _STDINT_NEED_INT_FAST_T
+typedef int8_t int_fast8_t;
+typedef int int_fast16_t;
+typedef int32_t int_fast32_t;
+#ifdef _HAVE_UINT64_T
+typedef int64_t int_fast64_t;
+#endif
+
+typedef uint8_t uint_fast8_t;
+typedef unsigned uint_fast16_t;
+typedef uint32_t uint_fast32_t;
+#ifdef _HAVE_UINT64_T
+typedef uint64_t uint_fast64_t;
+#endif
+ /* fast types */
+#endif
+
+#ifdef _STDINT_NEED_INTMAX_T
+#ifdef _HAVE_UINT64_T
+typedef int64_t intmax_t;
+typedef uint64_t uintmax_t;
+#else
+typedef long intmax_t;
+typedef unsigned long uintmax_t;
+#endif
+#endif
+
+#ifdef _STDINT_NEED_INTPTR_T
+#ifndef __intptr_t_defined
+#define __intptr_t_defined
+/* we encourage using "long" to store pointer values, never use "int" ! */
+#if _STDINT_LONG_MODEL+0 == 242 || _STDINT_LONG_MODEL+0 == 484
+typedef unsigned int uintptr_t;
+typedef int intptr_t;
+#elif _STDINT_LONG_MODEL+0 == 244 || _STDINT_LONG_MODEL+0 == 444
+typedef unsigned long uintptr_t;
+typedef long intptr_t;
+#elif _STDINT_LONG_MODEL+0 == 448 && defined _HAVE_UINT64_T
+typedef uint64_t uintptr_t;
+typedef int64_t intptr_t;
+#else /* matches typical system types ILP32 and LP64 - but not IP16 or LLP64 */
+typedef unsigned long uintptr_t;
+typedef long intptr_t;
+#endif
+#endif
+#endif
+
+/* The ISO C99 standard specifies that in C++ implementations these
+ should only be defined if explicitly requested. */
+#if !defined __cplusplus || defined __STDC_CONSTANT_MACROS
+#ifndef UINT32_C
+
+/* Signed. */
+# define INT8_C(c) c
+# define INT16_C(c) c
+# define INT32_C(c) c
+# ifdef _HAVE_LONGLONG_UINT64_T
+# define INT64_C(c) c ## L
+# else
+# define INT64_C(c) c ## LL
+# endif
+
+/* Unsigned. */
+# define UINT8_C(c) c ## U
+# define UINT16_C(c) c ## U
+# define UINT32_C(c) c ## U
+# ifdef _HAVE_LONGLONG_UINT64_T
+# define UINT64_C(c) c ## UL
+# else
+# define UINT64_C(c) c ## ULL
+# endif
+
+/* Maximal type. */
+# ifdef _HAVE_LONGLONG_UINT64_T
+# define INTMAX_C(c) c ## L
+# define UINTMAX_C(c) c ## UL
+# else
+# define INTMAX_C(c) c ## LL
+# define UINTMAX_C(c) c ## ULL
+# endif
+
+ /* literalnumbers */
+#endif
+#endif
+
+/* These limits are merely those of a two's complement byte-oriented system */
+
+/* Minimum of signed integral types. */
+# define INT8_MIN (-128)
+# define INT16_MIN (-32767-1)
+# define INT32_MIN (-2147483647-1)
+# define INT64_MIN (-__INT64_C(9223372036854775807)-1)
+/* Maximum of signed integral types. */
+# define INT8_MAX (127)
+# define INT16_MAX (32767)
+# define INT32_MAX (2147483647)
+# define INT64_MAX (__INT64_C(9223372036854775807))
+
+/* Maximum of unsigned integral types. */
+# define UINT8_MAX (255)
+# define UINT16_MAX (65535)
+# define UINT32_MAX (4294967295U)
+# define UINT64_MAX (__UINT64_C(18446744073709551615))
+
+/* Minimum of signed integral types having a minimum size. */
+# define INT_LEAST8_MIN INT8_MIN
+# define INT_LEAST16_MIN INT16_MIN
+# define INT_LEAST32_MIN INT32_MIN
+# define INT_LEAST64_MIN INT64_MIN
+/* Maximum of signed integral types having a minimum size. */
+# define INT_LEAST8_MAX INT8_MAX
+# define INT_LEAST16_MAX INT16_MAX
+# define INT_LEAST32_MAX INT32_MAX
+# define INT_LEAST64_MAX INT64_MAX
+
+/* Maximum of unsigned integral types having a minimum size. */
+# define UINT_LEAST8_MAX UINT8_MAX
+# define UINT_LEAST16_MAX UINT16_MAX
+# define UINT_LEAST32_MAX UINT32_MAX
+# define UINT_LEAST64_MAX UINT64_MAX
+
+ /* shortcircuit*/
+#endif
+ /* once */
+#endif
+#endif
+STDINT_EOF
+fi
+ if cmp -s $ac_stdint_h $ac_stdint 2>/dev/null; then
+ AC_MSG_NOTICE([$ac_stdint_h is unchanged])
+ else
+ ac_dir=`AS_DIRNAME(["$ac_stdint_h"])`
+ AS_MKDIR_P(["$ac_dir"])
+ rm -f $ac_stdint_h
+ mv $ac_stdint $ac_stdint_h
+ fi
+],[# variables for create stdint.h replacement
+PACKAGE="$PACKAGE"
+VERSION="$VERSION"
+ac_stdint_h="$ac_stdint_h"
+_ac_stdint_h=AS_TR_CPP(_$PACKAGE-$ac_stdint_h)
+ac_cv_stdint_message="$ac_cv_stdint_message"
+ac_cv_header_stdint_t="$ac_cv_header_stdint_t"
+ac_cv_header_stdint_x="$ac_cv_header_stdint_x"
+ac_cv_header_stdint_o="$ac_cv_header_stdint_o"
+ac_cv_header_stdint_u="$ac_cv_header_stdint_u"
+ac_cv_type_uint64_t="$ac_cv_type_uint64_t"
+ac_cv_type_u_int64_t="$ac_cv_type_u_int64_t"
+ac_cv_char_data_model="$ac_cv_char_data_model"
+ac_cv_long_data_model="$ac_cv_long_data_model"
+ac_cv_type_int_least32_t="$ac_cv_type_int_least32_t"
+ac_cv_type_int_fast32_t="$ac_cv_type_int_fast32_t"
+ac_cv_type_intmax_t="$ac_cv_type_intmax_t"
+])
+])
diff --git a/nestegg/m4/pkg.m4 b/nestegg/m4/pkg.m4
new file mode 100644
index 000000000..996e29454
--- /dev/null
+++ b/nestegg/m4/pkg.m4
@@ -0,0 +1,157 @@
+# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
+#
+# Copyright © 2004 Scott James Remnant <scott@netsplit.com>.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# PKG_PROG_PKG_CONFIG([MIN-VERSION])
+# ----------------------------------
+AC_DEFUN([PKG_PROG_PKG_CONFIG],
+[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
+m4_pattern_allow([^PKG_CONFIG(_PATH)?$])
+AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])dnl
+if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
+ AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
+fi
+if test -n "$PKG_CONFIG"; then
+ _pkg_min_version=m4_default([$1], [0.9.0])
+ AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
+ if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ PKG_CONFIG=""
+ fi
+
+fi[]dnl
+])# PKG_PROG_PKG_CONFIG
+
+# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+#
+# Check to see whether a particular set of modules exists. Similar
+# to PKG_CHECK_MODULES(), but does not set variables or print errors.
+#
+#
+# Similar to PKG_CHECK_MODULES, make sure that the first instance of
+# this or PKG_CHECK_MODULES is called, or make sure to call
+# PKG_CHECK_EXISTS manually
+# --------------------------------------------------------------
+AC_DEFUN([PKG_CHECK_EXISTS],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+if test -n "$PKG_CONFIG" && \
+ AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
+ m4_ifval([$2], [$2], [:])
+m4_ifvaln([$3], [else
+ $3])dnl
+fi])
+
+
+# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
+# ---------------------------------------------
+m4_define([_PKG_CONFIG],
+[if test -n "$PKG_CONFIG"; then
+ if test -n "$$1"; then
+ pkg_cv_[]$1="$$1"
+ else
+ PKG_CHECK_EXISTS([$3],
+ [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`],
+ [pkg_failed=yes])
+ fi
+else
+ pkg_failed=untried
+fi[]dnl
+])# _PKG_CONFIG
+
+# _PKG_SHORT_ERRORS_SUPPORTED
+# -----------------------------
+AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+ _pkg_short_errors_supported=yes
+else
+ _pkg_short_errors_supported=no
+fi[]dnl
+])# _PKG_SHORT_ERRORS_SUPPORTED
+
+
+# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+# [ACTION-IF-NOT-FOUND])
+#
+#
+# Note that if there is a possibility the first call to
+# PKG_CHECK_MODULES might not happen, you should be sure to include an
+# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
+#
+#
+# --------------------------------------------------------------
+AC_DEFUN([PKG_CHECK_MODULES],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
+AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
+
+pkg_failed=no
+AC_MSG_CHECKING([for $1])
+
+_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
+_PKG_CONFIG([$1][_LIBS], [libs], [$2])
+
+m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
+and $1[]_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.])
+
+if test $pkg_failed = yes; then
+ _PKG_SHORT_ERRORS_SUPPORTED
+ if test $_pkg_short_errors_supported = yes; then
+ $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --errors-to-stdout --print-errors "$2"`
+ else
+ $1[]_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "$2"`
+ fi
+ # Put the nasty error message in config.log where it belongs
+ echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
+
+ ifelse([$4], , [AC_MSG_ERROR(dnl
+[Package requirements ($2) were not met:
+
+$$1_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+_PKG_TEXT
+])],
+ [AC_MSG_RESULT([no])
+ $4])
+elif test $pkg_failed = untried; then
+ ifelse([$4], , [AC_MSG_FAILURE(dnl
+[The pkg-config script could not be found or is too old. Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+_PKG_TEXT
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.])],
+ [$4])
+else
+ $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
+ $1[]_LIBS=$pkg_cv_[]$1[]_LIBS
+ AC_MSG_RESULT([yes])
+ ifelse([$3], , :, [$3])
+fi[]dnl
+])# PKG_CHECK_MODULES
diff --git a/nestegg/nestegg-uninstalled.pc.in b/nestegg/nestegg-uninstalled.pc.in
new file mode 100644
index 000000000..19bb680ac
--- /dev/null
+++ b/nestegg/nestegg-uninstalled.pc.in
@@ -0,0 +1,13 @@
+# nestegg uninstalled pkg-config file
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: nestegg
+Description: WebM/Matroska demuxer
+Version: @VERSION@
+Conflicts:
+Libs: -L${libdir} -lnestegg
+Cflags: -I${includedir}
diff --git a/nestegg/nestegg.pc.in b/nestegg/nestegg.pc.in
new file mode 100644
index 000000000..32c09d79d
--- /dev/null
+++ b/nestegg/nestegg.pc.in
@@ -0,0 +1,13 @@
+# nestegg installed pkg-config file
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: nestegg
+Description: WebM/Matroska demuxer
+Version: @VERSION@
+Conflicts:
+Libs: -L${libdir} -lnestegg
+Cflags: -I${includedir}
diff --git a/nestegg/src/nestegg.c b/nestegg/src/nestegg.c
new file mode 100644
index 000000000..63a0e83e5
--- /dev/null
+++ b/nestegg/src/nestegg.c
@@ -0,0 +1,1938 @@
+/*
+ * Copyright © 2010 Mozilla Foundation
+ *
+ * This program is made available under an ISC-style license. See the
+ * accompanying file LICENSE for details.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "nestegg/halloc/halloc.h"
+#include "nestegg/include/nestegg/nestegg.h"
+
+/* EBML Elements */
+#define ID_EBML 0x1a45dfa3
+#define ID_EBML_VERSION 0x4286
+#define ID_EBML_READ_VERSION 0x42f7
+#define ID_EBML_MAX_ID_LENGTH 0x42f2
+#define ID_EBML_MAX_SIZE_LENGTH 0x42f3
+#define ID_DOCTYPE 0x4282
+#define ID_DOCTYPE_VERSION 0x4287
+#define ID_DOCTYPE_READ_VERSION 0x4285
+
+/* Global Elements */
+#define ID_VOID 0xec
+#define ID_CRC32 0xbf
+
+/* WebMedia Elements */
+#define ID_SEGMENT 0x18538067
+
+/* Seek Head Elements */
+#define ID_SEEK_HEAD 0x114d9b74
+#define ID_SEEK 0x4dbb
+#define ID_SEEK_ID 0x53ab
+#define ID_SEEK_POSITION 0x53ac
+
+/* Info Elements */
+#define ID_INFO 0x1549a966
+#define ID_TIMECODE_SCALE 0x2ad7b1
+#define ID_DURATION 0x4489
+
+/* Cluster Elements */
+#define ID_CLUSTER 0x1f43b675
+#define ID_TIMECODE 0xe7
+#define ID_BLOCK_GROUP 0xa0
+#define ID_SIMPLE_BLOCK 0xa3
+
+/* BlockGroup Elements */
+#define ID_BLOCK 0xa1
+#define ID_BLOCK_DURATION 0x9b
+#define ID_REFERENCE_BLOCK 0xfb
+
+/* Tracks Elements */
+#define ID_TRACKS 0x1654ae6b
+#define ID_TRACK_ENTRY 0xae
+#define ID_TRACK_NUMBER 0xd7
+#define ID_TRACK_UID 0x73c5
+#define ID_TRACK_TYPE 0x83
+#define ID_FLAG_ENABLED 0xb9
+#define ID_FLAG_DEFAULT 0x88
+#define ID_FLAG_LACING 0x9c
+#define ID_TRACK_TIMECODE_SCALE 0x23314f
+#define ID_LANGUAGE 0x22b59c
+#define ID_CODEC_ID 0x86
+#define ID_CODEC_PRIVATE 0x63a2
+
+/* Video Elements */
+#define ID_VIDEO 0xe0
+#define ID_PIXEL_WIDTH 0xb0
+#define ID_PIXEL_HEIGHT 0xba
+#define ID_PIXEL_CROP_BOTTOM 0x54aa
+#define ID_PIXEL_CROP_TOP 0x54bb
+#define ID_PIXEL_CROP_LEFT 0x54cc
+#define ID_PIXEL_CROP_RIGHT 0x54dd
+#define ID_DISPLAY_WIDTH 0x54b0
+#define ID_DISPLAY_HEIGHT 0x54ba
+
+/* Audio Elements */
+#define ID_AUDIO 0xe1
+#define ID_SAMPLING_FREQUENCY 0xb5
+#define ID_CHANNELS 0x9f
+#define ID_BIT_DEPTH 0x6264
+
+/* Cues Elements */
+#define ID_CUES 0x1c53bb6b
+#define ID_CUE_POINT 0xbb
+#define ID_CUE_TIME 0xb3
+#define ID_CUE_TRACK_POSITIONS 0xb7
+#define ID_CUE_TRACK 0xf7
+#define ID_CUE_CLUSTER_POSITION 0xf1
+#define ID_CUE_BLOCK_NUMBER 0x5378
+
+/* EBML Types */
+enum ebml_type_enum {
+ TYPE_UNKNOWN,
+ TYPE_MASTER,
+ TYPE_UINT,
+ TYPE_FLOAT,
+ TYPE_INT,
+ TYPE_STRING,
+ TYPE_BINARY
+};
+
+#define LIMIT_STRING (1 << 20)
+#define LIMIT_BINARY (1 << 24)
+#define LIMIT_BLOCK (1 << 30)
+#define LIMIT_FRAME (1 << 28)
+
+/* Field Flags */
+#define DESC_FLAG_NONE 0
+#define DESC_FLAG_MULTI (1 << 0)
+#define DESC_FLAG_SUSPEND (1 << 1)
+#define DESC_FLAG_OFFSET (1 << 2)
+
+/* Block Header Flags */
+#define BLOCK_FLAGS_LACING 6
+
+/* Lacing Constants */
+#define LACING_NONE 0
+#define LACING_XIPH 1
+#define LACING_FIXED 2
+#define LACING_EBML 3
+
+/* Track Types */
+#define TRACK_TYPE_VIDEO 1
+#define TRACK_TYPE_AUDIO 2
+
+/* Track IDs */
+#define TRACK_ID_VP8 "V_VP8"
+#define TRACK_ID_VORBIS "A_VORBIS"
+
+enum vint_mask {
+ MASK_NONE,
+ MASK_FIRST_BIT
+};
+
+struct ebml_binary {
+ unsigned char * data;
+ size_t length;
+};
+
+struct ebml_list_node {
+ struct ebml_list_node * next;
+ uint64_t id;
+ void * data;
+};
+
+struct ebml_list {
+ struct ebml_list_node * head;
+ struct ebml_list_node * tail;
+};
+
+struct ebml_type {
+ union ebml_value {
+ uint64_t u;
+ double f;
+ int64_t i;
+ char * s;
+ struct ebml_binary b;
+ } v;
+ enum ebml_type_enum type;
+ int read;
+};
+
+/* EBML Definitions */
+struct ebml {
+ struct ebml_type ebml_version;
+ struct ebml_type ebml_read_version;
+ struct ebml_type ebml_max_id_length;
+ struct ebml_type ebml_max_size_length;
+ struct ebml_type doctype;
+ struct ebml_type doctype_version;
+ struct ebml_type doctype_read_version;
+};
+
+/* Matroska Definitions */
+struct seek {
+ struct ebml_type id;
+ struct ebml_type position;
+};
+
+struct seek_head {
+ struct ebml_list seek;
+};
+
+struct info {
+ struct ebml_type timecode_scale;
+ struct ebml_type duration;
+};
+
+struct block_group {
+ struct ebml_type duration;
+ struct ebml_type reference_block;
+};
+
+struct cluster {
+ struct ebml_type timecode;
+ struct ebml_list block_group;
+};
+
+struct video {
+ struct ebml_type pixel_width;
+ struct ebml_type pixel_height;
+ struct ebml_type pixel_crop_bottom;
+ struct ebml_type pixel_crop_top;
+ struct ebml_type pixel_crop_left;
+ struct ebml_type pixel_crop_right;
+ struct ebml_type display_width;
+ struct ebml_type display_height;
+};
+
+struct audio {
+ struct ebml_type sampling_frequency;
+ struct ebml_type channels;
+ struct ebml_type bit_depth;
+};
+
+struct track_entry {
+ struct ebml_type number;
+ struct ebml_type uid;
+ struct ebml_type type;
+ struct ebml_type flag_enabled;
+ struct ebml_type flag_default;
+ struct ebml_type flag_lacing;
+ struct ebml_type track_timecode_scale;
+ struct ebml_type language;
+ struct ebml_type codec_id;
+ struct ebml_type codec_private;
+ struct video video;
+ struct audio audio;
+};
+
+struct tracks {
+ struct ebml_list track_entry;
+};
+
+struct cue_track_positions {
+ struct ebml_type track;
+ struct ebml_type cluster_position;
+ struct ebml_type block_number;
+};
+
+struct cue_point {
+ struct ebml_type time;
+ struct ebml_list cue_track_positions;
+};
+
+struct cues {
+ struct ebml_list cue_point;
+};
+
+struct segment {
+ struct ebml_list seek_head;
+ struct info info;
+ struct ebml_list cluster;
+ struct tracks tracks;
+ struct cues cues;
+};
+
+/* Misc. */
+struct pool_ctx {
+ char dummy;
+};
+
+struct list_node {
+ struct list_node * previous;
+ struct ebml_element_desc * node;
+ unsigned char * data;
+};
+
+struct saved_state {
+ int64_t stream_offset;
+ struct list_node * ancestor;
+ uint64_t last_id;
+ uint64_t last_size;
+};
+
+struct frame {
+ unsigned char * data;
+ size_t length;
+ struct frame * next;
+};
+
+/* Public (opaque) Structures */
+struct nestegg {
+ nestegg_io * io;
+ nestegg_log log;
+ struct pool_ctx * alloc_pool;
+ uint64_t last_id;
+ uint64_t last_size;
+ struct list_node * ancestor;
+ struct ebml ebml;
+ struct segment segment;
+ int64_t segment_offset;
+ unsigned int track_count;
+};
+
+struct nestegg_packet {
+ uint64_t track;
+ uint64_t timecode;
+ struct frame * frame;
+};
+
+/* Element Descriptor */
+struct ebml_element_desc {
+ char const * name;
+ uint64_t id;
+ enum ebml_type_enum type;
+ size_t offset;
+ unsigned int flags;
+ struct ebml_element_desc * children;
+ size_t size;
+ size_t data_offset;
+};
+
+#define E_FIELD(ID, TYPE, STRUCT, FIELD) \
+ { #ID, ID, TYPE, offsetof(STRUCT, FIELD), DESC_FLAG_NONE, NULL, 0, 0 }
+#define E_MASTER(ID, TYPE, STRUCT, FIELD) \
+ { #ID, ID, TYPE, offsetof(STRUCT, FIELD), DESC_FLAG_MULTI, ne_ ## FIELD ## _elements, \
+ sizeof(struct FIELD), 0 }
+#define E_SINGLE_MASTER_O(ID, TYPE, STRUCT, FIELD) \
+ { #ID, ID, TYPE, offsetof(STRUCT, FIELD), DESC_FLAG_OFFSET, ne_ ## FIELD ## _elements, 0, \
+ offsetof(STRUCT, FIELD ## _offset) }
+#define E_SINGLE_MASTER(ID, TYPE, STRUCT, FIELD) \
+ { #ID, ID, TYPE, offsetof(STRUCT, FIELD), DESC_FLAG_NONE, ne_ ## FIELD ## _elements, 0, 0 }
+#define E_SUSPEND(ID, TYPE) \
+ { #ID, ID, TYPE, 0, DESC_FLAG_SUSPEND, NULL, 0, 0 }
+#define E_LAST \
+ { NULL, 0, 0, 0, DESC_FLAG_NONE, NULL, 0, 0 }
+
+/* EBML Element Lists */
+static struct ebml_element_desc ne_ebml_elements[] = {
+ E_FIELD(ID_EBML_VERSION, TYPE_UINT, struct ebml, ebml_version),
+ E_FIELD(ID_EBML_READ_VERSION, TYPE_UINT, struct ebml, ebml_read_version),
+ E_FIELD(ID_EBML_MAX_ID_LENGTH, TYPE_UINT, struct ebml, ebml_max_id_length),
+ E_FIELD(ID_EBML_MAX_SIZE_LENGTH, TYPE_UINT, struct ebml, ebml_max_size_length),
+ E_FIELD(ID_DOCTYPE, TYPE_STRING, struct ebml, doctype),
+ E_FIELD(ID_DOCTYPE_VERSION, TYPE_UINT, struct ebml, doctype_version),
+ E_FIELD(ID_DOCTYPE_READ_VERSION, TYPE_UINT, struct ebml, doctype_read_version),
+ E_LAST
+};
+
+/* WebMedia Element Lists */
+static struct ebml_element_desc ne_seek_elements[] = {
+ E_FIELD(ID_SEEK_ID, TYPE_BINARY, struct seek, id),
+ E_FIELD(ID_SEEK_POSITION, TYPE_UINT, struct seek, position),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_seek_head_elements[] = {
+ E_MASTER(ID_SEEK, TYPE_MASTER, struct seek_head, seek),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_info_elements[] = {
+ E_FIELD(ID_TIMECODE_SCALE, TYPE_UINT, struct info, timecode_scale),
+ E_FIELD(ID_DURATION, TYPE_FLOAT, struct info, duration),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_block_group_elements[] = {
+ E_SUSPEND(ID_BLOCK, TYPE_BINARY),
+ E_FIELD(ID_BLOCK_DURATION, TYPE_UINT, struct block_group, duration),
+ E_FIELD(ID_REFERENCE_BLOCK, TYPE_INT, struct block_group, reference_block),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_cluster_elements[] = {
+ E_FIELD(ID_TIMECODE, TYPE_UINT, struct cluster, timecode),
+ E_MASTER(ID_BLOCK_GROUP, TYPE_MASTER, struct cluster, block_group),
+ E_SUSPEND(ID_SIMPLE_BLOCK, TYPE_BINARY),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_video_elements[] = {
+ E_FIELD(ID_PIXEL_WIDTH, TYPE_UINT, struct video, pixel_width),
+ E_FIELD(ID_PIXEL_HEIGHT, TYPE_UINT, struct video, pixel_height),
+ E_FIELD(ID_PIXEL_CROP_BOTTOM, TYPE_UINT, struct video, pixel_crop_bottom),
+ E_FIELD(ID_PIXEL_CROP_TOP, TYPE_UINT, struct video, pixel_crop_top),
+ E_FIELD(ID_PIXEL_CROP_LEFT, TYPE_UINT, struct video, pixel_crop_left),
+ E_FIELD(ID_PIXEL_CROP_RIGHT, TYPE_UINT, struct video, pixel_crop_right),
+ E_FIELD(ID_DISPLAY_WIDTH, TYPE_UINT, struct video, display_width),
+ E_FIELD(ID_DISPLAY_HEIGHT, TYPE_UINT, struct video, display_height),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_audio_elements[] = {
+ E_FIELD(ID_SAMPLING_FREQUENCY, TYPE_FLOAT, struct audio, sampling_frequency),
+ E_FIELD(ID_CHANNELS, TYPE_UINT, struct audio, channels),
+ E_FIELD(ID_BIT_DEPTH, TYPE_UINT, struct audio, bit_depth),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_track_entry_elements[] = {
+ E_FIELD(ID_TRACK_NUMBER, TYPE_UINT, struct track_entry, number),
+ E_FIELD(ID_TRACK_UID, TYPE_UINT, struct track_entry, uid),
+ E_FIELD(ID_TRACK_TYPE, TYPE_UINT, struct track_entry, type),
+ E_FIELD(ID_FLAG_ENABLED, TYPE_UINT, struct track_entry, flag_enabled),
+ E_FIELD(ID_FLAG_DEFAULT, TYPE_UINT, struct track_entry, flag_default),
+ E_FIELD(ID_FLAG_LACING, TYPE_UINT, struct track_entry, flag_lacing),
+ E_FIELD(ID_TRACK_TIMECODE_SCALE, TYPE_FLOAT, struct track_entry, track_timecode_scale),
+ E_FIELD(ID_LANGUAGE, TYPE_STRING, struct track_entry, language),
+ E_FIELD(ID_CODEC_ID, TYPE_STRING, struct track_entry, codec_id),
+ E_FIELD(ID_CODEC_PRIVATE, TYPE_BINARY, struct track_entry, codec_private),
+ E_SINGLE_MASTER(ID_VIDEO, TYPE_MASTER, struct track_entry, video),
+ E_SINGLE_MASTER(ID_AUDIO, TYPE_MASTER, struct track_entry, audio),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_tracks_elements[] = {
+ E_MASTER(ID_TRACK_ENTRY, TYPE_MASTER, struct tracks, track_entry),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_cue_track_positions_elements[] = {
+ E_FIELD(ID_CUE_TRACK, TYPE_UINT, struct cue_track_positions, track),
+ E_FIELD(ID_CUE_CLUSTER_POSITION, TYPE_UINT, struct cue_track_positions, cluster_position),
+ E_FIELD(ID_CUE_BLOCK_NUMBER, TYPE_UINT, struct cue_track_positions, block_number),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_cue_point_elements[] = {
+ E_FIELD(ID_CUE_TIME, TYPE_UINT, struct cue_point, time),
+ E_MASTER(ID_CUE_TRACK_POSITIONS, TYPE_MASTER, struct cue_point, cue_track_positions),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_cues_elements[] = {
+ E_MASTER(ID_CUE_POINT, TYPE_MASTER, struct cues, cue_point),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_segment_elements[] = {
+ E_MASTER(ID_SEEK_HEAD, TYPE_MASTER, struct segment, seek_head),
+ E_SINGLE_MASTER(ID_INFO, TYPE_MASTER, struct segment, info),
+ E_MASTER(ID_CLUSTER, TYPE_MASTER, struct segment, cluster),
+ E_SINGLE_MASTER(ID_TRACKS, TYPE_MASTER, struct segment, tracks),
+ E_SINGLE_MASTER(ID_CUES, TYPE_MASTER, struct segment, cues),
+ E_LAST
+};
+
+static struct ebml_element_desc ne_top_level_elements[] = {
+ E_SINGLE_MASTER(ID_EBML, TYPE_MASTER, nestegg, ebml),
+ E_SINGLE_MASTER_O(ID_SEGMENT, TYPE_MASTER, nestegg, segment),
+ E_LAST
+};
+
+#undef E_FIELD
+#undef E_MASTER
+#undef E_SINGLE_MASTER_O
+#undef E_SINGLE_MASTER
+#undef E_SUSPEND
+#undef E_LAST
+
+static struct pool_ctx *
+ne_pool_init(void)
+{
+ struct pool_ctx * pool;
+
+ pool = h_malloc(sizeof(*pool));
+ if (!pool)
+ abort();
+ return pool;
+}
+
+static void
+ne_pool_destroy(struct pool_ctx * pool)
+{
+ h_free(pool);
+}
+
+static void *
+ne_pool_alloc(size_t size, struct pool_ctx * pool)
+{
+ void * p;
+
+ p = h_malloc(size);
+ if (!p)
+ abort();
+ hattach(p, pool);
+ memset(p, 0, size);
+ return p;
+}
+
+static void *
+ne_alloc(size_t size)
+{
+ void * p;
+
+ p = calloc(1, size);
+ if (!p)
+ abort();
+ return p;
+}
+
+static int
+ne_io_read(nestegg_io * io, void * buffer, size_t length)
+{
+ return io->read(buffer, length, io->userdata);
+}
+
+static int
+ne_io_seek(nestegg_io * io, int64_t offset, int whence)
+{
+ return io->seek(offset, whence, io->userdata);
+}
+
+static int
+ne_io_read_skip(nestegg_io * io, size_t length)
+{
+ size_t get;
+ unsigned char buf[8192];
+ int r = 1;
+
+ while (length > 0) {
+ get = length < sizeof(buf) ? length : sizeof(buf);
+ r = ne_io_read(io, buf, get);
+ if (r != 1)
+ break;
+ length -= get;
+ }
+
+ return r;
+}
+
+static int64_t
+ne_io_tell(nestegg_io * io)
+{
+ return io->tell(io->userdata);
+}
+
+static int
+ne_bare_read_vint(nestegg_io * io, uint64_t * value, uint64_t * length, enum vint_mask maskflag)
+{
+ int r;
+ unsigned char b;
+ size_t maxlen = 8;
+ unsigned int count = 1, mask = 1 << 7;
+
+ r = ne_io_read(io, &b, 1);
+ if (r != 1)
+ return r;
+
+ while (count < maxlen) {
+ if ((b & mask) != 0)
+ break;
+ mask >>= 1;
+ count += 1;
+ }
+
+ if (length)
+ *length = count;
+ *value = b;
+
+ if (maskflag == MASK_FIRST_BIT)
+ *value = b & ~mask;
+
+ while (--count) {
+ r = ne_io_read(io, &b, 1);
+ if (r != 1)
+ return r;
+ *value <<= 8;
+ *value |= b;
+ }
+
+ return 1;
+}
+
+static int
+ne_read_id(nestegg_io * io, uint64_t * value, uint64_t * length)
+{
+ return ne_bare_read_vint(io, value, length, MASK_NONE);
+}
+
+static int
+ne_read_vint(nestegg_io * io, uint64_t * value, uint64_t * length)
+{
+ return ne_bare_read_vint(io, value, length, MASK_FIRST_BIT);
+}
+
+static int
+ne_read_svint(nestegg_io * io, int64_t * value, uint64_t * length)
+{
+ int r;
+ uint64_t uvalue;
+ uint64_t ulength;
+ int64_t svint_subtr[] = {
+ 0x3f, 0x1fff,
+ 0xfffff, 0x7ffffff,
+ 0x3ffffffffLL, 0x1ffffffffffLL,
+ 0xffffffffffffLL, 0x7fffffffffffffLL
+ };
+
+ r = ne_bare_read_vint(io, &uvalue, &ulength, MASK_FIRST_BIT);
+ if (r != 1)
+ return r;
+ *value = uvalue - svint_subtr[ulength - 1];
+ if (length)
+ *length = ulength;
+ return r;
+}
+
+static int
+ne_read_uint(nestegg_io * io, uint64_t * val, uint64_t length)
+{
+ unsigned char b;
+ int r;
+
+ if (length == 0 || length > 8)
+ return -1;
+ r = ne_io_read(io, &b, 1);
+ if (r != 1)
+ return r;
+ *val = b;
+ while (--length) {
+ r = ne_io_read(io, &b, 1);
+ if (r != 1)
+ return r;
+ *val <<= 8;
+ *val |= b;
+ }
+ return 1;
+}
+
+static int
+ne_read_int(nestegg_io * io, int64_t * val, uint64_t length)
+{
+ int r;
+ uint64_t uval, base;
+
+ r = ne_read_uint(io, &uval, length);
+ if (r != 1)
+ return r;
+
+ if (length < sizeof(int64_t)) {
+ base = 1;
+ base <<= length * 8 - 1;
+ if (uval >= base) {
+ base = 1;
+ base <<= length * 8;
+ } else {
+ base = 0;
+ }
+ *val = uval - base;
+ } else {
+ *val = (int64_t) uval;
+ }
+
+ return 1;
+}
+
+static int
+ne_read_float(nestegg_io * io, double * val, uint64_t length)
+{
+ union {
+ uint64_t u;
+ float f;
+ double d;
+ } value;
+ int r;
+
+ /* length == 10 not implemented */
+ if (length != 4 && length != 8)
+ return -1;
+ r = ne_read_uint(io, &value.u, length);
+ if (r != 1)
+ return r;
+ if (length == 4)
+ *val = value.f;
+ else
+ *val = value.d;
+ return 1;
+}
+
+static int
+ne_read_string(nestegg * ctx, char ** val, uint64_t length)
+{
+ char * str;
+ int r;
+
+ if (length == 0 || length > LIMIT_STRING)
+ return -1;
+ str = ne_pool_alloc(length + 1, ctx->alloc_pool);
+ r = ne_io_read(ctx->io, (unsigned char *) str, length);
+ if (r != 1)
+ return r;
+ str[length] = '\0';
+ *val = str;
+ return 1;
+}
+
+static int
+ne_read_binary(nestegg * ctx, struct ebml_binary * val, uint64_t length)
+{
+ if (length == 0 || length > LIMIT_BINARY)
+ return -1;
+ val->data = ne_pool_alloc(length, ctx->alloc_pool);
+ val->length = length;
+ return ne_io_read(ctx->io, val->data, length);
+}
+
+/* ne_get_* accessors: copy a parsed element's value out of its ebml_type
+   storage slot. Each returns 0 on success or -1 if the element was never
+   read from the stream (type.read unset). The assert guards against a
+   descriptor whose declared type does not match the accessor used. */
+static int
+ne_get_uint(struct ebml_type type, uint64_t * value)
+{
+ if (!type.read)
+ return -1;
+
+ assert(type.type == TYPE_UINT);
+
+ *value = type.v.u;
+
+ return 0;
+}
+
+static int
+ne_get_float(struct ebml_type type, double * value)
+{
+ if (!type.read)
+ return -1;
+
+ assert(type.type == TYPE_FLOAT);
+
+ *value = type.v.f;
+
+ return 0;
+}
+
+static int
+ne_get_string(struct ebml_type type, char ** value)
+{
+ if (!type.read)
+ return -1;
+
+ assert(type.type == TYPE_STRING);
+
+ *value = type.v.s;
+
+ return 0;
+}
+
+static int
+ne_get_binary(struct ebml_type type, struct ebml_binary * value)
+{
+ if (!type.read)
+ return -1;
+
+ assert(type.type == TYPE_BINARY);
+
+ *value = type.v.b;
+
+ return 0;
+}
+
+/* Return 1 if id is declared in the descriptor table of any ancestor level,
+   walking the parse stack from innermost saved level outward. Used by
+   ne_parse to detect that the current child scope has ended. */
+static int
+ne_is_ancestor_element(uint64_t id, struct list_node * ancestor)
+{
+ struct ebml_element_desc * element;
+
+ for (; ancestor; ancestor = ancestor->previous)
+ for (element = ancestor->node; element->id; ++element)
+ if (element->id == id)
+ return 1;
+
+ return 0;
+}
+
+/* Linear scan of a descriptor table (terminated by an entry with id == 0)
+   for the given element id; NULL if the id is unknown at this level. */
+static struct ebml_element_desc *
+ne_find_element(uint64_t id, struct ebml_element_desc * elements)
+{
+ struct ebml_element_desc * element;
+
+ for (element = elements; element->id; ++element)
+ if (element->id == id)
+ return element;
+
+ return NULL;
+}
+
+/* Push a new parse level (descriptor table plus storage pointer) onto the
+   context's ancestor stack. Stack nodes are malloc'd individually and freed
+   by ne_ctx_pop. */
+static void
+ne_ctx_push(nestegg * ctx, struct ebml_element_desc * ancestor, void * data)
+{
+ struct list_node * item;
+
+ item = ne_alloc(sizeof(*item));
+ item->previous = ctx->ancestor;
+ item->node = ancestor;
+ item->data = data;
+ ctx->ancestor = item;
+}
+
+/* Pop and free the top level of the ancestor stack.
+   Precondition: ctx->ancestor is non-NULL. */
+static void
+ne_ctx_pop(nestegg * ctx)
+{
+ struct list_node * item;
+
+ item = ctx->ancestor;
+ ctx->ancestor = item->previous;
+ free(item);
+}
+
+/* Snapshot the stream offset and parser state (ancestor stack pointer and
+   the pending peeked id/size) so a later seek can be undone. Returns 0 on
+   success, -1 if the stream position cannot be queried. */
+static int
+ne_ctx_save(nestegg * ctx, struct saved_state * s)
+{
+ s->stream_offset = ne_io_tell(ctx->io);
+ if (s->stream_offset < 0)
+ return -1;
+ s->ancestor = ctx->ancestor;
+ s->last_id = ctx->last_id;
+ s->last_size = ctx->last_size;
+ return 0;
+}
+
+/* Seek back to a saved stream offset and restore the parser state captured
+   by ne_ctx_save. Returns 0 on success, -1 if the seek fails. */
+static int
+ne_ctx_restore(nestegg * ctx, struct saved_state * s)
+{
+ int r;
+
+ r = ne_io_seek(ctx->io, s->stream_offset, NESTEGG_SEEK_SET);
+ if (r != 0)
+ return -1;
+ ctx->ancestor = s->ancestor;
+ ctx->last_id = s->last_id;
+ ctx->last_size = s->last_size;
+ return 0;
+}
+
+/* Peek the next element header without consuming it. The id and size are
+   cached in ctx->last_id/last_size and handed back on repeated peeks until
+   ne_read_element clears them. Returns 1 on success, 0 on EOF, -1 on
+   error; id and size may each be NULL if the caller does not need them. */
+static int
+ne_peek_element(nestegg * ctx, uint64_t * id, uint64_t * size)
+{
+ int r;
+
+ if (ctx->last_id && ctx->last_size) {
+ if (id)
+ *id = ctx->last_id;
+ if (size)
+ *size = ctx->last_size;
+ return 1;
+ }
+
+ r = ne_read_id(ctx->io, &ctx->last_id, NULL);
+ if (r != 1)
+ return r;
+
+ r = ne_read_vint(ctx->io, &ctx->last_size, NULL);
+ if (r != 1)
+ return r;
+
+ if (id)
+ *id = ctx->last_id;
+ if (size)
+ *size = ctx->last_size;
+
+ return 1;
+}
+
+/* Consume the next element header: peek it, then clear the cached id/size
+   so the following peek reads a fresh header from the stream. */
+static int
+ne_read_element(nestegg * ctx, uint64_t * id, uint64_t * size)
+{
+ int r;
+
+ r = ne_peek_element(ctx, id, size);
+ if (r != 1)
+ return r;
+
+ ctx->last_id = 0;
+ ctx->last_size = 0;
+
+ return 1;
+}
+
+/* Begin a multi-instance master element: append a freshly pool-allocated
+   node to the ebml_list stored at desc->offset in the parent's data, then
+   push the child descriptor table so subsequent elements parse into the new
+   node's storage. */
+static void
+ne_read_master(nestegg * ctx, struct ebml_element_desc * desc)
+{
+ struct ebml_list * list;
+ struct ebml_list_node * node, * oldtail;
+
+ assert(desc->type == TYPE_MASTER && desc->flags & DESC_FLAG_MULTI);
+
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "multi master element %llx (%s)",
+ desc->id, desc->name);
+
+ list = (struct ebml_list *) (ctx->ancestor->data + desc->offset);
+
+ node = ne_pool_alloc(sizeof(*node), ctx->alloc_pool);
+ node->id = desc->id;
+ node->data = ne_pool_alloc(desc->size, ctx->alloc_pool);
+
+ /* append to the list's tail, initializing head on first insert */
+ oldtail = list->tail;
+ if (oldtail)
+ oldtail->next = node;
+ list->tail = node;
+ if (!list->head)
+ list->head = node;
+
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, " -> using data %p", node->data);
+
+ ne_ctx_push(ctx, desc->children, node->data);
+}
+
+/* Begin a single-instance master element: its storage lives inline in the
+   parent struct at desc->offset, so only a child level is pushed. */
+static void
+ne_read_single_master(nestegg * ctx, struct ebml_element_desc * desc)
+{
+ assert(desc->type == TYPE_MASTER && !(desc->flags & DESC_FLAG_MULTI));
+
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "single master element %llx (%s)",
+ desc->id, desc->name);
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, " -> using data %p (%u)",
+ ctx->ancestor->data + desc->offset, desc->offset);
+
+ ne_ctx_push(ctx, desc->children, ctx->ancestor->data + desc->offset);
+}
+
+/* Read one non-master element of known type into its storage slot in the
+   current level's data struct. A duplicate occurrence of an already-read
+   element is skipped (first one wins, returns 0). Returns 1 on success,
+   0/-1 on failure.
+   NOTE(review): length is size_t while element sizes are uint64_t at the
+   call site -- confirm oversized elements are rejected before this point on
+   32-bit targets. */
+static int
+ne_read_simple(nestegg * ctx, struct ebml_element_desc * desc, size_t length)
+{
+ struct ebml_type * storage;
+ int r;
+
+ storage = (struct ebml_type *) (ctx->ancestor->data + desc->offset);
+
+ if (storage->read) {
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "element %llx (%s) already read, skipping",
+ desc->id, desc->name);
+ return 0;
+ }
+
+ storage->type = desc->type;
+
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "element %llx (%s) -> %p (%u)",
+ desc->id, desc->name, storage, desc->offset);
+
+ r = -1;
+
+ /* dispatch on the descriptor's declared type; master elements never
+    reach this function */
+ switch (desc->type) {
+ case TYPE_UINT:
+ r = ne_read_uint(ctx->io, &storage->v.u, length);
+ break;
+ case TYPE_FLOAT:
+ r = ne_read_float(ctx->io, &storage->v.f, length);
+ break;
+ case TYPE_INT:
+ r = ne_read_int(ctx->io, &storage->v.i, length);
+ break;
+ case TYPE_STRING:
+ r = ne_read_string(ctx, &storage->v.s, length);
+ break;
+ case TYPE_BINARY:
+ r = ne_read_binary(ctx, &storage->v.b, length);
+ break;
+ case TYPE_MASTER:
+ case TYPE_UNKNOWN:
+ assert(0);
+ break;
+ }
+
+ /* mark the slot as populated only on a full successful read */
+ if (r == 1)
+ storage->read = 1;
+
+ return r;
+}
+
+/* Central parser loop. Repeatedly peeks the next element at the current
+   level and:
+   - stops before DESC_FLAG_SUSPEND elements (blocks) so the caller can
+     handle the payload itself (returns 1 with the header still peeked);
+   - pushes a child level for known master elements and continues;
+   - reads known simple elements into their storage;
+   - pops back to an ancestor level when the id belongs to one (refusing to
+     pop past top_level when one is given);
+   - skips unknown elements.
+   Returns 1 on suspend/stop, 0 on end of stream, -1 on error; on any
+   non-success result the ancestor stack is fully torn down. */
+static int
+ne_parse(nestegg * ctx, struct ebml_element_desc * top_level)
+{
+ int r;
+ int64_t * data_offset;
+ uint64_t id, size;
+ struct ebml_element_desc * element;
+
+ /* loop until we need to return:
+ - hit suspend point
+ - parse complete
+ - error occurred */
+
+ /* loop over elements at current level reading them if sublevel found,
+ push ctx onto stack and continue if sublevel ended, pop ctx off stack
+ and continue */
+
+ if (!ctx->ancestor)
+ return -1;
+
+ for (;;) {
+ r = ne_peek_element(ctx, &id, &size);
+ if (r != 1)
+ break;
+
+ element = ne_find_element(id, ctx->ancestor->node);
+ if (element) {
+ if (element->flags & DESC_FLAG_SUSPEND) {
+ assert(element->type == TYPE_BINARY);
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "suspend parse at %llx", id);
+ r = 1;
+ break;
+ }
+
+ r = ne_read_element(ctx, &id, &size);
+ if (r != 1)
+ break;
+
+ /* record where this element's payload starts for elements that
+    want their stream offset (e.g. for later seeking) */
+ if (element->flags & DESC_FLAG_OFFSET) {
+ data_offset = (int64_t *) (ctx->ancestor->data + element->data_offset);
+ *data_offset = ne_io_tell(ctx->io);
+ if (*data_offset < 0) {
+ r = -1;
+ break;
+ }
+ }
+
+ if (element->type == TYPE_MASTER) {
+ if (element->flags & DESC_FLAG_MULTI)
+ ne_read_master(ctx, element);
+ else
+ ne_read_single_master(ctx, element);
+ continue;
+ } else {
+ r = ne_read_simple(ctx, element, size);
+ if (r < 0)
+ break;
+ }
+ } else if (ne_is_ancestor_element(id, ctx->ancestor->previous)) {
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "parent element %llx", id);
+ if (top_level && ctx->ancestor->node == top_level) {
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "*** parse about to back up past top_level");
+ r = 1;
+ break;
+ }
+ ne_ctx_pop(ctx);
+ } else {
+ r = ne_read_element(ctx, &id, &size);
+ if (r != 1)
+ break;
+
+ /* Void and CRC32 elements may appear anywhere and are expected;
+    anything else unknown is logged before being skipped */
+ if (id != ID_VOID && id != ID_CRC32)
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "unknown element %llx", id);
+ r = ne_io_read_skip(ctx->io, size);
+ if (r != 1)
+ break;
+ }
+ }
+
+ if (r != 1)
+ while (ctx->ancestor)
+ ne_ctx_pop(ctx);
+
+ return r;
+}
+
+/* Decode one Xiph-style lace size from an in-memory buffer: sum bytes until
+   one is not 255. Advances *np past the consumed bytes.
+   NOTE(review): there is no bounds check against the end of the buffer --
+   relies on the caller's data being well formed; verify behavior on
+   malformed CodecPrivate input. */
+static uint64_t
+ne_xiph_lace_value(unsigned char ** np)
+{
+ uint64_t lace;
+ uint64_t value;
+ unsigned char * p = *np;
+
+ lace = *p++;
+ value = lace;
+ while (lace == 255) {
+ lace = *p++;
+ value += lace;
+ }
+
+ *np = p;
+
+ return value;
+}
+
+/* Stream variant of the Xiph lace decoder: reads size bytes one at a time
+   from io, adding each byte consumed to *consumed so the caller can track
+   how much of the block has been used. Returns 1 on success. */
+static int
+ne_read_xiph_lace_value(nestegg_io * io, uint64_t * value, size_t * consumed)
+{
+ int r;
+ uint64_t lace;
+
+ r = ne_read_uint(io, &lace, 1);
+ if (r != 1)
+ return r;
+ *consumed += 1;
+
+ *value = lace;
+ while (lace == 255) {
+ r = ne_read_uint(io, &lace, 1);
+ if (r != 1)
+ return r;
+ *consumed += 1;
+ *value += lace;
+ }
+
+ return 1;
+}
+
+/* Read n-1 explicit Xiph lace sizes into sizes[]; the final frame takes
+   whatever remains of the block after the lace data and explicit sizes.
+   Fails if the explicit sizes overflow the block. */
+static int
+ne_read_xiph_lacing(nestegg_io * io, size_t block, size_t * read, uint64_t n, uint64_t * sizes)
+{
+ int r;
+ size_t i = 0;
+ uint64_t sum = 0;
+
+ while (--n) {
+ r = ne_read_xiph_lace_value(io, &sizes[i], read);
+ if (r != 1)
+ return r;
+ sum += sizes[i];
+ i += 1;
+ }
+
+ if (*read + sum > block)
+ return -1;
+
+ /* last frame is the remainder of the block */
+ sizes[i] = block - *read - sum;
+ return 1;
+}
+
+/* Read EBML lacing: the first size is a plain vint, each subsequent size is
+   a signed vint delta from the previous size; the last frame takes the
+   remainder of the block. Fails if the sizes overflow the block. */
+static int
+ne_read_ebml_lacing(nestegg_io * io, size_t block, size_t * read, uint64_t n, uint64_t * sizes)
+{
+ int r;
+ uint64_t lace, sum, length;
+ int64_t slace;
+ size_t i = 0;
+
+ r = ne_read_vint(io, &lace, &length);
+ if (r != 1)
+ return r;
+ *read += length;
+
+ sizes[i] = lace;
+ sum = sizes[i];
+
+ i += 1;
+ n -= 1;
+
+ while (--n) {
+ r = ne_read_svint(io, &slace, &length);
+ if (r != 1)
+ return r;
+ *read += length;
+ sizes[i] = sizes[i - 1] + slace;
+ sum += sizes[i];
+ i += 1;
+ }
+
+ if (*read + sum > block)
+ return -1;
+
+ /* last frame is the remainder of the block */
+ sizes[i] = block - *read - sum;
+ return 1;
+}
+
+/* Return the segment's timecode scale, defaulting to 1,000,000 (ns per
+   tick) when the TimecodeScale element is absent. */
+static uint64_t
+ne_get_timecode_scale(nestegg * ctx)
+{
+ uint64_t scale;
+
+ if (ne_get_uint(ctx->segment.info.timecode_scale, &scale) != 0)
+ scale = 1000000;
+
+ return scale;
+}
+
+/* Return the track-th TrackEntry (0-based, in file order), or NULL if the
+   index is out of range. */
+static struct track_entry *
+ne_find_track_entry(nestegg * ctx, unsigned int track)
+{
+ struct ebml_list_node * node;
+ unsigned int tracks = 0;
+
+ node = ctx->segment.tracks.track_entry.head;
+ while (node) {
+ assert(node->id == ID_TRACK_ENTRY);
+ if (track == tracks)
+ return node->data;
+ tracks += 1;
+ node = node->next;
+ }
+
+ return NULL;
+}
+
+/* Parse a Block or SimpleBlock payload into a heap-allocated nestegg_packet:
+   track number (vint), 16-bit signed relative timecode, flags byte, then
+   optional lacing (none/Xiph/fixed/EBML) followed by the frame data. The
+   packet timestamp is (cluster timecode + block timecode) scaled by the
+   segment timecode scale. Returns 1 and sets *data on success, 0/-1 on
+   failure (in which case nothing is leaked). */
+static int
+ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_packet ** data)
+{
+ int r;
+ int64_t timecode, abs_timecode;
+ nestegg_packet * pkt;
+ struct cluster * cluster;
+ struct frame * f, * last;
+ struct track_entry * entry;
+ double track_scale;
+ uint64_t track, length, frame_sizes[256], cluster_tc, flags, frames, tc_scale, total;
+ unsigned int i, lacing;
+ size_t consumed = 0;
+
+ *data = NULL;
+
+ if (block_size > LIMIT_BLOCK)
+ return -1;
+
+ r = ne_read_vint(ctx->io, &track, &length);
+ if (r != 1)
+ return r;
+
+ /* track numbers are 1-based in the container */
+ if (track == 0 || track > ctx->track_count)
+ return -1;
+
+ consumed += length;
+
+ r = ne_read_int(ctx->io, &timecode, 2);
+ if (r != 1)
+ return r;
+
+ consumed += 2;
+
+ r = ne_read_uint(ctx->io, &flags, 1);
+ if (r != 1)
+ return r;
+
+ consumed += 1;
+
+ frames = 0;
+
+ /* flags are different between block and simpleblock, but lacing is
+ encoded the same way */
+ lacing = (flags & BLOCK_FLAGS_LACING) >> 1;
+
+ switch (lacing) {
+ case LACING_NONE:
+ frames = 1;
+ break;
+ case LACING_XIPH:
+ case LACING_FIXED:
+ case LACING_EBML:
+ /* laced blocks store (frame count - 1) in one byte */
+ r = ne_read_uint(ctx->io, &frames, 1);
+ if (r != 1)
+ return r;
+ consumed += 1;
+ frames += 1;
+ }
+
+ /* frame_sizes[] holds at most 256 entries */
+ if (frames > 256)
+ return -1;
+
+ switch (lacing) {
+ case LACING_NONE:
+ frame_sizes[0] = block_size - consumed;
+ break;
+ case LACING_XIPH:
+ if (frames == 1)
+ return -1;
+ r = ne_read_xiph_lacing(ctx->io, block_size, &consumed, frames, frame_sizes);
+ if (r != 1)
+ return r;
+ break;
+ case LACING_FIXED:
+ /* fixed lacing requires the payload to divide evenly */
+ if ((block_size - consumed) % frames)
+ return -1;
+ for (i = 0; i < frames; ++i)
+ frame_sizes[i] = (block_size - consumed) / frames;
+ break;
+ case LACING_EBML:
+ if (frames == 1)
+ return -1;
+ r = ne_read_ebml_lacing(ctx->io, block_size, &consumed, frames, frame_sizes);
+ if (r != 1)
+ return r;
+ break;
+ }
+
+ /* sanity check unlaced frame sizes against total block size. */
+ total = consumed;
+ for (i = 0; i < frames; ++i)
+ total += frame_sizes[i];
+ if (total > block_size)
+ return -1;
+
+ entry = ne_find_track_entry(ctx, track - 1);
+ if (!entry)
+ return -1;
+
+ track_scale = 1.0;
+
+ tc_scale = ne_get_timecode_scale(ctx);
+
+ /* the block's timecode is relative to the enclosing (most recent)
+    cluster's timecode */
+ assert(ctx->segment.cluster.tail->id == ID_CLUSTER);
+ cluster = ctx->segment.cluster.tail->data;
+ if (ne_get_uint(cluster->timecode, &cluster_tc) != 0)
+ return -1;
+
+ abs_timecode = timecode + cluster_tc;
+ if (abs_timecode < 0)
+ return -1;
+
+ pkt = ne_alloc(sizeof(*pkt));
+ pkt->track = track - 1;
+ pkt->timecode = abs_timecode * tc_scale * track_scale;
+
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "%sblock t %lld pts %f f %llx frames: %llu",
+ block_id == ID_BLOCK ? "" : "simple", pkt->track, pkt->timecode / 1e9, flags, frames);
+
+ /* read each laced frame into a malloc'd node appended to pkt->frame */
+ last = NULL;
+ for (i = 0; i < frames; ++i) {
+ if (frame_sizes[i] > LIMIT_FRAME) {
+ nestegg_free_packet(pkt);
+ return -1;
+ }
+ f = ne_alloc(sizeof(*f));
+ f->data = ne_alloc(frame_sizes[i]);
+ f->length = frame_sizes[i];
+ r = ne_io_read(ctx->io, f->data, frame_sizes[i]);
+ if (r != 1) {
+ free(f->data);
+ free(f);
+ nestegg_free_packet(pkt);
+ return -1;
+ }
+
+ if (!last)
+ pkt->frame = f;
+ else
+ last->next = f;
+ last = f;
+ }
+
+ *data = pkt;
+
+ return 1;
+}
+
+/* Big-endian read of an element id from a raw byte buffer (used to decode
+   the binary SeekID payload). */
+static uint64_t
+ne_buf_read_id(unsigned char const * p, size_t length)
+{
+ uint64_t id = 0;
+
+ while (length--) {
+ id <<= 8;
+ id |= *p++;
+ }
+
+ return id;
+}
+
+/* Search every SeekHead's Seek entries for one whose binary SeekID matches
+   id; returns the Seek entry or NULL. */
+static struct seek *
+ne_find_seek_for_id(struct ebml_list_node * seek_head, uint64_t id)
+{
+ struct ebml_list * head;
+ struct ebml_list_node * seek;
+ struct ebml_binary binary_id;
+ struct seek * s;
+
+ while (seek_head) {
+ assert(seek_head->id == ID_SEEK_HEAD);
+ head = seek_head->data;
+ seek = head->head;
+
+ while (seek) {
+ assert(seek->id == ID_SEEK);
+ s = seek->data;
+
+ if (ne_get_binary(s->id, &binary_id) == 0 &&
+ ne_buf_read_id(binary_id.data, binary_id.length) == id)
+ return s;
+
+ seek = seek->next;
+ }
+
+ seek_head = seek_head->next;
+ }
+
+ return NULL;
+}
+
+/* Return the last cue point whose (scaled) time is at or before tstamp,
+   falling back to the first cue point; NULL only if the list is empty. */
+static struct cue_point *
+ne_find_cue_point_for_tstamp(struct ebml_list_node * cue_point, uint64_t scale, uint64_t tstamp)
+{
+ uint64_t time;
+ struct cue_point * c, * prev = NULL;
+
+ while (cue_point) {
+ assert(cue_point->id == ID_CUE_POINT);
+ c = cue_point->data;
+
+ if (!prev)
+ prev = c;
+
+ if (ne_get_uint(c->time, &time) == 0 && time * scale > tstamp)
+ break;
+
+ prev = cue_point->data;
+ cue_point = cue_point->next;
+ }
+
+ return prev;
+}
+
+/* Blocks and SimpleBlocks are the parser's suspend points. */
+static int
+ne_is_suspend_element(uint64_t id)
+{
+ /* this could search the tree of elements for DESC_FLAG_SUSPEND */
+ if (id == ID_SIMPLE_BLOCK || id == ID_BLOCK)
+ return 1;
+ return 0;
+}
+
+/* No-op logger installed when the caller passes a NULL callback; the body
+   exists only to keep the parameters referenced (silences warnings). */
+static void
+ne_null_log_callback(nestegg * ctx, unsigned int severity, char const * fmt, ...)
+{
+ if (ctx && severity && fmt)
+ return;
+}
+
+/* Public: allocate and initialize a demuxer over the caller-supplied io
+   (read, seek and tell are all required). Validates the EBML header --
+   read version 1, doctype "webm", doctype read version 1 or 2 -- parses
+   metadata up to the first suspend point, and counts the tracks. Returns 0
+   and sets *context on success; on any failure the partially constructed
+   context is destroyed and -1 is returned. */
+int
+nestegg_init(nestegg ** context, nestegg_io io, nestegg_log callback)
+{
+ int r;
+ uint64_t id, version, docversion;
+ struct ebml_list_node * track;
+ char * doctype;
+ nestegg * ctx = NULL;
+
+ if (!(io.read && io.seek && io.tell))
+ return -1;
+
+ ctx = ne_alloc(sizeof(*ctx));
+
+ ctx->io = ne_alloc(sizeof(*ctx->io));
+ *ctx->io = io;
+ ctx->log = callback;
+ ctx->alloc_pool = ne_pool_init();
+
+ if (!ctx->log)
+ ctx->log = ne_null_log_callback;
+
+ /* the stream must begin with an EBML header element */
+ r = ne_peek_element(ctx, &id, NULL);
+ if (r != 1) {
+ nestegg_destroy(ctx);
+ return -1;
+ }
+
+ if (id != ID_EBML) {
+ nestegg_destroy(ctx);
+ return -1;
+ }
+
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "ctx %p", ctx);
+
+ ne_ctx_push(ctx, ne_top_level_elements, ctx);
+
+ r = ne_parse(ctx, NULL);
+
+ if (r != 1) {
+ nestegg_destroy(ctx);
+ return -1;
+ }
+
+ /* absent version elements default to 1 per the EBML spec */
+ if (ne_get_uint(ctx->ebml.ebml_read_version, &version) != 0)
+ version = 1;
+ if (version != 1) {
+ nestegg_destroy(ctx);
+ return -1;
+ }
+
+ if (ne_get_string(ctx->ebml.doctype, &doctype) != 0)
+ doctype = "matroska";
+ if (strcmp(doctype, "webm") != 0) {
+ nestegg_destroy(ctx);
+ return -1;
+ }
+
+ if (ne_get_uint(ctx->ebml.doctype_read_version, &docversion) != 0)
+ docversion = 1;
+ if (docversion < 1 || docversion > 2) {
+ nestegg_destroy(ctx);
+ return -1;
+ }
+
+ /* a usable stream must declare at least one track */
+ if (!ctx->segment.tracks.track_entry.head) {
+ nestegg_destroy(ctx);
+ return -1;
+ }
+
+ track = ctx->segment.tracks.track_entry.head;
+ ctx->track_count = 0;
+
+ while (track) {
+ ctx->track_count += 1;
+ track = track->next;
+ }
+
+ *context = ctx;
+
+ return 0;
+}
+
+/* Public: tear down a context -- parser stack, element storage pool, io
+   wrapper, then the context itself. */
+void
+nestegg_destroy(nestegg * ctx)
+{
+ while (ctx->ancestor)
+ ne_ctx_pop(ctx);
+ ne_pool_destroy(ctx->alloc_pool);
+ free(ctx->io);
+ free(ctx);
+}
+
+/* Public: media duration in nanoseconds (segment duration element times the
+   timecode scale). Fails with -1 if the segment has no duration element. */
+int
+nestegg_duration(nestegg * ctx, uint64_t * duration)
+{
+ uint64_t tc_scale;
+ double unscaled_duration;
+
+ if (ne_get_float(ctx->segment.info.duration, &unscaled_duration) != 0)
+ return -1;
+
+ tc_scale = ne_get_timecode_scale(ctx);
+
+ *duration = (uint64_t) (unscaled_duration * tc_scale);
+ return 0;
+}
+
+/* Public: nanoseconds per timecode tick for this segment. */
+int
+nestegg_tstamp_scale(nestegg * ctx, uint64_t * scale)
+{
+ *scale = ne_get_timecode_scale(ctx);
+ return 0;
+}
+
+/* Public: number of tracks, as counted during nestegg_init. */
+int
+nestegg_track_count(nestegg * ctx, unsigned int * tracks)
+{
+ *tracks = ctx->track_count;
+ return 0;
+}
+
+/* Public: seek the given track to the cluster covering tstamp (ns). If the
+   Cues have not been parsed yet they are located via the SeekHead, parsed
+   in a temporary parser context, and the original stream state restored.
+   The cue point at or before tstamp supplies the cluster position; the
+   stream is seeked there and parsing resumes until a Block/SimpleBlock is
+   suspended, leaving the context ready for nestegg_read_packet. Returns 0
+   on success, -1 on any failure. */
+int
+nestegg_track_seek(nestegg * ctx, unsigned int track, uint64_t tstamp)
+{
+ int r;
+ struct cue_point * cue_point;
+ struct cue_track_positions * pos;
+ struct saved_state state;
+ struct seek * found;
+ uint64_t seek_pos, tc_scale, t, id;
+ struct ebml_list_node * node = ctx->segment.cues.cue_point.head;
+
+ /* If there are no cues loaded, check for cues element in the seek head
+ and load it. */
+ if (!node) {
+ found = ne_find_seek_for_id(ctx->segment.seek_head.head, ID_CUES);
+ if (!found)
+ return -1;
+
+ if (ne_get_uint(found->position, &seek_pos) != 0)
+ return -1;
+
+ /* Save old parser state. */
+ r = ne_ctx_save(ctx, &state);
+ if (r != 0)
+ return -1;
+
+ /* Seek and set up parser state for segment-level element (Cues). */
+ r = ne_io_seek(ctx->io, ctx->segment_offset + seek_pos, NESTEGG_SEEK_SET);
+ if (r != 0)
+ return -1;
+ ctx->last_id = 0;
+ ctx->last_size = 0;
+
+ r = ne_read_element(ctx, &id, NULL);
+ if (r != 1)
+ return -1;
+
+ if (id != ID_CUES)
+ return -1;
+
+ ctx->ancestor = NULL;
+ ne_ctx_push(ctx, ne_top_level_elements, ctx);
+ ne_ctx_push(ctx, ne_segment_elements, &ctx->segment);
+ ne_ctx_push(ctx, ne_cues_elements, &ctx->segment.cues);
+ /* parser will run until end of cues element. */
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "seek: parsing cue elements");
+ r = ne_parse(ctx, ne_cues_elements);
+ while (ctx->ancestor)
+ ne_ctx_pop(ctx);
+
+ /* Reset parser state to original state and seek back to old position. */
+ if (ne_ctx_restore(ctx, &state) != 0)
+ return -1;
+
+ if (r < 0)
+ return -1;
+ }
+
+ tc_scale = ne_get_timecode_scale(ctx);
+
+ cue_point = ne_find_cue_point_for_tstamp(ctx->segment.cues.cue_point.head, tc_scale, tstamp);
+ if (!cue_point)
+ return -1;
+
+ node = cue_point->cue_track_positions.head;
+
+ seek_pos = 0;
+
+ /* locate the cluster position recorded for the requested track
+    (pos->track is 1-based) */
+ while (node) {
+ assert(node->id == ID_CUE_TRACK_POSITIONS);
+ pos = node->data;
+ if (ne_get_uint(pos->track, &t) == 0 && t - 1 == track) {
+ if (ne_get_uint(pos->cluster_position, &seek_pos) != 0)
+ return -1;
+ break;
+ }
+ node = node->next;
+ }
+
+ /* Seek and set up parser state for segment-level element (Cluster). */
+ r = ne_io_seek(ctx->io, ctx->segment_offset + seek_pos, NESTEGG_SEEK_SET);
+ if (r != 0)
+ return -1;
+ ctx->last_id = 0;
+ ctx->last_size = 0;
+
+ while (ctx->ancestor)
+ ne_ctx_pop(ctx);
+
+ ne_ctx_push(ctx, ne_top_level_elements, ctx);
+ ne_ctx_push(ctx, ne_segment_elements, &ctx->segment);
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "seek: parsing cluster elements");
+ r = ne_parse(ctx, NULL);
+ if (r != 1)
+ return -1;
+
+ /* success means parsing stopped at a block ready to be read */
+ if (!ne_is_suspend_element(ctx->last_id))
+ return -1;
+
+ return 0;
+}
+
+/* Public: map the TrackType flags onto NESTEGG_TRACK_VIDEO/AUDIO; -1 for an
+   unknown track index or type. */
+int
+nestegg_track_type(nestegg * ctx, unsigned int track)
+{
+ struct track_entry * entry;
+ uint64_t type;
+
+ entry = ne_find_track_entry(ctx, track);
+ if (!entry)
+ return -1;
+
+ if (ne_get_uint(entry->type, &type) != 0)
+ return -1;
+
+ if (type & TRACK_TYPE_VIDEO)
+ return NESTEGG_TRACK_VIDEO;
+
+ if (type & TRACK_TYPE_AUDIO)
+ return NESTEGG_TRACK_AUDIO;
+
+ return -1;
+}
+
+/* Public: map the CodecID string onto NESTEGG_CODEC_VP8/VORBIS; -1 for an
+   unknown track or unsupported codec. */
+int
+nestegg_track_codec_id(nestegg * ctx, unsigned int track)
+{
+ char * codec_id;
+ struct track_entry * entry;
+
+ entry = ne_find_track_entry(ctx, track);
+ if (!entry)
+ return -1;
+
+ if (ne_get_string(entry->codec_id, &codec_id) != 0)
+ return -1;
+
+ if (strcmp(codec_id, TRACK_ID_VP8) == 0)
+ return NESTEGG_CODEC_VP8;
+
+ if (strcmp(codec_id, TRACK_ID_VORBIS) == 0)
+ return NESTEGG_CODEC_VORBIS;
+
+ return -1;
+}
+
+/* Public: number of codec-private data packets (Vorbis only). The first
+   byte of CodecPrivate holds packet count - 1; Vorbis streams carry at most
+   three header packets, so larger counts are rejected. */
+int
+nestegg_track_codec_data_count(nestegg * ctx, unsigned int track,
+ unsigned int * count)
+{
+ struct track_entry * entry;
+ struct ebml_binary codec_private;
+ unsigned char * p;
+
+ *count = 0;
+
+ entry = ne_find_track_entry(ctx, track);
+ if (!entry)
+ return -1;
+
+ if (nestegg_track_codec_id(ctx, track) != NESTEGG_CODEC_VORBIS)
+ return -1;
+
+ if (ne_get_binary(entry->codec_private, &codec_private) != 0)
+ return -1;
+
+ if (codec_private.length < 1)
+ return -1;
+
+ p = codec_private.data;
+ *count = *p + 1;
+
+ if (*count > 3)
+ return -1;
+
+ return 0;
+}
+
+/* Public: return a pointer/length for the item-th Vorbis header packet
+   inside CodecPrivate. Layout: one count byte (count - 1), Xiph lace sizes
+   for all but the last packet, then the packets themselves; the last packet
+   takes the remainder. The returned pointer aliases internal storage and
+   must not be freed by the caller.
+   NOTE(review): item is not validated against count here -- confirm all
+   callers obtain item bounds from nestegg_track_codec_data_count first. */
+int
+nestegg_track_codec_data(nestegg * ctx, unsigned int track, unsigned int item,
+ unsigned char ** data, size_t * length)
+{
+ struct track_entry * entry;
+ struct ebml_binary codec_private;
+ uint64_t sizes[3], total;
+ unsigned char * p;
+ unsigned int count, i;
+
+ *data = NULL;
+ *length = 0;
+
+ entry = ne_find_track_entry(ctx, track);
+ if (!entry)
+ return -1;
+
+ if (nestegg_track_codec_id(ctx, track) != NESTEGG_CODEC_VORBIS)
+ return -1;
+
+ if (ne_get_binary(entry->codec_private, &codec_private) != 0)
+ return -1;
+
+ p = codec_private.data;
+ count = *p++ + 1;
+
+ if (count > 3)
+ return -1;
+
+ /* decode the explicit lace sizes; the final packet gets what is left */
+ i = 0;
+ total = 0;
+ while (--count) {
+ sizes[i] = ne_xiph_lace_value(&p);
+ total += sizes[i];
+ i += 1;
+ }
+ sizes[i] = codec_private.length - total - (p - codec_private.data);
+
+ /* advance past the packets preceding the requested one */
+ for (i = 0; i < item; ++i) {
+ if (sizes[i] > LIMIT_FRAME)
+ return -1;
+ p += sizes[i];
+ }
+ *data = p;
+ *length = sizes[item];
+
+ return 0;
+}
+
+/* Public: fill video parameters for a video track. Pixel width and height
+   are mandatory; crop values default to 0 and display sizes default to the
+   pixel sizes when the corresponding elements are absent. */
+int
+nestegg_track_video_params(nestegg * ctx, unsigned int track,
+ nestegg_video_params * params)
+{
+ struct track_entry * entry;
+ uint64_t value;
+
+ memset(params, 0, sizeof(*params));
+
+ entry = ne_find_track_entry(ctx, track);
+ if (!entry)
+ return -1;
+
+ if (nestegg_track_type(ctx, track) != NESTEGG_TRACK_VIDEO)
+ return -1;
+
+ if (ne_get_uint(entry->video.pixel_width, &value) != 0)
+ return -1;
+ params->width = value;
+
+ if (ne_get_uint(entry->video.pixel_height, &value) != 0)
+ return -1;
+ params->height = value;
+
+ /* optional elements: ne_get_uint leaves value untouched on failure, so
+    seed each with its default first */
+ value = 0;
+ ne_get_uint(entry->video.pixel_crop_bottom, &value);
+ params->crop_bottom = value;
+
+ value = 0;
+ ne_get_uint(entry->video.pixel_crop_top, &value);
+ params->crop_top = value;
+
+ value = 0;
+ ne_get_uint(entry->video.pixel_crop_left, &value);
+ params->crop_left = value;
+
+ value = 0;
+ ne_get_uint(entry->video.pixel_crop_right, &value);
+ params->crop_right = value;
+
+ value = params->width;
+ ne_get_uint(entry->video.display_width, &value);
+ params->display_width = value;
+
+ value = params->height;
+ ne_get_uint(entry->video.display_height, &value);
+ params->display_height = value;
+
+ return 0;
+}
+
+/* Public: fill audio parameters for an audio track, applying the Matroska
+   defaults when elements are absent: 8000 Hz, 1 channel, 16-bit depth. */
+int
+nestegg_track_audio_params(nestegg * ctx, unsigned int track,
+ nestegg_audio_params * params)
+{
+ struct track_entry * entry;
+ uint64_t value;
+
+ memset(params, 0, sizeof(*params));
+
+ entry = ne_find_track_entry(ctx, track);
+ if (!entry)
+ return -1;
+
+ if (nestegg_track_type(ctx, track) != NESTEGG_TRACK_AUDIO)
+ return -1;
+
+ params->rate = 8000;
+ ne_get_float(entry->audio.sampling_frequency, &params->rate);
+
+ value = 1;
+ ne_get_uint(entry->audio.channels, &value);
+ params->channels = value;
+
+ value = 16;
+ ne_get_uint(entry->audio.bit_depth, &value);
+ params->depth = value;
+
+ return 0;
+}
+
+/* Public: parse forward until the next Block/SimpleBlock and return it as a
+   packet. Returns 1 with *pkt set on success, 0 at end of stream, -1 on
+   error. The caller owns *pkt and frees it with nestegg_free_packet. */
+int
+nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
+{
+ int r;
+ uint64_t id, size;
+
+ *pkt = NULL;
+
+ for (;;) {
+ r = ne_peek_element(ctx, &id, &size);
+ if (r != 1)
+ return r;
+
+ /* any suspend fields must be handled here */
+ if (ne_is_suspend_element(id)) {
+ r = ne_read_element(ctx, &id, &size);
+ if (r != 1)
+ return r;
+
+ /* the only suspend fields are blocks and simple blocks, which we
+ handle directly. */
+ r = ne_read_block(ctx, id, size, pkt);
+ return r;
+ }
+
+ /* not a block: let the parser consume structure until the next
+    suspend point or end of stream */
+ r = ne_parse(ctx, NULL);
+ if (r != 1)
+ return r;
+ }
+
+ return 1;
+}
+
+/* Public: free a packet and its chain of frame buffers. */
+void
+nestegg_free_packet(nestegg_packet * pkt)
+{
+ struct frame * frame;
+
+ while (pkt->frame) {
+ frame = pkt->frame;
+ pkt->frame = frame->next;
+ free(frame->data);
+ free(frame);
+ }
+
+ free(pkt);
+}
+
+/* Public: 0-based index of the track this packet belongs to. */
+int
+nestegg_packet_track(nestegg_packet * pkt, unsigned int * track)
+{
+ *track = pkt->track;
+ return 0;
+}
+
+/* Public: packet timestamp in nanoseconds. */
+int
+nestegg_packet_tstamp(nestegg_packet * pkt, uint64_t * tstamp)
+{
+ *tstamp = pkt->timecode;
+ return 0;
+}
+
+/* Public: number of (laced) frames in the packet. */
+int
+nestegg_packet_count(nestegg_packet * pkt, unsigned int * count)
+{
+ struct frame * f = pkt->frame;
+
+ *count = 0;
+
+ while (f) {
+ *count += 1;
+ f = f->next;
+ }
+
+ return 0;
+}
+
+/* Public: data pointer and length of the item-th frame; -1 if item is out
+   of range. The pointer remains owned by the packet. */
+int
+nestegg_packet_data(nestegg_packet * pkt, unsigned int item,
+ unsigned char ** data, size_t * length)
+{
+ struct frame * f = pkt->frame;
+ unsigned int count = 0;
+
+ *data = NULL;
+ *length = 0;
+
+ while (f) {
+ if (count == item) {
+ *data = f->data;
+ *length = f->length;
+ return 0;
+ }
+ count += 1;
+ f = f->next;
+ }
+
+ return -1;
+}
diff --git a/nestegg/test/test.c b/nestegg/test/test.c
new file mode 100644
index 000000000..210b640c7
--- /dev/null
+++ b/nestegg/test/test.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright © 2010 Mozilla Foundation
+ *
+ * This program is made available under an ISC-style license. See the
+ * accompanying file LICENSE for details.
+ */
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "nestegg/nestegg.h"
+
+#undef DEBUG
+#define SEEK_TEST
+
+/* nestegg_io read callback over stdio: returns 1 on success, 0 on clean
+   EOF, -1 on error.
+   NOTE(review): fread is called with size=length, nmemb=1, so a partial
+   read counts as 0 items; EOF is only reported when feof() is set --
+   confirm short reads mid-stream are intended to be errors. */
+static int
+stdio_read(void * p, size_t length, void * fp)
+{
+ size_t r;
+
+ r = fread(p, length, 1, fp);
+ if (r == 0 && feof(fp))
+ return 0;
+ return r == 0 ? -1 : 1;
+}
+
+/* nestegg_io seek callback: thin wrapper over fseek (0 on success). */
+static int
+stdio_seek(int64_t offset, int whence, void * fp)
+{
+ return fseek(fp, offset, whence);
+}
+
+/* nestegg_io tell callback: thin wrapper over ftell. */
+static int64_t
+stdio_tell(void * fp)
+{
+ return ftell(fp);
+}
+
+/* Log callback passed to nestegg_init: prints warnings and above to stderr
+   (all severities when DEBUG is defined). */
+static void
+log_callback(nestegg * ctx, unsigned int severity, char const * fmt, ...)
+{
+ va_list ap;
+ char const * sev = NULL;
+
+#ifndef DEBUG
+ if (severity < NESTEGG_LOG_WARNING)
+ return;
+#endif
+
+ switch (severity) {
+ case NESTEGG_LOG_DEBUG:
+ sev = "debug: ";
+ break;
+ case NESTEGG_LOG_WARNING:
+ sev = "warning: ";
+ break;
+ case NESTEGG_LOG_CRITICAL:
+ sev = "critical:";
+ break;
+ default:
+ sev = "unknown: ";
+ }
+
+ fprintf(stderr, "%p %s ", (void *) ctx, sev);
+
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+
+ fprintf(stderr, "\n");
+}
+
+/* Demuxer smoke test: opens the WebM file named by argv[1], dumps track and
+   codec information, optionally exercises seeking to the middle, near-end
+   and near-start of the stream (SEEK_TEST), then reads every packet to
+   completion. Most output is compiled in only when DEBUG is defined. */
+int
+main(int argc, char * argv[])
+{
+ FILE * fp;
+ int r, type;
+ nestegg * ctx;
+ nestegg_audio_params aparams;
+ nestegg_packet * pkt;
+ nestegg_video_params vparams;
+ size_t length, size;
+ uint64_t duration, tstamp, pkt_tstamp;
+ unsigned char * codec_data, * ptr;
+ unsigned int cnt, i, j, track, tracks, pkt_cnt, pkt_track;
+ unsigned int data_items = 0;
+ nestegg_io io = {
+ stdio_read,
+ stdio_seek,
+ stdio_tell,
+ NULL
+ };
+
+ if (argc != 2)
+ return EXIT_FAILURE;
+
+ fp = fopen(argv[1], "rb");
+ if (!fp)
+ return EXIT_FAILURE;
+
+ io.userdata = fp;
+
+ ctx = NULL;
+ r = nestegg_init(&ctx, io, log_callback);
+ if (r != 0)
+ return EXIT_FAILURE;
+
+ nestegg_track_count(ctx, &tracks);
+ nestegg_duration(ctx, &duration);
+#ifdef DEBUG
+ fprintf(stderr, "media has %u tracks and duration %fs\n", tracks, duration / 1e9);
+#endif
+
+ /* enumerate each track's type, codec, codec-private data and params */
+ for (i = 0; i < tracks; ++i) {
+ type = nestegg_track_type(ctx, i);
+#ifdef DEBUG
+ fprintf(stderr, "track %u: type: %d codec: %d", i,
+ type, nestegg_track_codec_id(ctx, i));
+#endif
+ nestegg_track_codec_data_count(ctx, i, &data_items);
+ for (j = 0; j < data_items; ++j) {
+ nestegg_track_codec_data(ctx, i, j, &codec_data, &length);
+#ifdef DEBUG
+ fprintf(stderr, " (%p, %u)", codec_data, (unsigned int) length);
+#endif
+ }
+ if (type == NESTEGG_TRACK_VIDEO) {
+ nestegg_track_video_params(ctx, i, &vparams);
+#ifdef DEBUG
+ fprintf(stderr, " video: %ux%u (d: %ux%u %ux%ux%ux%u)",
+ vparams.width, vparams.height,
+ vparams.display_width, vparams.display_height,
+ vparams.crop_top, vparams.crop_left, vparams.crop_bottom, vparams.crop_right);
+#endif
+ } else if (type == NESTEGG_TRACK_AUDIO) {
+ nestegg_track_audio_params(ctx, i, &aparams);
+#ifdef DEBUG
+ fprintf(stderr, " audio: %.2fhz %u bit %u channels",
+ aparams.rate, aparams.depth, aparams.channels);
+#endif
+ }
+#ifdef DEBUG
+ fprintf(stderr, "\n");
+#endif
+ }
+
+#ifdef SEEK_TEST
+ /* seek to three points in the stream and read one packet after each */
+#ifdef DEBUG
+ fprintf(stderr, "seek to middle\n");
+#endif
+ r = nestegg_track_seek(ctx, 0, duration / 2);
+ if (r == 0) {
+#ifdef DEBUG
+ fprintf(stderr, "middle ");
+#endif
+ r = nestegg_read_packet(ctx, &pkt);
+ if (r == 1) {
+ nestegg_packet_track(pkt, &track);
+ nestegg_packet_count(pkt, &cnt);
+ nestegg_packet_tstamp(pkt, &tstamp);
+#ifdef DEBUG
+ fprintf(stderr, "* t %u pts %f frames %u\n", track, tstamp / 1e9, cnt);
+#endif
+ nestegg_free_packet(pkt);
+ } else {
+#ifdef DEBUG
+ fprintf(stderr, "middle seek failed\n");
+#endif
+ }
+ }
+
+#ifdef DEBUG
+ fprintf(stderr, "seek to ~end\n");
+#endif
+ r = nestegg_track_seek(ctx, 0, duration - (duration / 10));
+ if (r == 0) {
+#ifdef DEBUG
+ fprintf(stderr, "end ");
+#endif
+ r = nestegg_read_packet(ctx, &pkt);
+ if (r == 1) {
+ nestegg_packet_track(pkt, &track);
+ nestegg_packet_count(pkt, &cnt);
+ nestegg_packet_tstamp(pkt, &tstamp);
+#ifdef DEBUG
+ fprintf(stderr, "* t %u pts %f frames %u\n", track, tstamp / 1e9, cnt);
+#endif
+ nestegg_free_packet(pkt);
+ } else {
+#ifdef DEBUG
+ fprintf(stderr, "end seek failed\n");
+#endif
+ }
+ }
+
+#ifdef DEBUG
+ fprintf(stderr, "seek to ~start\n");
+#endif
+ r = nestegg_track_seek(ctx, 0, duration / 10);
+ if (r == 0) {
+#ifdef DEBUG
+ fprintf(stderr, "start ");
+#endif
+ r = nestegg_read_packet(ctx, &pkt);
+ if (r == 1) {
+ nestegg_packet_track(pkt, &track);
+ nestegg_packet_count(pkt, &cnt);
+ nestegg_packet_tstamp(pkt, &tstamp);
+#ifdef DEBUG
+ fprintf(stderr, "* t %u pts %f frames %u\n", track, tstamp / 1e9, cnt);
+#endif
+ nestegg_free_packet(pkt);
+ } else {
+#ifdef DEBUG
+ fprintf(stderr, "start seek failed\n");
+#endif
+ }
+ }
+#endif
+
+ /* drain the remaining packets, printing per-frame sizes when DEBUG */
+ while (nestegg_read_packet(ctx, &pkt) > 0) {
+ nestegg_packet_track(pkt, &pkt_track);
+ nestegg_packet_count(pkt, &pkt_cnt);
+ nestegg_packet_tstamp(pkt, &pkt_tstamp);
+
+#ifdef DEBUG
+ fprintf(stderr, "t %u pts %f frames %u: ", pkt_track, pkt_tstamp / 1e9, pkt_cnt);
+#endif
+
+ for (i = 0; i < pkt_cnt; ++i) {
+ nestegg_packet_data(pkt, i, &ptr, &size);
+#ifdef DEBUG
+ fprintf(stderr, "%u ", (unsigned int) size);
+#endif
+ }
+#ifdef DEBUG
+ fprintf(stderr, "\n");
+#endif
+
+ nestegg_free_packet(pkt);
+ }
+
+ nestegg_destroy(ctx);
+ fclose(fp);
+
+ return EXIT_SUCCESS;
+}
diff --git a/release.sh b/release.sh
deleted file mode 100755
index 800bdf82f..000000000
--- a/release.sh
+++ /dev/null
@@ -1,210 +0,0 @@
-#!/bin/sh
-##
-## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-## Use of this source code is governed by a BSD-style license
-## that can be found in the LICENSE file in the root of the source
-## tree. An additional intellectual property rights grant can be found
-## in the file PATENTS. All contributing project authors may
-## be found in the AUTHORS file in the root of the source tree.
-##
-
-
-
-self=$0
-
-for opt; do
- case $opt in
- --clean) clean=yes;;
- -j*) jopt=$opt;;
- *) echo "Unsupported option $opt"; exit 1;;
- esac
-done
-
-TAB="$(printf '\t')"
-cat > release.mk << EOF
-%\$(BUILD_SFX).tar.bz2: %/.done
-${TAB}@echo "\$(subst .tar.bz2,,\$@): tarball"
-${TAB}@cd \$(dir \$<); tar -cf - \$(subst .tar.bz2,,\$@) | bzip2 > ../\$@
-
-%\$(BUILD_SFX).zip: %/.done
-${TAB}@echo "\$(subst .zip,,\$@): zip"
-${TAB}@rm -f \$@; cd \$(dir \$<); zip -rq ../\$@ \$(subst .zip,,\$@)
-
-logs/%\$(BUILD_SFX).log.bz2: %/.done
-${TAB}@echo "\$(subst .log.bz2,,\$(notdir \$@)): tarlog"
-${TAB}@mkdir -p logs
-${TAB}@cat \$< | bzip2 > \$@
-
-%/.done:
-${TAB}@mkdir -p \$(dir \$@)
-${TAB}@echo "\$(dir \$@): configure \$(CONFIG_OPTS) \$(EXTRA_PATH)"
-${TAB}@cd \$(dir \$@); export PATH=\$\$PATH\$(EXTRA_PATH); ../\$(SRC_ROOT)/configure \$(CONFIG_OPTS) >makelog.txt 2>&1
-${TAB}@echo "\$(dir \$@): make"
-${TAB}@cd \$(dir \$@); PATH=\$\$PATH\$(EXTRA_PATH) \$(MAKE) >>makelog.txt 2>&1
-${TAB}@echo "\$(dir \$@): test install"
-${TAB}@cd \$(dir \$@); PATH=\$\$PATH\$(EXTRA_PATH) \$(MAKE) install >>makelog.txt 2>&1
-${TAB}@cd \$(dir \$@)/dist/build; PATH=\$\$PATH\$(EXTRA_PATH) \$(MAKE) >>makelog.txt 2>&1
-${TAB}@echo "\$(dir \$@): install"
-${TAB}@cd \$(dir \$@); PATH=\$\$PATH\$(EXTRA_PATH) \$(MAKE) install DIST_DIR=\$(TGT) >>makelog.txt 2>&1
-${TAB}@touch \$@
-
-#include release-deps.mk
-EOF
-
-#[ -f release-deps.mk ] || \
-# find ${self%/*} -name .git -prune -o -type f -print0 \
-# | xargs -0 -n1 echo \
-# | sed -e 's; ;\\ ;g' | awk '{print "$(TGT)/.done: "$0}' > release-deps.mk
-
-build_config_list() {
- for codec in $CODEC_LIST; do
- for arch in $ARCH_LIST; do
- if [ -n "$OS_LIST" ]; then
- for os in $OS_LIST; do
- CONFIGS="$CONFIGS vpx-${codec}-${arch}-${os}"
- done
- else
- CONFIGS="$CONFIGS vpx-${codec}-${arch}"
- fi
- done
- done
-}
-
-CODEC_LIST="vp8 vp8cx vp8dx"
-case `uname` in
- Linux*)
- ARCH_LIST="x86 x86_64"
- OS_LIST="linux"
- build_config_list
- ARCH_LIST="armv5te armv6 armv7"
- OS_LIST="linux-gcc"
-
- ;;
- CYGWIN*)
- TAR_SFX=.zip
- for vs in vs7 vs8; do
- for arch in x86-win32 x86_64-win64; do
- for msvcrt in md mt; do
- case $vs,$arch in
- vs7,x86_64-win64) continue ;;
- esac
- ARCH_LIST="$ARCH_LIST ${arch}${msvcrt}-${vs}"
- done
- done
- done
- ;;
- Darwin*)
- ARCH_LIST="universal"
- OS_LIST="darwin8 darwin9"
- ;;
- sun_os*)
- ARCH_LIST="x86 x86_64"
- OS_LIST="solaris"
- ;;
-esac
-build_config_list
-
-TAR_SFX=${TAR_SFX:-.tar.bz2}
-ARM_TOOLCHAIN=/usr/local/google/csl-2009q3-67
-for cfg in $CONFIGS; do
- full_cfg=$cfg
- cfg=${cfg#vpx-}
- opts=
- rm -f makelog.txt
-
- case $cfg in
- src-*) opts="$opts --enable-codec-srcs"
- cfg=${cfg#src-}
- ;;
- eval-*) opts="$opts --enable-eval-limit"
- cfg=${cfg#src-}
- ;;
- esac
-
- case $cfg in
- #
- # Linux
- #
- *x86-linux)
- opts="$opts --target=x86-linux-gcc" ;;
- *x86_64-linux)
- opts="$opts --target=x86_64-linux-gcc" ;;
- *arm*-linux-gcc)
- armv=${cfg##*armv}
- armv=${armv%%-*}
- opts="$opts --target=armv${armv}-linux-gcc" ;;
- *arm*-linux-rvct)
- armv=${cfg##*armv}
- armv=${armv%%-*}
- opts="$opts --target=armv${armv}-linux-rvct"
- opts="$opts --libc=${ARM_TOOLCHAIN}/arm-none-linux-gnueabi/libc" ;;
-
-
- #
- # Windows
- #
- # need --enable-debug-libs for now until we're smarter about
- # building the debug/release from the customer installed
- # environment
- *-x86-win32*-vs*)
- opts="$opts --target=x86-win32-vs${cfg##*-vs} --enable-debug-libs";;
- *-x86_64-win64*-vs8)
- opts="$opts --target=x86_64-win64-vs8 --enable-debug-libs" ;;
-
- #
- # Darwin
- #
- *-universal-darwin*)
- opts="$opts --target=universal-darwin${cfg##*-darwin}-gcc" ;;
-
- #
- # Solaris
- #
- *x86-solaris)
- opts="$opts --target=x86-solaris-gcc" ;;
- *x86_64-solaris)
- opts="$opts --target=x86_64-solaris-gcc" ;;
- esac
-
- case $cfg in
- *x86-linux | *x86-solaris) opts="$opts --enable-pic" ;;
- esac
-
- case $cfg in
- *-win[36][24]mt*) opts="$opts --enable-static-msvcrt" ;;
- *-win[36][24]md*) opts="$opts --disable-static-msvcrt" ;;
- esac
-
- opts="$opts --disable-codecs"
- case $cfg in
- vp8*) opts="$opts --enable-vp8" ;;
- esac
- case $cfg in
- *cx-*) opts="${opts}-encoder" ;;
- *dx-*) opts="${opts}-decoder" ;;
- esac
- opts="$opts --enable-postproc"
-
- [ "x${clean}" = "xyes" ] \
- && rm -rf ${full_cfg}${BUILD_SFX}${TAR_SFX} \
- && rm -rf logs/${full_cfg}${BUILD_SFX}.log.bz2
-
- TGT=${full_cfg}${BUILD_SFX}
- BUILD_TARGETS="logs/${TGT}.log.bz2 ${TGT}${TAR_SFX}"
- echo "${BUILD_TARGETS}: CONFIG_OPTS=$opts" >>release.mk
- echo "${BUILD_TARGETS}: TGT=${TGT}" >>release.mk
- case $cfg in
- *-arm*-linux-*)
- echo "${BUILD_TARGETS}: EXTRA_PATH=:${ARM_TOOLCHAIN}/bin/" >>release.mk ;;
- *-vs7)
- echo "${BUILD_TARGETS}: EXTRA_PATH=:/cygdrive/c/Program\ Files/Microsoft\ Visual\ Studio\ .NET\ 2003/Common7/IDE" >>release.mk ;;
- *-vs8)
- echo "${BUILD_TARGETS}: EXTRA_PATH=:/cygdrive/c/Program\ Files/Microsoft\ Visual\ Studio\ 8/Common7/IDE" >>release.mk ;;
- esac
- MAKE_TGTS="$MAKE_TGTS ${TGT}${TAR_SFX} logs/${TGT}.log.bz2"
-done
-
-
-${MAKE:-make} ${jopt:--j3} -f release.mk \
- SRC_ROOT=${self%/*} BUILD_SFX=${BUILD_SFX} ${MAKE_TGTS}
diff --git a/solution.mk b/solution.mk
index 8e852ec5d..6d2c08d06 100644
--- a/solution.mk
+++ b/solution.mk
@@ -22,7 +22,7 @@ else
vpx.sln: $(wildcard *.vcproj)
@echo " [CREATE] $@"
$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
- $(if $(filter %vpx.vcproj,$^),--dep=ivfdec:vpx) \
+ $(if $(filter %vpx.vcproj,$^),--dep=vpxdec:vpx) \
$(if $(filter %vpx.vcproj,$^),--dep=xma:vpx) \
--ver=$(CONFIG_VS_VERSION)\
--target=$(TOOLCHAIN)\
diff --git a/tools_common.c b/tools_common.c
new file mode 100644
index 000000000..d188bbe20
--- /dev/null
+++ b/tools_common.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdio.h>
+#include "tools_common.h"
+#ifdef _WIN32
+#include <io.h>
+#include <fcntl.h>
+#endif
+
+FILE* set_binary_mode(FILE *stream)
+{
+ (void)stream;
+#ifdef _WIN32
+ _setmode(_fileno(stream), _O_BINARY);
+#endif
+ return stream;
+}
diff --git a/tools_common.h b/tools_common.h
new file mode 100644
index 000000000..80c974732
--- /dev/null
+++ b/tools_common.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TOOLS_COMMON_H
+#define TOOLS_COMMON_H
+
+/* Sets a stdio stream into binary mode */
+FILE* set_binary_mode(FILE *stream);
+
+#endif
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c
index 408c25306..9dce8c8f6 100644
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -56,7 +56,7 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
vp8_de_alloc_frame_buffers(oci);
- // our internal buffers are always multiples of 16
+ /* our internal buffers are always multiples of 16 */
if ((width & 0xf) != 0)
width += 16 - (width & 0xf);
@@ -153,7 +153,7 @@ void vp8_setup_version(VP8_COMMON *cm)
cm->full_pixel = 1;
break;
default:
- //4,5,6,7 are reserved for future use
+ /*4,5,6,7 are reserved for future use*/
cm->no_lpf = 0;
cm->simpler_lpf = 0;
cm->use_bilinear_mc_filter = 0;
@@ -177,10 +177,10 @@ void vp8_create_common(VP8_COMMON *oci)
oci->clr_type = REG_YUV;
oci->clamp_type = RECON_CLAMP_REQUIRED;
- // Initialise reference frame sign bias structure to defaults
+ /* Initialise reference frame sign bias structure to defaults */
vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
- // Default disable buffer to buffer copying
+ /* Default disable buffer to buffer copying */
oci->copy_buffer_to_gf = 0;
oci->copy_buffer_to_arf = 0;
}
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
new file mode 100644
index 000000000..83921f807
--- /dev/null
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "g_common.h"
+#include "pragmas.h"
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
+
+extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
+extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
+
+void vp8_arch_arm_common_init(VP8_COMMON *ctx)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
+ int flags = arm_cpu_caps();
+ int has_edsp = flags & HAS_EDSP;
+ int has_media = flags & HAS_MEDIA;
+ int has_neon = flags & HAS_NEON;
+ rtcd->flags = flags;
+
+ /* Override default functions with fastest ones for this CPU. */
+#if HAVE_ARMV6
+ if (has_media)
+ {
+ rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_armv6;
+ rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_armv6;
+ rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_armv6;
+ rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_armv6;
+ rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_armv6;
+ rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_armv6;
+ rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_armv6;
+ rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
+
+ rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6;
+ rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
+ rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
+ rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
+
+ rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
+ rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6;
+ rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
+ rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6;
+ rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
+ rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6;
+ rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
+ rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6;
+
+ rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
+ rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6;
+ rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6;
+ rtcd->recon.recon = vp8_recon_b_armv6;
+ rtcd->recon.recon2 = vp8_recon2b_armv6;
+ rtcd->recon.recon4 = vp8_recon4b_armv6;
+ }
+#endif
+
+#if HAVE_ARMV7
+ if (has_neon)
+ {
+ rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_neon;
+ rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_neon;
+ rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_neon;
+ rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_neon;
+ rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_neon;
+ rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_neon;
+ rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_neon;
+ rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
+
+ rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon;
+ rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
+ rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
+ rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
+
+ rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
+ rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_neon;
+ rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_neon;
+ rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_neon;
+ rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_neon;
+ rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_neon;
+ rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_neon;
+ rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_neon;
+
+ rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon;
+ rtcd->recon.copy8x8 = vp8_copy_mem8x8_neon;
+ rtcd->recon.copy8x4 = vp8_copy_mem8x4_neon;
+ rtcd->recon.recon = vp8_recon_b_neon;
+ rtcd->recon.recon2 = vp8_recon2b_neon;
+ rtcd->recon.recon4 = vp8_recon4b_neon;
+ rtcd->recon.recon_mb = vp8_recon_mb_neon;
+
+ }
+#endif
+
+#endif
+
+#if HAVE_ARMV6
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (has_media)
+#endif
+ {
+ vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
+ vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
+ }
+#endif
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (has_neon)
+#endif
+ {
+ vp8_build_intra_predictors_mby_ptr =
+ vp8_build_intra_predictors_mby_neon;
+ vp8_build_intra_predictors_mby_s_ptr =
+ vp8_build_intra_predictors_mby_s_neon;
+ }
+#endif
+}
diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm
index 8bc6d7735..03b5bccd7 100644
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ b/vp8/common/arm/armv6/filter_v6.asm
@@ -11,6 +11,7 @@
EXPORT |vp8_filter_block2d_first_pass_armv6|
EXPORT |vp8_filter_block2d_second_pass_armv6|
+ EXPORT |vp8_filter4_block2d_second_pass_armv6|
EXPORT |vp8_filter_block2d_first_pass_only_armv6|
EXPORT |vp8_filter_block2d_second_pass_only_armv6|
@@ -192,6 +193,64 @@
ENDP
+;---------------------------------
+; r0 short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int output_pitch,
+; r3 unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter4_block2d_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #36] ; vp8_filter address
+ mov r7, r3, lsl #16 ; height is top part of counter
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ add lr, r1, r3 ; save final destination pointer
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ pkhbt r12, r5, r4 ; pack the filter differently
+ pkhbt r11, r6, r5
+ mov r4, #0x40 ; rounding factor (for smlad{x})
+
+|height_loop_2nd_4|
+ ldrd r8, [r0, #-4] ; load the data
+ orr r7, r7, r3, lsr #1 ; loop counter
+
+|width_loop_2nd_4|
+ ldr r10, [r0, #4]!
+ smladx r6, r9, r12, r4 ; apply filter
+ pkhbt r8, r9, r8
+ smlad r5, r8, r12, r4
+ pkhbt r8, r10, r9
+ smladx r6, r10, r11, r6
+ sub r7, r7, #1
+ smlad r5, r8, r11, r5
+
+ mov r8, r9 ; shift the data for the next loop
+ mov r9, r10
+
+ usat r6, #8, r6, asr #7 ; shift and clamp
+ usat r5, #8, r5, asr #7
+
+ strb r5, [r1], r2 ; the result is transposed back and stored
+ tst r7, #0xff
+ strb r6, [r1], r2
+
+ bne width_loop_2nd_4
+
+ subs r7, r7, #0x10000
+ add r0, r0, #16 ; update src for next loop
+ sub r1, lr, r7, lsr #16 ; update dst for next loop
+
+ bne height_loop_2nd_4
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
;------------------------------------
; r0 unsigned char *src_ptr
; r1 unsigned char *output_ptr,
diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
index 8fb80ef29..8b9939484 100644
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -25,10 +25,10 @@
;and the result is stored in transpose.
|vp8_sixtap_predict8x4_armv6| PROC
stmdb sp!, {r4 - r11, lr}
- sub sp, sp, #184 ;reserve space on stack for temporary storage: 20x(8+1) +4
+ str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
cmp r2, #0 ;skip first_pass filter if xoffset=0
- str r3, [sp], #4 ;store yoffset
+ add lr, sp, #4 ;point to temporary buffer
beq skip_firstpass_filter
;first-pass filter
@@ -45,7 +45,6 @@
mov r2, #0x90000 ; height=9 is top part of counter
sub r1, r1, #8
- mov lr, #20
|first_pass_hloop_v6|
ldrb r6, [r0, #-5] ; load source data
@@ -83,10 +82,10 @@
tst r2, #0xff ; test loop counter
usat r11, #8, r11, asr #7
add r12, r12, #0x40
- strh r11, [sp], lr ; result is transposed and stored, which
+ strh r11, [lr], #20 ; result is transposed and stored, which
usat r12, #8, r12, asr #7
- strh r12, [sp], lr
+ strh r12, [lr], #20
movne r11, r6
movne r12, r7
@@ -107,8 +106,7 @@
subs r2, r2, #0x10000
- mov r6, #158
- sub sp, sp, r6
+ sub lr, lr, #158
add r0, r0, r1 ; move to next input line
@@ -116,10 +114,7 @@
;second pass filter
secondpass_filter
- mov r1, #18
- sub sp, sp, r1 ; 18+4
-
- ldr r3, [sp, #-4] ; load back yoffset
+ ldr r3, [sp], #4 ; load back yoffset
ldr r0, [sp, #216] ; load dst address from stack 180+36
ldr r1, [sp, #220] ; load dst stride from stack 180+40
@@ -192,30 +187,28 @@ skip_firstpass_filter
sub r0, r0, r1, lsl #1
sub r1, r1, #8
mov r2, #9
- mov r3, #20
skip_firstpass_hloop
ldrb r4, [r0], #1 ; load data
subs r2, r2, #1
ldrb r5, [r0], #1
- strh r4, [sp], r3 ; store it to immediate buffer
+ strh r4, [lr], #20 ; store it to immediate buffer
ldrb r6, [r0], #1 ; load data
- strh r5, [sp], r3
+ strh r5, [lr], #20
ldrb r7, [r0], #1
- strh r6, [sp], r3
+ strh r6, [lr], #20
ldrb r8, [r0], #1
- strh r7, [sp], r3
+ strh r7, [lr], #20
ldrb r9, [r0], #1
- strh r8, [sp], r3
+ strh r8, [lr], #20
ldrb r10, [r0], #1
- strh r9, [sp], r3
+ strh r9, [lr], #20
ldrb r11, [r0], #1
- strh r10, [sp], r3
+ strh r10, [lr], #20
add r0, r0, r1 ; move to next input line
- strh r11, [sp], r3
+ strh r11, [lr], #20
- mov r4, #158
- sub sp, sp, r4 ; move over to next column
+ sub lr, lr, #158 ; move over to next column
bne skip_firstpass_hloop
b secondpass_filter
diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c
index 247f95b6c..65afb41a1 100644
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -49,7 +49,7 @@ extern void vp8_filter_block2d_bil_second_pass_armv6
const short *vp8_filter
);
-/*
+#if 0
void vp8_filter_block2d_bil_first_pass_6
(
unsigned char *src_ptr,
@@ -66,14 +66,14 @@ void vp8_filter_block2d_bil_first_pass_6
{
for ( j=0; j<output_width; j++ )
{
- // Apply bilinear filter
+ /* Apply bilinear filter */
output_ptr[j] = ( ( (int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[1] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;
src_ptr++;
}
- // Next row...
+ /* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
@@ -96,7 +96,7 @@ void vp8_filter_block2d_bil_second_pass_6
{
for ( j=0; j<output_width; j++ )
{
- // Apply filter
+ /* Apply filter */
Temp = ((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[output_width] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT/2);
@@ -104,12 +104,12 @@ void vp8_filter_block2d_bil_second_pass_6
src_ptr++;
}
- // Next row...
- //src_ptr += src_pixels_per_line - output_width;
+ /* Next row... */
+ /*src_ptr += src_pixels_per_line - output_width;*/
output_ptr += output_pitch;
}
}
-*/
+#endif
void vp8_filter_block2d_bil_armv6
(
@@ -124,13 +124,13 @@ void vp8_filter_block2d_bil_armv6
)
{
- unsigned short FData[36*16]; // Temp data bufffer used in filtering
+ unsigned short FData[36*16]; /* Temp data bufffer used in filtering */
- // First filter 1-D horizontally...
- // pixel_step = 1;
+ /* First filter 1-D horizontally... */
+ /* pixel_step = 1; */
vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);
- // then 1-D vertically...
+ /* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter);
}
diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c
index 5ed4f8094..b4f2fe6ca 100644
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -20,13 +20,13 @@
DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
{
- { 0, 0, 128, 0, 0, 0 }, // note that 1/8 pel positions are just as per alpha -0.5 bicubic
+ { 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
{ 0, -6, 123, 12, -1, 0 },
- { 2, -11, 108, 36, -8, 1 }, // New 1/4 pel 6 tap filter
+ { 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
{ 0, -9, 93, 50, -6, 0 },
- { 3, -16, 77, 77, -16, 3 }, // New 1/2 pel 6 tap filter
+ { 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */
{ 0, -6, 50, 93, -9, 0 },
- { 1, -8, 36, 108, -11, 2 }, // New 1/4 pel 6 tap filter
+ { 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
{ 0, -1, 12, 123, -6, 0 },
};
@@ -50,6 +50,15 @@ extern void vp8_filter_block2d_second_pass_armv6
const short *vp8_filter
);
+extern void vp8_filter4_block2d_second_pass_armv6
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int cnt,
+ const short *vp8_filter
+);
+
extern void vp8_filter_block2d_first_pass_only_armv6
(
unsigned char *src_ptr,
@@ -84,39 +93,43 @@ void vp8_sixtap_predict_armv6
{
const short *HFilter;
const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); // Temp data bufffer used in filtering
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data bufffer used in filtering */
- HFilter = sub_pel_filters[xoffset]; // 6 tap
- VFilter = sub_pel_filters[yoffset]; // 6 tap
+ HFilter = sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = sub_pel_filters[yoffset]; /* 6 tap */
- // Vfilter is null. First pass only
+ /* Vfilter is null. First pass only */
if (xoffset && !yoffset)
{
- //vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
- //vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );
+ /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
+ vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
}
- // Hfilter is null. Second pass only
+ /* Hfilter is null. Second pass only */
else if (!xoffset && yoffset)
{
vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
}
else
{
- // Vfilter is a 4 tap filter
+ /* Vfilter is a 4 tap filter */
if (yoffset & 0x1)
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
- // Vfilter is 6 tap filter
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
+ /* Vfilter is 6 tap filter */
else
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
-
- vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
}
}
-/*
+#if 0
void vp8_sixtap_predict8x4_armv6
(
unsigned char *src_ptr,
@@ -129,33 +142,33 @@ void vp8_sixtap_predict8x4_armv6
{
const short *HFilter;
const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); // Temp data bufffer used in filtering
-
- HFilter = sub_pel_filters[xoffset]; // 6 tap
- VFilter = sub_pel_filters[yoffset]; // 6 tap
-
-
-// if (xoffset && !yoffset)
-// {
-// vp8_filter_block2d_first_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
-// }
- // Hfilter is null. Second pass only
-// else if (!xoffset && yoffset)
-// {
-// vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
-// }
-// else
-// {
-// if (yoffset & 0x1)
- // vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
- // else
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
+
+ HFilter = sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = sub_pel_filters[yoffset]; /* 6 tap */
+
+
+ /*if (xoffset && !yoffset)
+ {
+ vp8_filter_block2d_first_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
+ }*/
+ /* Hfilter is null. Second pass only */
+ /*else if (!xoffset && yoffset)
+ {
+ vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
+ }
+ else
+ {
+ if (yoffset & 0x1)
+ vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
+ else*/
vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
-// }
+ /*}*/
}
-*/
+#endif
void vp8_sixtap_predict8x8_armv6
(
@@ -169,16 +182,16 @@ void vp8_sixtap_predict8x8_armv6
{
const short *HFilter;
const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); // Temp data bufffer used in filtering
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
- HFilter = sub_pel_filters[xoffset]; // 6 tap
- VFilter = sub_pel_filters[yoffset]; // 6 tap
+ HFilter = sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = sub_pel_filters[yoffset]; /* 6 tap */
if (xoffset && !yoffset)
{
vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
}
- // Hfilter is null. Second pass only
+ /* Hfilter is null. Second pass only */
else if (!xoffset && yoffset)
{
vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
@@ -186,11 +199,15 @@ void vp8_sixtap_predict8x8_armv6
else
{
if (yoffset & 0x1)
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ }
else
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
-
- vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ }
}
}
@@ -207,16 +224,16 @@ void vp8_sixtap_predict16x16_armv6
{
const short *HFilter;
const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); // Temp data bufffer used in filtering
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); /* Temp data bufffer used in filtering */
- HFilter = sub_pel_filters[xoffset]; // 6 tap
- VFilter = sub_pel_filters[yoffset]; // 6 tap
+ HFilter = sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = sub_pel_filters[yoffset]; /* 6 tap */
if (xoffset && !yoffset)
{
vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
}
- // Hfilter is null. Second pass only
+ /* Hfilter is null. Second pass only */
else if (!xoffset && yoffset)
{
vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
@@ -224,11 +241,15 @@ void vp8_sixtap_predict16x16_armv6
else
{
if (yoffset & 0x1)
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ }
else
+ {
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
-
- vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ }
}
}
diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h
index f28d7f649..8b8d17917 100644
--- a/vp8/common/arm/idct_arm.h
+++ b/vp8/common/arm/idct_arm.h
@@ -19,6 +19,7 @@ extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
@@ -34,6 +35,7 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
#endif
+#endif
#if HAVE_ARMV7
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
@@ -42,6 +44,7 @@ extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon
@@ -57,5 +60,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
#endif
+#endif
#endif
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index f86bca1ea..a81c50588 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -35,8 +35,8 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
#if HAVE_ARMV6
-//ARMV6 loopfilter functions
-// Horizontal MB filtering
+/*ARMV6 loopfilter functions*/
+/* Horizontal MB filtering */
void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -60,7 +60,7 @@ void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
-// Vertical MB Filtering
+/* Vertical MB Filtering */
void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -84,7 +84,7 @@ void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsi
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
-// Horizontal B Filtering
+/* Horizontal B Filtering */
void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -112,7 +112,7 @@ void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
-// Vertical B Filtering
+/* Vertical B Filtering */
void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -142,8 +142,8 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
#endif
#if HAVE_ARMV7
-// NEON loopfilter functions
-// Horizontal MB filtering
+/* NEON loopfilter functions */
+/* Horizontal MB filtering */
void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -164,7 +164,7 @@ void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsig
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
-// Vertical MB Filtering
+/* Vertical MB Filtering */
void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -185,7 +185,7 @@ void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsig
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
-// Horizontal B Filtering
+/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -210,7 +210,7 @@ void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
-// Vertical B Filtering
+/* Vertical B Filtering */
void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h
index 6c3628ae9..cd62207d7 100644
--- a/vp8/common/arm/loopfilter_arm.h
+++ b/vp8/common/arm/loopfilter_arm.h
@@ -22,6 +22,7 @@ extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_armv6
@@ -46,6 +47,7 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
#endif
+#endif
#if HAVE_ARMV7
extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
@@ -57,6 +59,7 @@ extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_neon
@@ -81,5 +84,6 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
#endif
+#endif
#endif
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
new file mode 100644
index 000000000..bf0c35721
--- /dev/null
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -0,0 +1,409 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
+ EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
+ EXPORT |vp8_loop_filter_vertical_edge_y_neon|
+ EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; flimit, limit, and thresh should be positive numbers.
+; All 16 elements in these variables are equal.
+
+; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
+; const signed char *flimit,
+; const signed char *limit,
+; const signed char *thresh,
+; int count)
+; r0 unsigned char *src
+; r1 int pitch
+; r2 const signed char *flimit
+; r3 const signed char *limit
+; sp const signed char *thresh,
+; sp+4 int count (unused)
+|vp8_loop_filter_horizontal_edge_y_neon| PROC
+ stmdb sp!, {lr}
+ vld1.s8 {d0[], d1[]}, [r2] ; flimit
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
+ ldr r12, [sp, #4] ; load thresh pointer
+
+ vld1.u8 {q3}, [r2], r1 ; p3
+ vld1.u8 {q4}, [r2], r1 ; p2
+ vld1.u8 {q5}, [r2], r1 ; p1
+ vld1.u8 {q6}, [r2], r1 ; p0
+ vld1.u8 {q7}, [r2], r1 ; q0
+ vld1.u8 {q8}, [r2], r1 ; q1
+ vld1.u8 {q9}, [r2], r1 ; q2
+ vld1.u8 {q10}, [r2] ; q3
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+ sub r0, r0, r1, lsl #1
+
+ bl vp8_loop_filter_neon
+
+ vst1.u8 {q5}, [r0], r1 ; store op1
+ vst1.u8 {q6}, [r0], r1 ; store op0
+ vst1.u8 {q7}, [r0], r1 ; store oq0
+ vst1.u8 {q8}, [r0], r1 ; store oq1
+
+ ldmia sp!, {pc}
+ ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
+
+; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
+; const signed char *flimit,
+; const signed char *limit,
+; const signed char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; sp const signed char *thresh,
+; sp+4 unsigned char *v
+|vp8_loop_filter_horizontal_edge_uv_neon| PROC
+ stmdb sp!, {lr}
+ vld1.s8 {d0[], d1[]}, [r2] ; flimit
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ ldr r2, [sp, #8] ; load v ptr
+
+ sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ vld1.u8 {d6}, [r3], r1 ; p3
+ vld1.u8 {d8}, [r3], r1 ; p2
+ vld1.u8 {d10}, [r3], r1 ; p1
+ vld1.u8 {d12}, [r3], r1 ; p0
+ vld1.u8 {d14}, [r3], r1 ; q0
+ vld1.u8 {d16}, [r3], r1 ; q1
+ vld1.u8 {d18}, [r3], r1 ; q2
+ vld1.u8 {d20}, [r3] ; q3
+
+ ldr r3, [sp, #4] ; load thresh pointer
+
+ sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
+ vld1.u8 {d7}, [r12], r1 ; p3
+ vld1.u8 {d9}, [r12], r1 ; p2
+ vld1.u8 {d11}, [r12], r1 ; p1
+ vld1.u8 {d13}, [r12], r1 ; p0
+ vld1.u8 {d15}, [r12], r1 ; q0
+ vld1.u8 {d17}, [r12], r1 ; q1
+ vld1.u8 {d19}, [r12], r1 ; q2
+ vld1.u8 {d21}, [r12] ; q3
+
+ vld1.s8 {d4[], d5[]}, [r3] ; thresh
+
+ bl vp8_loop_filter_neon
+
+ sub r0, r0, r1, lsl #1
+ sub r2, r2, r1, lsl #1
+
+ vst1.u8 {d10}, [r0], r1 ; store u op1
+ vst1.u8 {d11}, [r2], r1 ; store v op1
+ vst1.u8 {d12}, [r0], r1 ; store u op0
+ vst1.u8 {d13}, [r2], r1 ; store v op0
+ vst1.u8 {d14}, [r0], r1 ; store u oq0
+ vst1.u8 {d15}, [r2], r1 ; store v oq0
+ vst1.u8 {d16}, [r0] ; store u oq1
+ vst1.u8 {d17}, [r2] ; store v oq1
+
+ ldmia sp!, {pc}
+ ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
+
+; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+; const signed char *flimit,
+; const signed char *limit,
+; const signed char *thresh,
+; int count)
+; r0 unsigned char *src,
+; r1 int pitch,
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; sp const signed char *thresh,
+; sp+4 int count (unused)
+|vp8_loop_filter_vertical_edge_y_neon| PROC
+ stmdb sp!, {lr}
+ vld1.s8 {d0[], d1[]}, [r2] ; flimit
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ sub r2, r0, #4 ; src ptr down by 4 columns
+ sub r0, r0, #2 ; dst ptr
+ ldr r12, [sp, #4] ; load thresh pointer
+
+ vld1.u8 {d6}, [r2], r1 ; load first 8-line src data
+ vld1.u8 {d8}, [r2], r1
+ vld1.u8 {d10}, [r2], r1
+ vld1.u8 {d12}, [r2], r1
+ vld1.u8 {d14}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d18}, [r2], r1
+ vld1.u8 {d20}, [r2], r1
+
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+
+ vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
+ vld1.u8 {d9}, [r2], r1
+ vld1.u8 {d11}, [r2], r1
+ vld1.u8 {d13}, [r2], r1
+ vld1.u8 {d15}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d19}, [r2], r1
+ vld1.u8 {d21}, [r2]
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ bl vp8_loop_filter_neon
+
+ vswp d12, d11
+ vswp d16, d13
+ vswp d14, d12
+ vswp d16, d15
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+ vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
+ vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r0], r1
+ vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
+ vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r0], r1
+ vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
+ vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1
+ vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0], r1
+ vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r0]
+
+ ldmia sp!, {pc}
+ ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
+
+; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+; const signed char *flimit,
+; const signed char *limit,
+; const signed char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; sp const signed char *thresh,
+; sp+4 unsigned char *v
+|vp8_loop_filter_vertical_edge_uv_neon| PROC
+ stmdb sp!, {lr}
+ sub r12, r0, #4 ; move u pointer down by 4 columns
+ vld1.s8 {d0[], d1[]}, [r2] ; flimit
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+
+ ldr r2, [sp, #8] ; load v ptr
+
+ vld1.u8 {d6}, [r12], r1 ;load u data
+ vld1.u8 {d8}, [r12], r1
+ vld1.u8 {d10}, [r12], r1
+ vld1.u8 {d12}, [r12], r1
+ vld1.u8 {d14}, [r12], r1
+ vld1.u8 {d16}, [r12], r1
+ vld1.u8 {d18}, [r12], r1
+ vld1.u8 {d20}, [r12]
+
+ sub r3, r2, #4 ; move v pointer down by 4 columns
+ vld1.u8 {d7}, [r3], r1 ;load v data
+ vld1.u8 {d9}, [r3], r1
+ vld1.u8 {d11}, [r3], r1
+ vld1.u8 {d13}, [r3], r1
+ vld1.u8 {d15}, [r3], r1
+ vld1.u8 {d17}, [r3], r1
+ vld1.u8 {d19}, [r3], r1
+ vld1.u8 {d21}, [r3]
+
+ ldr r12, [sp, #4] ; load thresh pointer
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+
+ bl vp8_loop_filter_neon
+
+ sub r0, r0, #2
+ sub r2, r2, #2
+
+ vswp d12, d11
+ vswp d16, d13
+ vswp d14, d12
+ vswp d16, d15
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+ vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+ vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+ vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
+ vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
+
+ ldmia sp!, {pc}
+ ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
+
+; void vp8_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store.
+
+; r0-r3 PRESERVE
+; q0 flimit
+; q1 limit
+; q2 thresh
+; q3 p3
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3
+|vp8_loop_filter_neon| PROC
+ ldr r12, _lf_coeff_
+
+ ; vp8_filter_mask
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; abs(q3 - q2)
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ vmax.u8 q11, q11, q12
+ vmax.u8 q12, q13, q14
+ vmax.u8 q3, q3, q4
+ vmax.u8 q15, q11, q12
+
+ ; vp8_hevmask
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 q15, q15, q3
+
+ vadd.u8 q0, q0, q0 ; flimit * 2
+ vadd.u8 q0, q0, q1 ; flimit * 2 + limit
+ vcge.u8 q15, q1, q15
+
+ vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
+ vshr.u8 q2, q2, #1 ; a = a / 2
+ vqadd.u8 q9, q9, q2 ; a = b + a
+ vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
+
+ vld1.u8 {q0}, [r12]!
+
+ ; vp8_filter() function
+ ; convert to signed
+ veor q7, q7, q0 ; qs0
+ veor q6, q6, q0 ; ps0
+ veor q5, q5, q0 ; ps1
+ veor q8, q8, q0 ; qs1
+
+ vld1.u8 {q10}, [r12]!
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vmovl.u8 q4, d20
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
+ vorr q14, q13, q14 ; vp8_hevmask
+
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; vp8_filter &= hev
+ vand q15, q15, q9 ; vp8_filter_mask
+
+ vaddw.s8 q2, q2, d2
+ vaddw.s8 q11, q11, d3
+
+ vld1.u8 {q9}, [r12]!
+
+ ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q11
+ vand q1, q1, q15 ; vp8_filter &= mask
+
+ vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
+ vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; Filter1 >>= 3
+
+ vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
+ vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
+
+ ; outer tap adjustments: ++vp8_filter >> 1
+ vrshr.s8 q1, q1, #1
+ vbic q1, q1, q14 ; vp8_filter &= ~hev
+
+ vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
+ vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
+
+ veor q5, q13, q0 ; *op1 = u^0x80
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+ veor q8, q12, q0 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |vp8_loop_filter_neon|
+
+ AREA loopfilter_dat, DATA, READONLY
+_lf_coeff_
+ DCD lf_coeff
+lf_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
+
+ END
diff --git a/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm b/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm
deleted file mode 100644
index c0c3e337c..000000000
--- a/vp8/common/arm/neon/loopfilterhorizontaledge_uv_neon.asm
+++ /dev/null
@@ -1,206 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *u,
-; r1 int p, //pitch
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; stack(r4) const signed char *thresh,
-; stack(r5) unsigned char *v
-
-|vp8_loop_filter_horizontal_edge_uv_neon| PROC
- sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
- vld1.s8 {d0[], d1[]}, [r2] ; flimit
-
- ldr r2, [sp, #4] ; load v ptr
- ldr r12, [sp, #0] ; load thresh pointer
-
- sub r2, r2, r1, lsl #2 ; move v pointer down by 4 lines
-
- vld1.u8 {d6}, [r0], r1 ; p3
- vld1.u8 {d7}, [r2], r1 ; p3
- vld1.u8 {d8}, [r0], r1 ; p2
- vld1.u8 {d9}, [r2], r1 ; p2
- vld1.u8 {d10}, [r0], r1 ; p1
- vld1.u8 {d11}, [r2], r1 ; p1
- vld1.u8 {d12}, [r0], r1 ; p0
- vld1.u8 {d13}, [r2], r1 ; p0
- vld1.u8 {d14}, [r0], r1 ; q0
- vld1.u8 {d15}, [r2], r1 ; q0
- vld1.u8 {d16}, [r0], r1 ; q1
- vld1.u8 {d17}, [r2], r1 ; q1
- vld1.u8 {d18}, [r0], r1 ; q2
- vld1.u8 {d19}, [r2], r1 ; q2
- vld1.u8 {d20}, [r0], r1 ; q3
- vld1.u8 {d21}, [r2], r1 ; q3
-
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
-
- ldr r12, _lfhuv_coeff_
- ;vp8_filter_mask() function
- ;vp8_hevmask() function
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q4, q10, q9 ; abs(q3 - q2)
- vabd.u8 q9, q6, q7 ; abs(p0 - q0)
-
- vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
- vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
- vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
- vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
-
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
-
- vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
- vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1
- vadd.u8 q0, q0, q0 ; flimit * 2
- vadd.u8 q0, q0, q1 ; flimit * 2 + limit
-
- vand q15, q15, q12
- vand q10, q10, q11
- vand q3, q3, q4
-
- vabd.u8 q2, q5, q8 ; abs(p1 - q1)
- vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2
- vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2
- vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
- vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
-
- vld1.u8 {q0}, [r12]!
-
- vand q15, q15, q10
-
- ;vp8_filter() function
- veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
- veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
- veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
- veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
-;;;;;;;;;;;;;;
- vld1.u8 {q10}, [r12]!
-
- ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q11, d15, d13
-
- vand q3, q3, q9
- vmovl.u8 q4, d20
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
- vorr q14, q13, q14 ; q14: vp8_hevmask
-
- ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0)
- vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
- vmul.i16 q11, q11, q4
-
- vand q1, q1, q14 ; vp8_filter &= hev
- vand q15, q15, q3 ; q15: vp8_filter_mask
- ;;
- ;vld1.u8 {q4}, [r12]! ;no need 7 any more
-
- ;vqadd.s8 q1, q1, q2
- vaddw.s8 q2, q2, d2
- vaddw.s8 q11, q11, d3
-
- vld1.u8 {q9}, [r12]!
- ;
- vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d3, q11
- ;;
-
- vand q1, q1, q15 ; vp8_filter &= mask
- ;;
-;;;;;;;;;;;;
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
-; vand q2, q1, q4 ; s = vp8_filter & 7
-; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
- ;;;;
-; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3
-; vceq.i8 q2, q2, q9 ; s = (s==4)*-1
- ;;
-; ;calculate output
-; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
-; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; q10=3
- vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
- vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q1, q1, #3 ; Filter1 >>= 3
-
- ;calculate output
- vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
- vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1
-
- sub r0, r0, r1, lsl #2
- sub r0, r0, r1, lsl #1
- ;
-
- vbic q1, q1, q14 ; vp8_filter &= ~hev
-
- sub r2, r2, r1, lsl #2
- sub r2, r2, r1, lsl #1
- ;;
-
- vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
- ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u)
- vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
- ;
-
- veor q5, q13, q0 ; *op1 = u^0x80
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
- veor q8, q12, q0 ; *oq1 = u^0x80
- ;
-
- vst1.u8 {d10}, [r0], r1 ; store u op1
- vst1.u8 {d11}, [r2], r1 ; store v op1
- vst1.u8 {d12}, [r0], r1 ; store u op0
- vst1.u8 {d13}, [r2], r1 ; store v op0
- vst1.u8 {d14}, [r0], r1 ; store u oq0
- vst1.u8 {d15}, [r2], r1 ; store v oq0
- vst1.u8 {d16}, [r0], r1 ; store u oq1
- vst1.u8 {d17}, [r2], r1 ; store v oq1
-
- bx lr
- ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
-
-;-----------------
- AREA hloopfilteruv_dat, DATA, READWRITE ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_lfhuv_coeff_
- DCD lfhuv_coeff
-lfhuv_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
- DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
-
- END
diff --git a/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm b/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm
deleted file mode 100644
index a8314cdd7..000000000
--- a/vp8/common/arm/neon/loopfilterhorizontaledge_y_neon.asm
+++ /dev/null
@@ -1,189 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *s,
-; r1 int p, //pitch
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; stack(r4) const signed char *thresh,
-; //stack(r5) int count --unused
-
-|vp8_loop_filter_horizontal_edge_y_neon| PROC
- sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines
- ldr r12, [sp, #0] ; load thresh pointer
-
- vld1.u8 {q3}, [r0], r1 ; p3
- vld1.s8 {d0[], d1[]}, [r2] ; flimit
- vld1.u8 {q4}, [r0], r1 ; p2
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- vld1.u8 {q5}, [r0], r1 ; p1
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
- vld1.u8 {q6}, [r0], r1 ; p0
- ldr r12, _lfhy_coeff_
- vld1.u8 {q7}, [r0], r1 ; q0
-
- ;vp8_filter_mask() function
- ;vp8_hevmask() function
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vld1.u8 {q8}, [r0], r1 ; q1
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vld1.u8 {q9}, [r0], r1 ; q2
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vld1.u8 {q10}, [r0], r1 ; q3
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q4, q10, q9 ; abs(q3 - q2)
- vabd.u8 q9, q6, q7 ; abs(p0 - q0)
-
- vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
- vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
- vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
- vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
-
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
-
- vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
- vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1
- vadd.u8 q0, q0, q0 ; flimit * 2
- vadd.u8 q0, q0, q1 ; flimit * 2 + limit
-
- vand q15, q15, q12
- vand q10, q10, q11
- vand q3, q3, q4
-
- vabd.u8 q2, q5, q8 ; abs(p1 - q1)
- vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2
- vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2
- vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
- vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
-
- vld1.u8 {q0}, [r12]!
-
- vand q15, q15, q10
-
- ;vp8_filter() function
- veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
- veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
- veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
- veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
-;;;;;;;;;;;;;;
- vld1.u8 {q10}, [r12]!
-
- ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q11, d15, d13
-
- vand q3, q3, q9
- vmovl.u8 q4, d20
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
- vorr q14, q13, q14 ; q14: vp8_hevmask
-
- ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0)
- vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
- vmul.i16 q11, q11, q4
-
- vand q1, q1, q14 ; vp8_filter &= hev
- vand q15, q15, q3 ; q15: vp8_filter_mask
- ;;
- ;vld1.u8 {q4}, [r12]! ;no need 7 any more
-
- ;vqadd.s8 q1, q1, q2
- vaddw.s8 q2, q2, d2
- vaddw.s8 q11, q11, d3
-
- vld1.u8 {q9}, [r12]!
- ;
- vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d3, q11
- ;;
-
- vand q1, q1, q15 ; vp8_filter &= mask
- ;;
-;;;;;;;;;;;;
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
-; vand q2, q1, q4 ; s = vp8_filter & 7
-; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
- ;;;;
-; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3
-; vceq.i8 q2, q2, q9 ; s = (s==4)*-1
- ;;
-; ;calculate output
-; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
-; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; q10=3
- vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
- vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q1, q1, #3 ; Filter1 >>= 3
-
- ;calculate output
- vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
- vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1
-
- sub r0, r0, r1, lsl #2
- sub r0, r0, r1, lsl #1
- ;
-
- vbic q1, q1, q14 ; vp8_filter &= ~hev
- ;
- add r2, r1, r0
-
- vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
- ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u)
- vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
-
- add r3, r2, r1
-
- veor q5, q13, q0 ; *op1 = u^0x80
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
- veor q8, q12, q0 ; *oq1 = u^0x80
-
- add r12, r3, r1
-
- vst1.u8 {q5}, [r0] ; store op1
- vst1.u8 {q6}, [r2] ; store op0
- vst1.u8 {q7}, [r3] ; store oq0
- vst1.u8 {q8}, [r12] ; store oq1
-
- bx lr
- ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
-
-;-----------------
- AREA hloopfiltery_dat, DATA, READWRITE ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_lfhy_coeff_
- DCD lfhy_coeff
-lfhy_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
- DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
-
- END
diff --git a/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm b/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm
deleted file mode 100644
index 57913d2bc..000000000
--- a/vp8/common/arm/neon/loopfilterverticaledge_uv_neon.asm
+++ /dev/null
@@ -1,232 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *u,
-; r1 int p, //pitch
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; stack(r4) const signed char *thresh,
-; stack(r5) unsigned char *v
-
-|vp8_loop_filter_vertical_edge_uv_neon| PROC
- sub r0, r0, #4 ; move u pointer down by 4 columns
- vld1.s8 {d0[], d1[]}, [r2] ; flimit
-
- ldr r2, [sp, #4] ; load v ptr
- ldr r12, [sp, #0] ; load thresh pointer
-
- sub r2, r2, #4 ; move v pointer down by 4 columns
-
- vld1.u8 {d6}, [r0], r1 ;load u data
- vld1.u8 {d7}, [r2], r1 ;load v data
- vld1.u8 {d8}, [r0], r1
- vld1.u8 {d9}, [r2], r1
- vld1.u8 {d10}, [r0], r1
- vld1.u8 {d11}, [r2], r1
- vld1.u8 {d12}, [r0], r1
- vld1.u8 {d13}, [r2], r1
- vld1.u8 {d14}, [r0], r1
- vld1.u8 {d15}, [r2], r1
- vld1.u8 {d16}, [r0], r1
- vld1.u8 {d17}, [r2], r1
- vld1.u8 {d18}, [r0], r1
- vld1.u8 {d19}, [r2], r1
- vld1.u8 {d20}, [r0], r1
- vld1.u8 {d21}, [r2], r1
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
-
- ldr r12, _vlfuv_coeff_
- ;vp8_filter_mask() function
- ;vp8_hevmask() function
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q4, q10, q9 ; abs(q3 - q2)
- vabd.u8 q9, q6, q7 ; abs(p0 - q0)
-
- vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
- vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
- vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
- vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
-
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
-
- vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
- vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1
- vadd.u8 q0, q0, q0 ; flimit * 2
- vadd.u8 q0, q0, q1 ; flimit * 2 + limit
-
- vand q15, q15, q12
- vand q10, q10, q11
- vand q3, q3, q4
-
- vabd.u8 q2, q5, q8 ; abs(p1 - q1)
- vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2
- vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2
- vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
- vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
-
- vld1.u8 {q0}, [r12]!
-
- vand q15, q15, q10
-
-
- ;vp8_filter() function
- veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
- veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
- veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
- veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
-;;;;;;;;;;;;;;
- vld1.u8 {q10}, [r12]!
-
- ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q11, d15, d13
-
- vand q3, q3, q9
- vmovl.u8 q4, d20
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
- vorr q14, q13, q14 ; q14: vp8_hevmask
-
- ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0)
- vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
- vmul.i16 q11, q11, q4
-
- vand q1, q1, q14 ; vp8_filter &= hev
- vand q15, q15, q3 ; q15: vp8_filter_mask
- ;;
- ;vld1.u8 {q4}, [r12]! ;no need 7 any more
-
- ;vqadd.s8 q1, q1, q2
- vaddw.s8 q2, q2, d2
- vaddw.s8 q11, q11, d3
-
- vld1.u8 {q9}, [r12]!
- ;
- vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d3, q11
- ;;
-
- vand q1, q1, q15 ; vp8_filter &= mask
- ;;
-;;;;;;;;;;;;
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
-; vand q2, q1, q4 ; s = vp8_filter & 7
-; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
- ;;;;
-; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3
-; vceq.i8 q2, q2, q9 ; s = (s==4)*-1
- ;;
-; ;calculate output
-; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
-; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; q10=3
- vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
- vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q1, q1, #3 ; Filter1 >>= 3
- ;calculate output
- vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
- vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1
-
- sub r0, r0, r1, lsl #3
- add r0, r0, #2
-
- vbic q1, q1, q14 ; vp8_filter &= ~hev
-
- sub r2, r2, r1, lsl #3
- add r2, r2, #2
-
- vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
- ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u)
- vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
-
- veor q7, q10, q0 ; *oq0 = u^0x80
- veor q5, q13, q0 ; *op1 = u^0x80
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q8, q12, q0 ; *oq1 = u^0x80
-
- vswp d12, d11
- vswp d16, d13
- vswp d14, d12
- vswp d16, d15
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2], r1
-
- bx lr
- ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
-
-;-----------------
- AREA vloopfilteruv_dat, DATA, READWRITE ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_vlfuv_coeff_
- DCD vlfuv_coeff
-vlfuv_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
- DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
-
- END
diff --git a/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm b/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm
deleted file mode 100644
index 2eb695ff0..000000000
--- a/vp8/common/arm/neon/loopfilterverticaledge_y_neon.asm
+++ /dev/null
@@ -1,236 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_vertical_edge_y_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *s,
-; r1 int p, //pitch
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; stack(r4) const signed char *thresh,
-; //stack(r5) int count --unused
-
-|vp8_loop_filter_vertical_edge_y_neon| PROC
- sub r0, r0, #4 ; move src pointer down by 4 columns
- ldr r12, [sp, #0] ; load thresh pointer
-
- vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
- vld1.s8 {d0[], d1[]}, [r2] ; flimit
- vld1.u8 {d8}, [r0], r1
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- vld1.u8 {d10}, [r0], r1
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
- vld1.u8 {d12}, [r0], r1
- ldr r12, _vlfy_coeff_
- vld1.u8 {d14}, [r0], r1
- vld1.u8 {d16}, [r0], r1
- vld1.u8 {d18}, [r0], r1
- vld1.u8 {d20}, [r0], r1
-
- vld1.u8 {d7}, [r0], r1 ; load second 8-line src data
- vld1.u8 {d9}, [r0], r1
- vld1.u8 {d11}, [r0], r1
- vld1.u8 {d13}, [r0], r1
- vld1.u8 {d15}, [r0], r1
- vld1.u8 {d17}, [r0], r1
- vld1.u8 {d19}, [r0], r1
- vld1.u8 {d21}, [r0], r1
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- ;vp8_filter_mask() function
- ;vp8_hevmask() function
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q4, q10, q9 ; abs(q3 - q2)
- vabd.u8 q9, q6, q7 ; abs(p0 - q0)
-
- vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
- vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
- vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
- vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
-
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
-
- vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
- vcge.u8 q4, q1, q4 ; (abs(q3 - q2) > limit)*-1
- vadd.u8 q0, q0, q0 ; flimit * 2
- vadd.u8 q0, q0, q1 ; flimit * 2 + limit
-
- vand q15, q15, q12
- vand q10, q10, q11
- vand q3, q3, q4
-
- vabd.u8 q2, q5, q8 ; abs(p1 - q1)
- vqadd.u8 q9, q9, q9 ; abs(p0 - q0) * 2
- vshr.u8 q2, q2, #1 ; abs(p1 - q1) / 2
- vqadd.u8 q9, q9, q2 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
- vcge.u8 q9, q0, q9 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
-
- vld1.u8 {q0}, [r12]!
-
- vand q15, q15, q10
-
-
- ;vp8_filter() function
- veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
- veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
- veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
- veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
-;;;;;;;;;;;;;;
- vld1.u8 {q10}, [r12]!
-
- ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q11, d15, d13
-
- vand q3, q3, q9
- vmovl.u8 q4, d20
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
- vorr q14, q13, q14 ; q14: vp8_hevmask
-
- ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0)
- vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
- vmul.i16 q11, q11, q4
-
- vand q1, q1, q14 ; vp8_filter &= hev
- vand q15, q15, q3 ; q15: vp8_filter_mask
- ;;
- ;vld1.u8 {q4}, [r12]! ;no need 7 any more
-
- ;vqadd.s8 q1, q1, q2
- vaddw.s8 q2, q2, d2
- vaddw.s8 q11, q11, d3
-
- vld1.u8 {q9}, [r12]!
- ;
- vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d3, q11
- ;;
-
- vand q1, q1, q15 ; vp8_filter &= mask
- ;;
-;;;;;;;;;;;;
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
-; vand q2, q1, q4 ; s = vp8_filter & 7
-; vqadd.s8 q1, q1, q9 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
- ;;;;
-; vshr.s8 q1, q1, #3 ; vp8_filter >>= 3
-; vceq.i8 q2, q2, q9 ; s = (s==4)*-1
- ;;
-; ;calculate output
-; vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
-; vqadd.s8 q11, q2, q1 ; u = vp8_signed_char_clamp(s + vp8_filter)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; q10=3
- vqadd.s8 q2, q1, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
- vqadd.s8 q1, q1, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q1, q1, #3 ; Filter1 >>= 3
- ;calculate output
- vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
- vqsub.s8 q10, q7, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- vrshr.s8 q1, q1, #1 ;round/shift: vp8_filter += 1; vp8_filter >>= 1
-
- sub r0, r0, r1, lsl #4
- add r0, r0, #2
- ;
-
- vbic q1, q1, q14 ; vp8_filter &= ~hev
- add r2, r0, r1
- ;
-
- vqadd.s8 q13, q5, q1 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
- ;vqadd.s8 q11, q6, q11 ; u = vp8_signed_char_clamp(ps0 + u)
- vqsub.s8 q12, q8, q1 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
-
- veor q7, q10, q0 ; *oq0 = u^0x80
- veor q5, q13, q0 ; *op1 = u^0x80
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q8, q12, q0 ; *oq1 = u^0x80
- add r3, r2, r1
- ;
- vswp d12, d11
- vswp d16, d13
- add r12, r3, r1
- vswp d14, d12
- vswp d16, d15
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0]
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r2]
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r3]
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
- add r0, r12, r1
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r12]
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
- add r2, r0, r1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0]
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r2], r1
- add r3, r2, r1
- vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2]
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r3], r1
- add r12, r3, r1
- vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r3]
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
- add r0, r12, r1
- vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r12]
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1
- add r2, r0, r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
-
- bx lr
- ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
-
-;-----------------
- AREA vloopfiltery_dat, DATA, READWRITE ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_vlfy_coeff_
- DCD vlfy_coeff
-vlfy_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
- DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
-
- END
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm
new file mode 100644
index 000000000..255dd5619
--- /dev/null
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -0,0 +1,519 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
+ EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
+ EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
+ EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; flimit, limit, and thresh should be positive numbers.
+; All 16 elements in these variables are equal.
+
+; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
+; const signed char *flimit,
+; const signed char *limit,
+; const signed char *thresh,
+; int count)
+; r0 unsigned char *src,
+; r1 int pitch,
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; sp const signed char *thresh,
+; sp+4 int count (unused)
+|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
+ stmdb sp!, {lr}
+ sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines
+ ldr r12, [sp, #4] ; load thresh pointer
+
+ vld1.u8 {q3}, [r0], r1 ; p3
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ vld1.u8 {q4}, [r0], r1 ; p2
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+ vld1.u8 {q5}, [r0], r1 ; p1
+ vld1.u8 {q6}, [r0], r1 ; p0
+ vld1.u8 {q7}, [r0], r1 ; q0
+ vld1.u8 {q8}, [r0], r1 ; q1
+ vld1.u8 {q9}, [r0], r1 ; q2
+ vld1.u8 {q10}, [r0], r1 ; q3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1
+ add r2, r0, r1
+ add r3, r2, r1
+
+ vst1.u8 {q4}, [r0] ; store op2
+ vst1.u8 {q5}, [r2] ; store op1
+ vst1.u8 {q6}, [r3], r1 ; store op0
+ add r12, r3, r1
+ vst1.u8 {q7}, [r3] ; store oq0
+ vst1.u8 {q8}, [r12], r1 ; store oq1
+ vst1.u8 {q9}, [r12] ; store oq2
+
+ ldmia sp!, {pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
+
+; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
+; const signed char *flimit,
+; const signed char *limit,
+; const signed char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; sp const signed char *thresh,
+; sp+4 unsigned char *v
+|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
+ stmdb sp!, {lr}
+ sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ ldr r3, [sp, #8] ; load v ptr
+ ldr r12, [sp, #4] ; load thresh pointer
+ sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines
+
+ vld1.u8 {d6}, [r0], r1 ; p3
+ vld1.u8 {d7}, [r3], r1 ; p3
+ vld1.u8 {d8}, [r0], r1 ; p2
+ vld1.u8 {d9}, [r3], r1 ; p2
+ vld1.u8 {d10}, [r0], r1 ; p1
+ vld1.u8 {d11}, [r3], r1 ; p1
+ vld1.u8 {d12}, [r0], r1 ; p0
+ vld1.u8 {d13}, [r3], r1 ; p0
+ vld1.u8 {d14}, [r0], r1 ; q0
+ vld1.u8 {d15}, [r3], r1 ; q0
+ vld1.u8 {d16}, [r0], r1 ; q1
+ vld1.u8 {d17}, [r3], r1 ; q1
+ vld1.u8 {d18}, [r0], r1 ; q2
+ vld1.u8 {d19}, [r3], r1 ; q2
+ vld1.u8 {d20}, [r0], r1 ; q3
+ vld1.u8 {d21}, [r3], r1 ; q3
+
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+
+ bl vp8_mbloop_filter_neon
+
+ sub r0, r0, r1, lsl #3
+ sub r3, r3, r1, lsl #3
+
+ add r0, r0, r1
+ add r3, r3, r1
+
+ vst1.u8 {d8}, [r0], r1 ; store u op2
+ vst1.u8 {d9}, [r3], r1 ; store v op2
+ vst1.u8 {d10}, [r0], r1 ; store u op1
+ vst1.u8 {d11}, [r3], r1 ; store v op1
+ vst1.u8 {d12}, [r0], r1 ; store u op0
+ vst1.u8 {d13}, [r3], r1 ; store v op0
+ vst1.u8 {d14}, [r0], r1 ; store u oq0
+ vst1.u8 {d15}, [r3], r1 ; store v oq0
+ vst1.u8 {d16}, [r0], r1 ; store u oq1
+ vst1.u8 {d17}, [r3], r1 ; store v oq1
+ vst1.u8 {d18}, [r0], r1 ; store u oq2
+ vst1.u8 {d19}, [r3], r1 ; store v oq2
+
+ ldmia sp!, {pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
+
+; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+; const signed char *flimit,
+; const signed char *limit,
+; const signed char *thresh,
+; int count)
+; r0 unsigned char *src,
+; r1 int pitch,
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; sp const signed char *thresh,
+; sp+4 int count (unused)
+|vp8_mbloop_filter_vertical_edge_y_neon| PROC
+ stmdb sp!, {lr}
+ sub r0, r0, #4 ; move src pointer down by 4 columns
+
+ vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
+ ldr r12, [sp, #4] ; load thresh pointer
+ vld1.u8 {d8}, [r0], r1
+ sub sp, sp, #32
+ vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d20}, [r0], r1
+
+ vld1.u8 {d7}, [r0], r1 ; load second 8-line src data
+ vld1.u8 {d9}, [r0], r1
+ vld1.u8 {d11}, [r0], r1
+ vld1.u8 {d13}, [r0], r1
+ vld1.u8 {d15}, [r0], r1
+ vld1.u8 {d17}, [r0], r1
+ vld1.u8 {d19}, [r0], r1
+ vld1.u8 {d21}, [r0], r1
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ mov r12, sp
+ vst1.u8 {q3}, [r12]!
+ vst1.u8 {q10}, [r12]!
+
+ bl vp8_mbloop_filter_neon
+
+ sub r0, r0, r1, lsl #4
+
+ add r2, r0, r1
+
+ add r3, r2, r1
+
+ vld1.u8 {q3}, [sp]!
+ vld1.u8 {q10}, [sp]!
+
+ ;transpose to 16x8 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+ add r12, r3, r1
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ ;store op2, op1, op0, oq0, oq1, oq2
+ vst1.8 {d6}, [r0]
+ vst1.8 {d8}, [r2]
+ vst1.8 {d10}, [r3]
+ vst1.8 {d12}, [r12], r1
+ add r0, r12, r1
+ vst1.8 {d14}, [r12]
+ vst1.8 {d16}, [r0], r1
+ add r2, r0, r1
+ vst1.8 {d18}, [r0]
+ vst1.8 {d20}, [r2], r1
+ add r3, r2, r1
+ vst1.8 {d7}, [r2]
+ vst1.8 {d9}, [r3], r1
+ add r12, r3, r1
+ vst1.8 {d11}, [r3]
+ vst1.8 {d13}, [r12], r1
+ add r0, r12, r1
+ vst1.8 {d15}, [r12]
+ vst1.8 {d17}, [r0], r1
+ add r2, r0, r1
+ vst1.8 {d19}, [r0]
+ vst1.8 {d21}, [r2]
+
+ ldmia sp!, {pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+; const signed char *flimit,
+; const signed char *limit,
+; const signed char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 const signed char *flimit,
+; r3 const signed char *limit,
+; sp const signed char *thresh,
+; sp+4 unsigned char *v
+|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
+ stmdb sp!, {lr}
+ sub r0, r0, #4 ; move src pointer down by 4 columns
+ vld1.s8 {d2[], d3[]}, [r3] ; limit
+ ldr r3, [sp, #8] ; load v ptr
+ ldr r12, [sp, #4] ; load thresh pointer
+
+ sub r3, r3, #4 ; move v pointer down by 4 columns
+
+ vld1.u8 {d6}, [r0], r1 ;load u data
+ vld1.u8 {d7}, [r3], r1 ;load v data
+ vld1.u8 {d8}, [r0], r1
+ vld1.u8 {d9}, [r3], r1
+ vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d11}, [r3], r1
+ vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d13}, [r3], r1
+ vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d15}, [r3], r1
+ vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d17}, [r3], r1
+ vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d19}, [r3], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r3], r1
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ sub sp, sp, #32
+ vld1.s8 {d4[], d5[]}, [r12] ; thresh
+ mov r12, sp
+ vst1.u8 {q3}, [r12]!
+ vst1.u8 {q10}, [r12]!
+
+ bl vp8_mbloop_filter_neon
+
+ sub r0, r0, r1, lsl #3
+ sub r3, r3, r1, lsl #3
+
+ vld1.u8 {q3}, [sp]!
+ vld1.u8 {q10}, [sp]!
+
+ ;transpose to 16x8 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ ;store op2, op1, op0, oq0, oq1, oq2
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r3], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d9}, [r3], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d11}, [r3], r1
+ vst1.8 {d12}, [r0], r1
+ vst1.8 {d13}, [r3], r1
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d15}, [r3], r1
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r3], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d19}, [r3], r1
+ vst1.8 {d20}, [r0], r1
+ vst1.8 {d21}, [r3], r1
+
+ ldmia sp!, {pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
+
+; void vp8_mbloop_filter_neon()
+; This is a helper function for the macroblock loopfilters. The individual
+; functions do the necessary load, transpose (if necessary), preserve (if
+; necessary) and store.
+
+; TODO:
+; The vertical filter writes p3/q3 back out because two 4 element writes are
+; much simpler than ordering and writing two 3 element sets (or three 2 elements
+; sets, or whichever other combinations are possible).
+; If we can preserve q3 and q10, the vertical filter will be able to avoid
+; storing those values on the stack and reading them back after the filter.
+
+; r0,r1 PRESERVE
+; r2 flimit
+; r3 PRESERVE
+; q1 limit
+; q2 thresh
+; q3 p3
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3
+
+|vp8_mbloop_filter_neon| PROC
+ ldr r12, _mblf_coeff_
+
+ ; vp8_filter_mask
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q0, q10, q9 ; abs(q3 - q2)
+
+ vmax.u8 q11, q11, q12
+ vmax.u8 q12, q13, q14
+ vmax.u8 q3, q3, q0
+ vmax.u8 q15, q11, q12
+
+ vabd.u8 q12, q6, q7 ; abs(p0 - q0)
+
+ ; vp8_hevmask
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
+ vmax.u8 q15, q15, q3
+
+ vld1.s8 {d4[], d5[]}, [r2] ; flimit
+
+ vld1.u8 {q0}, [r12]!
+
+ vadd.u8 q2, q2, q2 ; flimit * 2
+ vadd.u8 q2, q2, q1 ; flimit * 2 + limit
+ vcge.u8 q15, q1, q15
+
+ vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
+ vshr.u8 q1, q1, #1 ; a = a / 2
+ vqadd.u8 q12, q12, q1 ; a = b + a
+ vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
+
+ ; vp8_filter
+ ; convert to signed
+ veor q7, q7, q0 ; qs0
+ veor q6, q6, q0 ; ps0
+ veor q5, q5, q0 ; ps1
+ veor q8, q8, q0 ; qs1
+ veor q4, q4, q0 ; ps2
+ veor q9, q9, q0 ; qs2
+
+ vorr q14, q13, q14 ; vp8_hevmask
+
+ vsubl.s8 q2, d14, d12 ; qs0 - ps0
+ vsubl.s8 q13, d15, d13
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
+
+ vadd.s16 q10, q2, q2 ; 3 * (qs0 - ps0)
+ vadd.s16 q11, q13, q13
+ vand q15, q15, q12 ; vp8_filter_mask
+
+ vadd.s16 q2, q2, q10
+ vadd.s16 q13, q13, q11
+
+ vld1.u8 {q12}, [r12]! ; #3
+
+ vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d3
+
+ vld1.u8 {q11}, [r12]! ; #4
+
+ ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q13
+
+ vand q1, q1, q15 ; vp8_filter &= mask
+
+ vld1.u8 {q15}, [r12]! ; #63
+ ;
+ vand q13, q1, q14 ; Filter2 &= hev
+
+ vld1.u8 {d7}, [r12]! ; #9
+
+ vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
+ vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
+
+ vld1.u8 {d6}, [r12]! ; #18
+
+ vshr.s8 q2, q2, #3 ; Filter1 >>= 3
+ vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+
+ vmov q10, q15
+ vmov q12, q15
+
+ vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
+
+ vld1.u8 {d5}, [r12]! ; #27
+
+ vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
+
+ vbic q1, q1, q14 ; vp8_filter &= ~hev
+
+ ; roughly 1/7th difference across boundary
+ ; roughly 2/7th difference across boundary
+ ; roughly 3/7th difference across boundary
+ vmov q11, q15
+ vmov q13, q15
+ vmov q14, q15
+
+ vmlal.s8 q10, d2, d7 ; Filter2 * 9
+ vmlal.s8 q11, d3, d7
+ vmlal.s8 q12, d2, d6 ; Filter2 * 18
+ vmlal.s8 q13, d3, d6
+ vmlal.s8 q14, d2, d5 ; Filter2 * 27
+ vmlal.s8 q15, d3, d5
+ vqshrn.s16 d20, q10, #7 ; u = clamp((63 + Filter2 * 9)>>7)
+ vqshrn.s16 d21, q11, #7
+ vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
+ vqshrn.s16 d25, q13, #7
+ vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
+ vqshrn.s16 d29, q15, #7
+
+ vqsub.s8 q11, q9, q10 ; s = clamp(qs2 - u)
+ vqadd.s8 q10, q4, q10 ; s = clamp(ps2 + u)
+ vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
+ vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
+ vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
+ vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
+ veor q9, q11, q0 ; *oq2 = s^0x80
+ veor q4, q10, q0 ; *op2 = s^0x80
+ veor q8, q13, q0 ; *oq1 = s^0x80
+ veor q5, q12, q0 ; *op2 = s^0x80
+ veor q7, q15, q0 ; *oq0 = s^0x80
+ veor q6, q14, q0 ; *op0 = s^0x80
+
+ bx lr
+ ENDP ; |vp8_mbloop_filter_neon|
+
+ AREA mbloopfilter_dat, DATA, READONLY
+_mblf_coeff_
+ DCD mblf_coeff
+mblf_coeff
+ DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
+ DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
+ DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
+ DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
+ DCD 0x1b1b1b1b, 0x1b1b1b1b
+
+ END
diff --git a/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm b/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm
deleted file mode 100644
index 4576a6a8f..000000000
--- a/vp8/common/arm/neon/mbloopfilterhorizontaledge_uv_neon.asm
+++ /dev/null
@@ -1,258 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *u,
-; r1 int p, //pitch
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; stack(r4) const signed char *thresh,
-; stack(r5) unsigned char *v
-|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
- sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- ldr r3, [sp, #4] ; load v ptr
- ldr r12, [sp, #0] ; load thresh pointer
- sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines
-
- vld1.u8 {d6}, [r0], r1 ; p3
- vld1.u8 {d7}, [r3], r1 ; p3
- vld1.u8 {d8}, [r0], r1 ; p2
- vld1.u8 {d9}, [r3], r1 ; p2
- vld1.u8 {d10}, [r0], r1 ; p1
- vld1.u8 {d11}, [r3], r1 ; p1
- vld1.u8 {d12}, [r0], r1 ; p0
- vld1.u8 {d13}, [r3], r1 ; p0
- vld1.u8 {d14}, [r0], r1 ; q0
- vld1.u8 {d15}, [r3], r1 ; q0
- vld1.u8 {d16}, [r0], r1 ; q1
- vld1.u8 {d17}, [r3], r1 ; q1
- vld1.u8 {d18}, [r0], r1 ; q2
- vld1.u8 {d19}, [r3], r1 ; q2
- vld1.u8 {d20}, [r0], r1 ; q3
- vld1.u8 {d21}, [r3], r1 ; q3
-
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
-
- ldr r12, _mbhlfuv_coeff_
-
- ;vp8_filter_mask() function
- ;vp8_hevmask() function
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q0, q10, q9 ; abs(q3 - q2)
-
- vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
- vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
- vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
- vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
- vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
- vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1
-
- vand q15, q15, q12
-
- vabd.u8 q12, q6, q7 ; abs(p0 - q0)
-
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
-
- vld1.s8 {d4[], d5[]}, [r2] ; flimit
-
- vand q10, q10, q11
- vand q3, q3, q0
-
- vld1.u8 {q0}, [r12]!
-
- vadd.u8 q2, q2, q2 ; flimit * 2
- vadd.u8 q2, q2, q1 ; flimit * 2 + limit
-
- vabd.u8 q1, q5, q8 ; abs(p1 - q1)
- vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2
- vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2
- vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
- vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1
-
- vand q15, q15, q10
-
- ;vp8_filter() function
- veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
- veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
- veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
- veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
- veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value
- veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value
-;;;;;;;;;;;;;
- vorr q14, q13, q14 ; q14: vp8_hevmask
-
- ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q13, d15, d13
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
-
- ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0)
- vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
- vadd.s16 q11, q13, q13
-
- vand q3, q3, q12
-
- ;vadd.s8 q2, q2, q10
- vadd.s16 q2, q2, q10
- vadd.s16 q13, q13, q11
-
- vld1.u8 {q12}, [r12]! ;#3
-
- ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q13, q13, d3
-
- vand q15, q15, q3 ; q15: vp8_filter_mask
- vld1.u8 {q11}, [r12]! ;#4
-
- vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d3, q13
-
-;;;;;;;;;;;;;;
- vand q1, q1, q15 ; vp8_filter &= mask
-
- vld1.u8 {q15}, [r12]! ;#63
- ;
- vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev
-
- vld1.u8 {d7}, [r12]! ;#9
- ;
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
-; vand q2, q13, q12 ; s = Filter2 & 7
-
-; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
-; vld1.u8 {d6}, [r12]! ;#18
-
-; sub r0, r0, r1, lsl #3
-; sub r3, r3, r1, lsl #3
-
-; vshr.s8 q13, q13, #3 ; Filter2 >>= 3
-; vceq.i8 q2, q2, q11 ; s = (s==4)*-1
-
-; add r0, r0, r1
-; add r3, r3, r1
-
-; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
-; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2)
-
-; vld1.u8 {d5}, [r12]! ;#27
-; vmov q10, q15
-; vmov q12, q15
-
-; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4)
- vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3)
-
- vld1.u8 {d6}, [r12]! ;#18
-
- sub r0, r0, r1, lsl #3
- sub r3, r3, r1, lsl #3
-
- vshr.s8 q2, q2, #3 ; Filter1 >>= 3
- vshr.s8 q13, q13, #3 ; Filter2 >>= 3
-
- vmov q10, q15
- vmov q12, q15
-
- vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
-
- vld1.u8 {d5}, [r12]! ;#27
-
- add r0, r0, r1
- add r3, r3, r1
-
- vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
-
- ; roughly 1/7th difference across boundary
- ; roughly 2/7th difference across boundary
- ; roughly 3/7th difference across boundary
- vmov q11, q15
- vmov q13, q15
- vmov q14, q15
-
- vmlal.s8 q10, d2, d7 ; Filter2 * 9
- vmlal.s8 q11, d3, d7
- vmlal.s8 q12, d2, d6 ; Filter2 * 18
- vmlal.s8 q13, d3, d6
- vmlal.s8 q14, d2, d5 ; Filter2 * 27
- vmlal.s8 q15, d3, d5
- vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
- vqshrn.s16 d21, q11, #7
- vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
- vqshrn.s16 d25, q13, #7
- vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
- vqshrn.s16 d29, q15, #7
-
- vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u)
- vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u)
- vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u)
- vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u)
- vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u)
- vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u)
- veor q9, q11, q0 ; *oq2 = s^0x80
- veor q4, q10, q0 ; *op2 = s^0x80
- veor q8, q13, q0 ; *oq1 = s^0x80
- veor q5, q12, q0 ; *op2 = s^0x80
- veor q7, q15, q0 ; *oq0 = s^0x80
- veor q6, q14, q0 ; *op0 = s^0x80
-
- vst1.u8 {d8}, [r0], r1 ; store u op2
- vst1.u8 {d9}, [r3], r1 ; store v op2
- vst1.u8 {d10}, [r0], r1 ; store u op1
- vst1.u8 {d11}, [r3], r1 ; store v op1
- vst1.u8 {d12}, [r0], r1 ; store u op0
- vst1.u8 {d13}, [r3], r1 ; store v op0
- vst1.u8 {d14}, [r0], r1 ; store u oq0
- vst1.u8 {d15}, [r3], r1 ; store v oq0
- vst1.u8 {d16}, [r0], r1 ; store u oq1
- vst1.u8 {d17}, [r3], r1 ; store v oq1
- vst1.u8 {d18}, [r0], r1 ; store u oq2
- vst1.u8 {d19}, [r3], r1 ; store v oq2
-
- bx lr
- ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
-
-;-----------------
- AREA mbhloopfilteruv_dat, DATA, READWRITE ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_mbhlfuv_coeff_
- DCD mbhlfuv_coeff
-mbhlfuv_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
- DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
- DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
- DCD 0x1b1b1b1b, 0x1b1b1b1b
-
- END
diff --git a/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm b/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm
deleted file mode 100644
index 8e85caa45..000000000
--- a/vp8/common/arm/neon/mbloopfilterhorizontaledge_y_neon.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *s,
-; r1 int p, //pitch
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; stack(r4) const signed char *thresh,
-; //stack(r5) int count --unused
-|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
- sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines
- ldr r12, [sp, #0] ; load thresh pointer
-
- vld1.u8 {q3}, [r0], r1 ; p3
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- vld1.u8 {q4}, [r0], r1 ; p2
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
- vld1.u8 {q5}, [r0], r1 ; p1
- ldr r12, _mbhlfy_coeff_
- vld1.u8 {q6}, [r0], r1 ; p0
-
- ;vp8_filter_mask() function
- ;vp8_hevmask() function
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vld1.u8 {q7}, [r0], r1 ; q0
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vld1.u8 {q8}, [r0], r1 ; q1
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vld1.u8 {q9}, [r0], r1 ; q2
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vld1.u8 {q10}, [r0], r1 ; q3
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q0, q10, q9 ; abs(q3 - q2)
-
- vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
- vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
- vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
- vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
- vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
- vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1
-
- vand q15, q15, q12
-
- vabd.u8 q12, q6, q7 ; abs(p0 - q0)
-
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
-
- vld1.s8 {d4[], d5[]}, [r2] ; flimit
-
- vand q10, q10, q11
- vand q3, q3, q0
-
- vld1.u8 {q0}, [r12]!
-
- vadd.u8 q2, q2, q2 ; flimit * 2
- vadd.u8 q2, q2, q1 ; flimit * 2 + limit
-
- vabd.u8 q1, q5, q8 ; abs(p1 - q1)
- vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2
- vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2
- vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
- vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1
-
- vand q15, q15, q10
-
- ;vp8_filter() function
- veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
- veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
- veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
- veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
- veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value
- veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value
-;;;;;;;;;;;;;
- vorr q14, q13, q14 ; q14: vp8_hevmask
-
- ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q13, d15, d13
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
-
- ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0)
- vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
- vadd.s16 q11, q13, q13
-
- vand q3, q3, q12
-
- ;vadd.s8 q2, q2, q10
- vadd.s16 q2, q2, q10
- vadd.s16 q13, q13, q11
-
- vld1.u8 {q12}, [r12]! ;#3
-
- ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q13, q13, d3
-
- vand q15, q15, q3 ; q15: vp8_filter_mask
- vld1.u8 {q11}, [r12]! ;#4
-
- vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d3, q13
-
-;;;;;;;;;;;;;;
- vand q1, q1, q15 ; vp8_filter &= mask
-
- vld1.u8 {q15}, [r12]! ;#63
- ;
- vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev
-
- vld1.u8 {d7}, [r12]! ;#9
- sub r0, r0, r1, lsl #3
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
-; vand q2, q13, q12 ; s = Filter2 & 7
-
-; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
-; vld1.u8 {d6}, [r12]! ;#18
-
-; add r0, r0, r1
-; add r2, r0, r1
-
-; vshr.s8 q13, q13, #3 ; Filter2 >>= 3
-; vceq.i8 q2, q2, q11 ; s = (s==4)*-1
-
-; add r3, r2, r1
-
-; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
-; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2)
-
-; vld1.u8 {d5}, [r12]! ;#27
-; vmov q10, q15
-; vmov q12, q15
-
-; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4)
- vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3)
-
- vld1.u8 {d6}, [r12]! ;#18
- add r0, r0, r1
- add r2, r0, r1
-
- vshr.s8 q2, q2, #3 ; Filter1 >>= 3
- vshr.s8 q13, q13, #3 ; Filter2 >>= 3
-
- vmov q10, q15
- vmov q12, q15
-
- vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
-
- vld1.u8 {d5}, [r12]! ;#27
- add r3, r2, r1
-
- vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
-
- ; roughly 1/7th difference across boundary
- ; roughly 2/7th difference across boundary
- ; roughly 3/7th difference across boundary
- vmov q11, q15
- vmov q13, q15
- vmov q14, q15
-
- vmlal.s8 q10, d2, d7 ; Filter2 * 9
- vmlal.s8 q11, d3, d7
- vmlal.s8 q12, d2, d6 ; Filter2 * 18
- vmlal.s8 q13, d3, d6
- vmlal.s8 q14, d2, d5 ; Filter2 * 27
- vmlal.s8 q15, d3, d5
- vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
- vqshrn.s16 d21, q11, #7
- vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
- vqshrn.s16 d25, q13, #7
- vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
- vqshrn.s16 d29, q15, #7
-
- vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u)
- vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u)
- vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u)
- vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u)
- vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u)
- vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u)
- veor q9, q11, q0 ; *oq2 = s^0x80
- veor q4, q10, q0 ; *op2 = s^0x80
- veor q5, q12, q0 ; *op2 = s^0x80
- veor q6, q14, q0 ; *op0 = s^0x80
- veor q8, q13, q0 ; *oq1 = s^0x80
- veor q7, q15, q0 ; *oq0 = s^0x80
-
- vst1.u8 {q4}, [r0] ; store op2
- vst1.u8 {q5}, [r2] ; store op1
- vst1.u8 {q6}, [r3], r1 ; store op0
- add r12, r3, r1
- vst1.u8 {q7}, [r3] ; store oq0
- vst1.u8 {q8}, [r12], r1 ; store oq1
- vst1.u8 {q9}, [r12] ; store oq2
-
- bx lr
- ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
-
-;-----------------
- AREA mbhloopfiltery_dat, DATA, READWRITE ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_mbhlfy_coeff_
- DCD mbhlfy_coeff
-mbhlfy_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
- DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
- DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
- DCD 0x1b1b1b1b, 0x1b1b1b1b
-
- END
diff --git a/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm b/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm
deleted file mode 100644
index d9dbdcfe5..000000000
--- a/vp8/common/arm/neon/mbloopfilterverticaledge_uv_neon.asm
+++ /dev/null
@@ -1,297 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *u,
-; r1 int p, //pitch
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; stack(r4) const signed char *thresh,
-; stack(r5) unsigned char *v
-|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
- sub r0, r0, #4 ; move src pointer down by 4 columns
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- ldr r3, [sp, #4] ; load v ptr
- ldr r12, [sp, #0] ; load thresh pointer
-
- sub r3, r3, #4 ; move v pointer down by 4 columns
-
- vld1.u8 {d6}, [r0], r1 ;load u data
- vld1.u8 {d7}, [r3], r1 ;load v data
- vld1.u8 {d8}, [r0], r1
- vld1.u8 {d9}, [r3], r1
- vld1.u8 {d10}, [r0], r1
- vld1.u8 {d11}, [r3], r1
- vld1.u8 {d12}, [r0], r1
- vld1.u8 {d13}, [r3], r1
- vld1.u8 {d14}, [r0], r1
- vld1.u8 {d15}, [r3], r1
- vld1.u8 {d16}, [r0], r1
- vld1.u8 {d17}, [r3], r1
- vld1.u8 {d18}, [r0], r1
- vld1.u8 {d19}, [r3], r1
- vld1.u8 {d20}, [r0], r1
- vld1.u8 {d21}, [r3], r1
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- sub sp, sp, #32
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
- vst1.u8 {q3}, [sp]!
- ldr r12, _mbvlfuv_coeff_
- vst1.u8 {q10}, [sp]!
-
- ;vp8_filter_mask() function
- ;vp8_hevmask() function
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q0, q10, q9 ; abs(q3 - q2)
-
- vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
- vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
- vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
- vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
- vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
- vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1
-
- vand q15, q15, q12
-
- vabd.u8 q12, q6, q7 ; abs(p0 - q0)
-
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
-
- vld1.s8 {d4[], d5[]}, [r2] ; flimit
-
- vand q10, q10, q11
- vand q3, q3, q0
-
- vld1.u8 {q0}, [r12]!
-
- vadd.u8 q2, q2, q2 ; flimit * 2
- vadd.u8 q2, q2, q1 ; flimit * 2 + limit
-
- vabd.u8 q1, q5, q8 ; abs(p1 - q1)
- vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2
- vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2
- vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
- vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1
-
- vand q15, q15, q10
-
- ;vp8_filter() function
- veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
- veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
- veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
- veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
- veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value
- veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value
-;;;;;;;;;;;;;
- vorr q14, q13, q14 ; q14: vp8_hevmask
-
- ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q13, d15, d13
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
-
- ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0)
- vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
- vadd.s16 q11, q13, q13
-
- vand q3, q3, q12
-
- ;vadd.s8 q2, q2, q10
- vadd.s16 q2, q2, q10
- vadd.s16 q13, q13, q11
-
- vld1.u8 {q12}, [r12]! ;#3
-
- ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q13, q13, d3
-
- vand q15, q15, q3 ; q15: vp8_filter_mask
- vld1.u8 {q11}, [r12]! ;#4
-
- vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d3, q13
-
-;;;;;;;;;;;;;;
- vand q1, q1, q15 ; vp8_filter &= mask
-
- vld1.u8 {q15}, [r12]! ;#63
- ;
- vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev
-
- vld1.u8 {d7}, [r12]! ;#9
- ;
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
-; vand q2, q13, q12 ; s = Filter2 & 7
-
-; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
-; vld1.u8 {d6}, [r12]! ;#18
-
-; sub r0, r0, r1, lsl #3
-; sub r3, r3, r1, lsl #3
-; sub sp, sp, #32
-
-; vshr.s8 q13, q13, #3 ; Filter2 >>= 3
-; vceq.i8 q2, q2, q11 ; s = (s==4)*-1
-
-; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
-; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2)
-
-; vld1.u8 {d5}, [r12]! ;#27
-; vmov q10, q15
-; vmov q12, q15
-
-; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4)
- vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3)
-
- vld1.u8 {d6}, [r12]! ;#18
-
- sub r0, r0, r1, lsl #3
- sub r3, r3, r1, lsl #3
-
- vshr.s8 q2, q2, #3 ; Filter1 >>= 3
- vshr.s8 q13, q13, #3 ; Filter2 >>= 3
-
- vmov q10, q15
- vmov q12, q15
-
- vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
-
- vld1.u8 {d5}, [r12]! ;#27
-
- sub sp, sp, #32
-
- vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
-
- ; roughly 1/7th difference across boundary
- ; roughly 2/7th difference across boundary
- ; roughly 3/7th difference across boundary
- vmov q11, q15
- vmov q13, q15
- vmov q14, q15
-
- vmlal.s8 q10, d2, d7 ; Filter2 * 9
- vmlal.s8 q11, d3, d7
- vmlal.s8 q12, d2, d6 ; Filter2 * 18
- vmlal.s8 q13, d3, d6
- vmlal.s8 q14, d2, d5 ; Filter2 * 27
- vmlal.s8 q15, d3, d5
- vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
- vqshrn.s16 d21, q11, #7
- vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
- vqshrn.s16 d25, q13, #7
- vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
- vqshrn.s16 d29, q15, #7
-
- vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u)
- vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u)
- vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u)
- vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u)
- vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u)
- vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u)
- veor q9, q11, q0 ; *oq2 = s^0x80
- veor q4, q10, q0 ; *op2 = s^0x80
- veor q8, q13, q0 ; *oq1 = s^0x80
- veor q5, q12, q0 ; *op2 = s^0x80
- veor q7, q15, q0 ; *oq0 = s^0x80
- vld1.u8 {q3}, [sp]!
- veor q6, q14, q0 ; *op0 = s^0x80
- vld1.u8 {q10}, [sp]!
-
- ;transpose to 16x8 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- ;store op2, op1, op0, oq0, oq1, oq2
- vst1.8 {d6}, [r0], r1
- vst1.8 {d7}, [r3], r1
- vst1.8 {d8}, [r0], r1
- vst1.8 {d9}, [r3], r1
- vst1.8 {d10}, [r0], r1
- vst1.8 {d11}, [r3], r1
- vst1.8 {d12}, [r0], r1
- vst1.8 {d13}, [r3], r1
- vst1.8 {d14}, [r0], r1
- vst1.8 {d15}, [r3], r1
- vst1.8 {d16}, [r0], r1
- vst1.8 {d17}, [r3], r1
- vst1.8 {d18}, [r0], r1
- vst1.8 {d19}, [r3], r1
- vst1.8 {d20}, [r0], r1
- vst1.8 {d21}, [r3], r1
-
- bx lr
- ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
-
-;-----------------
- AREA mbvloopfilteruv_dat, DATA, READWRITE ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_mbvlfuv_coeff_
- DCD mbvlfuv_coeff
-mbvlfuv_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
- DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
- DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
- DCD 0x1b1b1b1b, 0x1b1b1b1b
-
- END
diff --git a/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm b/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm
deleted file mode 100644
index bdffc62ee..000000000
--- a/vp8/common/arm/neon/mbloopfilterverticaledge_y_neon.asm
+++ /dev/null
@@ -1,304 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0 unsigned char *s,
-; r1 int p, //pitch
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; stack(r4) const signed char *thresh,
-; //stack(r5) int count --unused
-|vp8_mbloop_filter_vertical_edge_y_neon| PROC
- sub r0, r0, #4 ; move src pointer down by 4 columns
-
- vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
- ldr r12, [sp, #0] ; load thresh pointer
- vld1.u8 {d8}, [r0], r1
- sub sp, sp, #32
- vld1.u8 {d10}, [r0], r1
- vld1.u8 {d12}, [r0], r1
- vld1.u8 {d14}, [r0], r1
- vld1.u8 {d16}, [r0], r1
- vld1.u8 {d18}, [r0], r1
- vld1.u8 {d20}, [r0], r1
-
- vld1.u8 {d7}, [r0], r1 ; load second 8-line src data
- vld1.u8 {d9}, [r0], r1
- vld1.u8 {d11}, [r0], r1
- vld1.u8 {d13}, [r0], r1
- vld1.u8 {d15}, [r0], r1
- vld1.u8 {d17}, [r0], r1
- vld1.u8 {d19}, [r0], r1
- vld1.u8 {d21}, [r0], r1
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- vld1.s8 {d2[], d3[]}, [r3] ; limit
- vst1.u8 {q3}, [sp]!
- vld1.s8 {d4[], d5[]}, [r12] ; thresh
- ldr r12, _mbvlfy_coeff_
- vst1.u8 {q10}, [sp]!
-
- ;vp8_filter_mask() function
- ;vp8_hevmask() function
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q0, q10, q9 ; abs(q3 - q2)
-
- vcge.u8 q15, q1, q11 ; (abs(p3 - p2) > limit)*-1
- vcge.u8 q12, q1, q12 ; (abs(p2 - p1) > limit)*-1
- vcge.u8 q10, q1, q13 ; (abs(p1 - p0) > limit)*-1
- vcge.u8 q11, q1, q14 ; (abs(q1 - q0) > limit)*-1
- vcge.u8 q3, q1, q3 ; (abs(q2 - q1) > limit)*-1
- vcge.u8 q0, q1, q0 ; (abs(q3 - q2) > limit)*-1
-
- vand q15, q15, q12
-
- vabd.u8 q12, q6, q7 ; abs(p0 - q0)
-
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
-
- vld1.s8 {d4[], d5[]}, [r2] ; flimit
-
- vand q10, q10, q11
- vand q3, q3, q0
-
- vld1.u8 {q0}, [r12]!
-
- vadd.u8 q2, q2, q2 ; flimit * 2
- vadd.u8 q2, q2, q1 ; flimit * 2 + limit
-
- vabd.u8 q1, q5, q8 ; abs(p1 - q1)
- vqadd.u8 q12, q12, q12 ; abs(p0 - q0) * 2
- vshr.u8 q1, q1, #1 ; abs(p1 - q1) / 2
- vqadd.u8 q12, q12, q1 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
- vcge.u8 q12, q2, q12 ; (abs(p0 - q0)*2 + abs(p1 - q1)/2 > flimit*2 + limit)*-1
-
- vand q15, q15, q10
-
- ;vp8_filter() function
- veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
- veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
- veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
- veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
- veor q4, q4, q0 ; ps2: p2 offset to convert to a signed value
- veor q9, q9, q0 ; qs2: q2 offset to convert to a signed value
-;;;;;;;;;;;;;
- vorr q14, q13, q14 ; q14: vp8_hevmask
-
- ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q13, d15, d13
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
-
- ;vadd.s8 q10, q2, q2 ; 3 * ( qs0 - ps0)
- vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
- vadd.s16 q11, q13, q13
-
- vand q3, q3, q12
-
- ;vadd.s8 q2, q2, q10
- vadd.s16 q2, q2, q10
- vadd.s16 q13, q13, q11
-
- vld1.u8 {q12}, [r12]! ;#3
-
- ;vqadd.s8 q1, q1, q2 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q13, q13, d3
-
- vand q15, q15, q3 ; q15: vp8_filter_mask
- vld1.u8 {q11}, [r12]! ;#4
-
- vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d3, q13
-
-;;;;;;;;;;;;;;
- vand q1, q1, q15 ; vp8_filter &= mask
-
- vld1.u8 {q15}, [r12]! ;#63
- ;
- vand q13, q1, q14 ; Filter2: q13; Filter2 &= hev
-
- vld1.u8 {d7}, [r12]! ;#9
- ;
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Change for VP8 from VP7
-; vand q2, q13, q12 ; s = Filter2 & 7
-
-; vqadd.s8 q13, q13, q11 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
-; vld1.u8 {d6}, [r12]! ;#18
-
-; sub r0, r0, r1, lsl #4
-; sub sp, sp, #32
-; add r2, r0, r1
-
-; vshr.s8 q13, q13, #3 ; Filter2 >>= 3
-; vceq.i8 q2, q2, q11 ; s = (s==4)*-1
-
-; add r3, r2, r1
-
-; vqsub.s8 q7, q7, q13 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
-; vqadd.s8 q11, q2, q13 ; u = vp8_signed_char_clamp(s + Filter2)
-
-; vld1.u8 {d5}, [r12]! ;#27
-; vmov q10, q15
-; vmov q12, q15
-
-; vqadd.s8 q6, q6, q11 ; ps0 = vp8_signed_char_clamp(ps0 + u)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- vqadd.s8 q2, q13, q11 ; Filter1 = vp8_signed_char_clamp(Filter2+4)
- vqadd.s8 q13, q13, q12 ; Filter2 = vp8_signed_char_clamp(Filter2+3)
-
- vld1.u8 {d6}, [r12]! ;#18
- sub r0, r0, r1, lsl #4
- sub sp, sp, #32
-
- add r2, r0, r1
-
- vshr.s8 q2, q2, #3 ; Filter1 >>= 3
- vshr.s8 q13, q13, #3 ; Filter2 >>= 3
-
- vmov q10, q15
- vmov q12, q15
-
- vqsub.s8 q7, q7, q2 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
-
- vld1.u8 {d5}, [r12]! ;#27
- add r3, r2, r1
-
- vqadd.s8 q6, q6, q13 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- vbic q1, q1, q14 ; Filter2: q1; vp8_filter &= ~hev; Filter2 = vp8_filter
-
- ; roughly 1/7th difference across boundary
- ; roughly 2/7th difference across boundary
- ; roughly 3/7th difference across boundary
- vmov q11, q15
- vmov q13, q15
- vmov q14, q15
-
- vmlal.s8 q10, d2, d7 ; Filter2 * 9
- vmlal.s8 q11, d3, d7
- vmlal.s8 q12, d2, d6 ; Filter2 * 18
- vmlal.s8 q13, d3, d6
- vmlal.s8 q14, d2, d5 ; Filter2 * 27
- vmlal.s8 q15, d3, d5
- vqshrn.s16 d20, q10, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
- vqshrn.s16 d21, q11, #7
- vqshrn.s16 d24, q12, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
- vqshrn.s16 d25, q13, #7
- vqshrn.s16 d28, q14, #7 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
- vqshrn.s16 d29, q15, #7
-
- vqsub.s8 q11, q9, q10 ; s = vp8_signed_char_clamp(qs2 - u)
- vqadd.s8 q10, q4, q10 ; s = vp8_signed_char_clamp(ps2 + u)
- vqsub.s8 q13, q8, q12 ; s = vp8_signed_char_clamp(qs1 - u)
- vqadd.s8 q12, q5, q12 ; s = vp8_signed_char_clamp(ps1 + u)
- vqsub.s8 q15, q7, q14 ; s = vp8_signed_char_clamp(qs0 - u)
- vqadd.s8 q14, q6, q14 ; s = vp8_signed_char_clamp(ps0 + u)
- veor q9, q11, q0 ; *oq2 = s^0x80
- veor q4, q10, q0 ; *op2 = s^0x80
- veor q8, q13, q0 ; *oq1 = s^0x80
- veor q5, q12, q0 ; *op2 = s^0x80
- veor q7, q15, q0 ; *oq0 = s^0x80
- vld1.u8 {q3}, [sp]!
- veor q6, q14, q0 ; *op0 = s^0x80
- vld1.u8 {q10}, [sp]!
-
- ;transpose to 16x8 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
- add r12, r3, r1
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- ;store op2, op1, op0, oq0, oq1, oq2
- vst1.8 {d6}, [r0]
- vst1.8 {d8}, [r2]
- vst1.8 {d10}, [r3]
- vst1.8 {d12}, [r12], r1
- add r0, r12, r1
- vst1.8 {d14}, [r12]
- vst1.8 {d16}, [r0], r1
- add r2, r0, r1
- vst1.8 {d18}, [r0]
- vst1.8 {d20}, [r2], r1
- add r3, r2, r1
- vst1.8 {d7}, [r2]
- vst1.8 {d9}, [r3], r1
- add r12, r3, r1
- vst1.8 {d11}, [r3]
- vst1.8 {d13}, [r12], r1
- add r0, r12, r1
- vst1.8 {d15}, [r12]
- vst1.8 {d17}, [r0], r1
- add r2, r0, r1
- vst1.8 {d19}, [r0]
- vst1.8 {d21}, [r2]
-
- bx lr
- ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
-
-;-----------------
- AREA mbvloopfiltery_dat, DATA, READWRITE ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_mbvlfy_coeff_
- DCD mbvlfy_coeff
-mbvlfy_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
- DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
- DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
- DCD 0x1b1b1b1b, 0x1b1b1b1b
-
- END
diff --git a/vp8/common/arm/neon/recon_neon.c b/vp8/common/arm/neon/recon_neon.c
new file mode 100644
index 000000000..f7930ee5f
--- /dev/null
+++ b/vp8/common/arm/neon/recon_neon.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "blockd.h"
+
+extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
+
+void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+ unsigned char *pred_ptr = &x->predictor[0];
+ short *diff_ptr = &x->diff[0];
+ unsigned char *dst_ptr = x->dst.y_buffer;
+ unsigned char *udst_ptr = x->dst.u_buffer;
+ unsigned char *vdst_ptr = x->dst.v_buffer;
+ int ystride = x->dst.y_stride;
+ /*int uv_stride = x->dst.uv_stride;*/
+
+ vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr);
+}
diff --git a/vp8/common/arm/recon_arm.c b/vp8/common/arm/recon_arm.c
deleted file mode 100644
index 218898b44..000000000
--- a/vp8/common/arm/recon_arm.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "recon.h"
-#include "blockd.h"
-
-extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
-
-/*
-void vp8_recon16x16mby(MACROBLOCKD *x)
-{
- int i;
- for(i=0;i<16;i+=4)
- {
- //vp8_recon4b(&x->block[i]);
- BLOCKD *b = &x->block[i];
- vp8_recon4b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- }
-}
-*/
-void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
- BLOCKD *b = &x->block[0];
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
- //b = &x->block[4];
- b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
- //b = &x->block[8];
- b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
- //b = &x->block[12];
- b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-
-#if HAVE_ARMV7
-void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
- unsigned char *pred_ptr = &x->predictor[0];
- short *diff_ptr = &x->diff[0];
- unsigned char *dst_ptr = x->dst.y_buffer;
- unsigned char *udst_ptr = x->dst.u_buffer;
- unsigned char *vdst_ptr = x->dst.v_buffer;
- int ystride = x->dst.y_stride;
- //int uv_stride = x->dst.uv_stride;
-
- vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr);
-}
-
-#else
-/*
-void vp8_recon16x16mb(MACROBLOCKD *x)
-{
- int i;
-
- for(i=0;i<16;i+=4)
- {
-// vp8_recon4b(&x->block[i]);
- BLOCKD *b = &x->block[i];
- vp8_recon4b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
- }
- for(i=16;i<24;i+=2)
- {
-// vp8_recon2b(&x->block[i]);
- BLOCKD *b = &x->block[i];
- vp8_recon2b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- }
-}
-*/
-void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
- BLOCKD *b = &x->block[0];
-
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 4;
-
- //b = &x->block[16];
-
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b++;
- b++;
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b++;
- b++;
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b++;
- b++;
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-#endif
diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h
index 18855a3c0..b46b7fc7d 100644
--- a/vp8/common/arm/recon_arm.h
+++ b/vp8/common/arm/recon_arm.h
@@ -21,6 +21,7 @@ extern prototype_copy_block(vp8_copy_mem8x8_v6);
extern prototype_copy_block(vp8_copy_mem8x4_v6);
extern prototype_copy_block(vp8_copy_mem16x16_v6);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp8_recon_b_armv6
@@ -39,6 +40,7 @@ extern prototype_copy_block(vp8_copy_mem16x16_v6);
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_v6
#endif
+#endif
#if HAVE_ARMV7
extern prototype_recon_block(vp8_recon_b_neon);
@@ -49,6 +51,9 @@ extern prototype_copy_block(vp8_copy_mem8x8_neon);
extern prototype_copy_block(vp8_copy_mem8x4_neon);
extern prototype_copy_block(vp8_copy_mem16x16_neon);
+extern prototype_recon_macroblock(vp8_recon_mb_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp8_recon_b_neon
@@ -66,6 +71,10 @@ extern prototype_copy_block(vp8_copy_mem16x16_neon);
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_neon
+
+#undef vp8_recon_recon_mb
+#define vp8_recon_recon_mb vp8_recon_mb_neon
+#endif
#endif
#endif
diff --git a/vp8/common/arm/reconintra4x4_arm.c b/vp8/common/arm/reconintra4x4_arm.c
deleted file mode 100644
index 8d968d7ad..000000000
--- a/vp8/common/arm/reconintra4x4_arm.c
+++ /dev/null
@@ -1,409 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "recon.h"
-#include "vpx_mem/vpx_mem.h"
-#include "reconintra.h"
-
-void vp8_predict_intra4x4(BLOCKD *x,
- int b_mode,
- unsigned char *predictor)
-{
- int i, r, c;
-
- unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
- unsigned char Left[4];
- unsigned char top_left = Above[-1];
-
- Left[0] = (*(x->base_dst))[x->dst - 1];
- Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
- Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
- Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
-
- switch (b_mode)
- {
- case B_DC_PRED:
- {
- int expected_dc = 0;
-
- for (i = 0; i < 4; i++)
- {
- expected_dc += Above[i];
- expected_dc += Left[i];
- }
-
- expected_dc = (expected_dc + 4) >> 3;
-
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
- predictor[c] = expected_dc;
- }
-
- predictor += 16;
- }
- }
- break;
- case B_TM_PRED:
- {
- // prediction similar to true_motion prediction
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
- int pred = Above[c] - top_left + Left[r];
-
- if (pred < 0)
- pred = 0;
-
- if (pred > 255)
- pred = 255;
-
- predictor[c] = pred;
- }
-
- predictor += 16;
- }
- }
- break;
-
- case B_VE_PRED:
- {
-
- unsigned int ap[4];
- ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2;
- ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2;
- ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2;
- ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2;
-
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
-
- predictor[c] = ap[c];
- }
-
- predictor += 16;
- }
-
- }
- break;
-
-
- case B_HE_PRED:
- {
-
- unsigned int lp[4];
- lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2;
- lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2;
- lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2;
- lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2;
-
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
- predictor[c] = lp[r];
- }
-
- predictor += 16;
- }
- }
- break;
- case B_LD_PRED:
- {
- unsigned char *ptr = Above;
- predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
- predictor[0 * 16 + 1] =
- predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
- predictor[0 * 16 + 2] =
- predictor[1 * 16 + 1] =
- predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
- predictor[0 * 16 + 3] =
- predictor[1 * 16 + 2] =
- predictor[2 * 16 + 1] =
- predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
- predictor[1 * 16 + 3] =
- predictor[2 * 16 + 2] =
- predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
- predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
- }
- break;
- case B_RD_PRED:
- {
-
- unsigned char pp[9];
-
- pp[0] = Left[3];
- pp[1] = Left[2];
- pp[2] = Left[1];
- pp[3] = Left[0];
- pp[4] = top_left;
- pp[5] = Above[0];
- pp[6] = Above[1];
- pp[7] = Above[2];
- pp[8] = Above[3];
-
- predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[3 * 16 + 1] =
- predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[3 * 16 + 2] =
- predictor[2 * 16 + 1] =
- predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[3 * 16 + 3] =
- predictor[2 * 16 + 2] =
- predictor[1 * 16 + 1] =
- predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[1 * 16 + 2] =
- predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[1 * 16 + 3] =
- predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-
- }
- break;
- case B_VR_PRED:
- {
-
- unsigned char pp[9];
-
- pp[0] = Left[3];
- pp[1] = Left[2];
- pp[2] = Left[1];
- pp[3] = Left[0];
- pp[4] = top_left;
- pp[5] = Above[0];
- pp[6] = Above[1];
- pp[7] = Above[2];
- pp[8] = Above[3];
-
-
- predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[3 * 16 + 1] =
- predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 1] =
- predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
- predictor[3 * 16 + 2] =
- predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
- predictor[3 * 16 + 3] =
- predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
- predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
-
- }
- break;
- case B_VL_PRED:
- {
-
- unsigned char *pp = Above;
-
- predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[2 * 16 + 0] =
- predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
- predictor[1 * 16 + 1] =
- predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 1] =
- predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
- predictor[3 * 16 + 1] =
- predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[0 * 16 + 3] =
- predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- }
- break;
-
- case B_HD_PRED:
- {
- unsigned char pp[9];
- pp[0] = Left[3];
- pp[1] = Left[2];
- pp[2] = Left[1];
- pp[3] = Left[0];
- pp[4] = top_left;
- pp[5] = Above[0];
- pp[6] = Above[1];
- pp[7] = Above[2];
- pp[8] = Above[3];
-
-
- predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[2 * 16 + 0] =
- predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
- predictor[2 * 16 + 1] =
- predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
- predictor[2 * 16 + 3] =
- predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[1 * 16 + 2] =
- predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- }
- break;
-
-
- case B_HU_PRED:
- {
- unsigned char *pp = Left;
- predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[0 * 16 + 2] =
- predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
- predictor[0 * 16 + 3] =
- predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[1 * 16 + 2] =
- predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[2 * 16 + 3] =
- predictor[3 * 16 + 0] =
- predictor[3 * 16 + 1] =
- predictor[3 * 16 + 2] =
- predictor[3 * 16 + 3] = pp[3];
- }
- break;
-
-
- }
-}
-// copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
-// to the right prediction have filled in pixels to use.
-void vp8_intra_prediction_down_copy(MACROBLOCKD *x)
-{
- unsigned char *above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16;
-
- unsigned int *src_ptr = (unsigned int *)above_right;
- unsigned int *dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride);
- unsigned int *dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride);
- unsigned int *dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride);
-
- *dst_ptr0 = *src_ptr;
- *dst_ptr1 = *src_ptr;
- *dst_ptr2 = *src_ptr;
-}
-
-
-
-/*
-void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
- int i;
-
- vp8_intra_prediction_down_copy(x);
-
- for(i=0;i<16;i++)
- {
- BLOCKD *b = &x->block[i];
-
- vp8_predict_intra4x4(b, x->block[i].bmi.mode,x->block[i].predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- }
-
- vp8_recon_intra_mbuv(x);
-
-}
-*/
-void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
- int i;
- BLOCKD *b = &x->block[0];
-
- vp8_intra_prediction_down_copy(x);
-
- {
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- b += 1;
-
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
- RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- }
-
- vp8_recon_intra_mbuv(rtcd, x);
-
-}
diff --git a/vp8/common/arm/subpixel_arm.h b/vp8/common/arm/subpixel_arm.h
index 53600e547..6288538d0 100644
--- a/vp8/common/arm/subpixel_arm.h
+++ b/vp8/common/arm/subpixel_arm.h
@@ -22,6 +22,7 @@ extern prototype_subpixel_predict(vp8_bilinear_predict8x8_armv6);
extern prototype_subpixel_predict(vp8_bilinear_predict8x4_armv6);
extern prototype_subpixel_predict(vp8_bilinear_predict4x4_armv6);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_subpix_sixtap16x16
#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_armv6
@@ -46,6 +47,7 @@ extern prototype_subpixel_predict(vp8_bilinear_predict4x4_armv6);
#undef vp8_subpix_bilinear4x4
#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_armv6
#endif
+#endif
#if HAVE_ARMV7
extern prototype_subpixel_predict(vp8_sixtap_predict16x16_neon);
@@ -57,6 +59,7 @@ extern prototype_subpixel_predict(vp8_bilinear_predict8x8_neon);
extern prototype_subpixel_predict(vp8_bilinear_predict8x4_neon);
extern prototype_subpixel_predict(vp8_bilinear_predict4x4_neon);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_subpix_sixtap16x16
#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_neon
@@ -81,5 +84,6 @@ extern prototype_subpixel_predict(vp8_bilinear_predict4x4_neon);
#undef vp8_subpix_bilinear4x4
#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_neon
#endif
+#endif
#endif
diff --git a/vp8/common/arm/systemdependent.c b/vp8/common/arm/systemdependent.c
deleted file mode 100644
index 1eed97e02..000000000
--- a/vp8/common/arm/systemdependent.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "g_common.h"
-#include "pragmas.h"
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
-
-void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
-
-void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
-
-void vp8_machine_specific_config(VP8_COMMON *ctx)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
- VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
-
-#if HAVE_ARMV7
- rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_neon;
- rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_neon;
- rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_neon;
- rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_neon;
- rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_neon;
- rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_neon;
- rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_neon;
- rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
-
- rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon;
- rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
- rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
-
- rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
- rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_neon;
- rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_neon;
- rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_neon;
- rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_neon;
- rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_neon;
- rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_neon;
- rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_neon;
-
- rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon;
- rtcd->recon.copy8x8 = vp8_copy_mem8x8_neon;
- rtcd->recon.copy8x4 = vp8_copy_mem8x4_neon;
- rtcd->recon.recon = vp8_recon_b_neon;
- rtcd->recon.recon2 = vp8_recon2b_neon;
- rtcd->recon.recon4 = vp8_recon4b_neon;
-#elif HAVE_ARMV6
-
- rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_armv6;
- rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_armv6;
- rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_armv6;
- rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_armv6;
- rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_armv6;
- rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_armv6;
- rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_armv6;
- rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
-
- rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6;
- rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_armv6;
- rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_armv6;
-
- rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
- rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6;
- rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
- rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6;
- rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
- rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6;
- rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
- rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6;
-
- rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
- rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6;
- rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6;
- rtcd->recon.recon = vp8_recon_b_armv6;
- rtcd->recon.recon2 = vp8_recon2b_armv6;
- rtcd->recon.recon4 = vp8_recon4b_armv6;
-#else
-//pure c
- rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c;
- rtcd->idct.idct16 = vp8_short_idct4x4llm_c;
- rtcd->idct.idct1_scalar = vp8_dc_only_idct_c;
- rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c;
- rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_c;
-
- rtcd->recon.copy16x16 = vp8_copy_mem16x16_c;
- rtcd->recon.copy8x8 = vp8_copy_mem8x8_c;
- rtcd->recon.copy8x4 = vp8_copy_mem8x4_c;
- rtcd->recon.recon = vp8_recon_b_c;
- rtcd->recon.recon2 = vp8_recon2b_c;
- rtcd->recon.recon4 = vp8_recon4b_c;
-
- rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
- rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
- rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_c;
- rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_c;
- rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c;
- rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_c;
- rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_c;
- rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_c;
-
- rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c;
- rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c;
- rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c;
- rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_c;
- rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c;
- rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_c;
- rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c;
- rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c;
-#endif
-
-#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
- rtcd->postproc.down = vp8_mbpost_proc_down_c;
- rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
- rtcd->postproc.downacross = vp8_post_proc_down_and_across_c;
- rtcd->postproc.addnoise = vp8_plane_add_noise_c;
-#endif
-#endif
-
-#if HAVE_ARMV7
- vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby_neon;
- vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s_neon;
-#elif HAVE_ARMV6
- vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
- vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
-#else
- vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
- vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
-
-#endif
-
-}
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 4b7f1a359..a38f0b72b 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -24,7 +24,7 @@ void vpx_log(const char *format, ...);
#define TRUE 1
#define FALSE 0
-//#define DCPRED 1
+/*#define DCPRED 1*/
#define DCPREDSIMTHRESH 0
#define DCPREDCNTTHRESH 3
@@ -39,7 +39,7 @@ void vpx_log(const char *format, ...);
#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 4
-// Segment Feature Masks
+/* Segment Feature Masks */
#define SEGMENT_DELTADATA 0
#define SEGMENT_ABSDATA 1
@@ -75,11 +75,11 @@ typedef enum
typedef enum
{
- DC_PRED, // average of above and left pixels
- V_PRED, // vertical prediction
- H_PRED, // horizontal prediction
- TM_PRED, // Truemotion prediction
- B_PRED, // block based prediction, each block has its own prediction mode
+ DC_PRED, /* average of above and left pixels */
+ V_PRED, /* vertical prediction */
+ H_PRED, /* horizontal prediction */
+ TM_PRED, /* Truemotion prediction */
+ B_PRED, /* block based prediction, each block has its own prediction mode */
NEARESTMV,
NEARMV,
@@ -90,16 +90,16 @@ typedef enum
MB_MODE_COUNT
} MB_PREDICTION_MODE;
-// Macroblock level features
+/* Macroblock level features */
typedef enum
{
- MB_LVL_ALT_Q = 0, // Use alternate Quantizer ....
- MB_LVL_ALT_LF = 1, // Use alternate loop filter value...
- MB_LVL_MAX = 2, // Number of MB level features supported
+ MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */
+ MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */
+ MB_LVL_MAX = 2 /* Number of MB level features supported */
} MB_LVL_FEATURES;
-// Segment Feature Masks
+/* Segment Feature Masks */
#define SEGMENT_ALTQ 0x01
#define SEGMENT_ALT_LF 0x02
@@ -110,11 +110,11 @@ typedef enum
typedef enum
{
- B_DC_PRED, // average of above and left pixels
+ B_DC_PRED, /* average of above and left pixels */
B_TM_PRED,
- B_VE_PRED, // vertical prediction
- B_HE_PRED, // horizontal prediction
+ B_VE_PRED, /* vertical prediction */
+ B_HE_PRED, /* horizontal prediction */
B_LD_PRED,
B_RD_PRED,
@@ -169,14 +169,14 @@ typedef struct
MV as_mv;
} mv;
- char partitioning;
- unsigned char mb_skip_coeff; //does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens
+ unsigned char partitioning;
+ unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
unsigned char dc_diff;
unsigned char need_to_clamp_mvs;
- unsigned char segment_id; // Which set of segmentation parameters should be used for this MB
+ unsigned char segment_id; /* Which set of segmentation parameters should be used for this MB */
- unsigned char force_no_skip; //encoder only
+ unsigned char force_no_skip; /* encoder only */
} MB_MODE_INFO;
@@ -195,9 +195,9 @@ typedef struct
short *diff;
short *reference;
- short(*dequant)[4];
+ short *dequant;
- // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+ /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
unsigned char **base_pre;
int pre;
int pre_stride;
@@ -214,17 +214,17 @@ typedef struct
typedef struct
{
- DECLARE_ALIGNED(16, short, diff[400]); // from idct diff
+ DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
DECLARE_ALIGNED(16, unsigned char, predictor[384]);
-//not used DECLARE_ALIGNED(16, short, reference[384]);
+/* not used DECLARE_ALIGNED(16, short, reference[384]); */
DECLARE_ALIGNED(16, short, qcoeff[400]);
DECLARE_ALIGNED(16, short, dqcoeff[400]);
DECLARE_ALIGNED(16, char, eobs[25]);
- // 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries.
+ /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
BLOCKD block[25];
- YV12_BUFFER_CONFIG pre; // Filtered copy of previous frame reconstruction
+ YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
YV12_BUFFER_CONFIG dst;
MODE_INFO *mode_info_context;
@@ -235,39 +235,39 @@ typedef struct
int up_available;
int left_available;
- // Y,U,V,Y2
+ /* Y,U,V,Y2 */
ENTROPY_CONTEXT_PLANES *above_context;
ENTROPY_CONTEXT_PLANES *left_context;
- // 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active.
+ /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
unsigned char segmentation_enabled;
- // 0 (do not update) 1 (update) the macroblock segmentation map.
+ /* 0 (do not update) 1 (update) the macroblock segmentation map. */
unsigned char update_mb_segmentation_map;
- // 0 (do not update) 1 (update) the macroblock segmentation feature data.
+ /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
unsigned char update_mb_segmentation_data;
- // 0 (do not update) 1 (update) the macroblock segmentation feature data.
+ /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
unsigned char mb_segement_abs_delta;
- // Per frame flags that define which MB level features (such as quantizer or loop filter level)
- // are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO
- vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; // Probability Tree used to code Segment number
+ /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
+ /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */
+ vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; /* Probability Tree used to code Segment number */
- signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; // Segment parameters
+ signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; /* Segment parameters */
- // mode_based Loop filter adjustment
+ /* mode_based Loop filter adjustment */
unsigned char mode_ref_lf_delta_enabled;
unsigned char mode_ref_lf_delta_update;
- // Delta values have the range +/- MAX_LOOP_FILTER
- //char ref_lf_deltas[MAX_REF_LF_DELTAS]; // 0 = Intra, Last, GF, ARF
- //char mode_lf_deltas[MAX_MODE_LF_DELTAS]; // 0 = BPRED, ZERO_MV, MV, SPLIT
- signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; // 0 = Intra, Last, GF, ARF
- signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; // 0 = BPRED, ZERO_MV, MV, SPLIT
+ /* Delta values have the range +/- MAX_LOOP_FILTER */
+ signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+ signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
- // Distance of MB away from frame edges
+ /* Distance of MB away from frame edges */
int mb_to_left_edge;
int mb_to_right_edge;
int mb_to_top_edge;
diff --git a/vp8/common/debugmodes.c b/vp8/common/debugmodes.c
index c3ac88fc8..8c03480fa 100644
--- a/vp8/common/debugmodes.c
+++ b/vp8/common/debugmodes.c
@@ -21,7 +21,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
int mb_index = 0;
FILE *mvs = fopen("mvs.stt", "a");
- // print out the macroblock Y modes
+ /* print out the macroblock Y modes */
mb_index = 0;
fprintf(mvs, "Mb Modes for Frame %d\n", frame);
@@ -60,7 +60,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
fprintf(mvs, "\n");
- // print out the macroblock UV modes
+ /* print out the macroblock UV modes */
mb_index = 0;
fprintf(mvs, "UV Modes for Frame %d\n", frame);
@@ -80,7 +80,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
fprintf(mvs, "\n");
- // print out the block modes
+ /* print out the block modes */
mb_index = 0;
fprintf(mvs, "Mbs for Frame %d\n", frame);
{
@@ -108,7 +108,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
}
fprintf(mvs, "\n");
- // print out the macroblock mvs
+ /* print out the macroblock mvs */
mb_index = 0;
fprintf(mvs, "MVs for Frame %d\n", frame);
@@ -128,7 +128,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
fprintf(mvs, "\n");
- // print out the block modes
+ /* print out the block modes */
mb_index = 0;
fprintf(mvs, "MVs for Frame %d\n", frame);
{
diff --git a/vp8/common/defaultcoefcounts.h b/vp8/common/defaultcoefcounts.h
index b85f59b9f..ca58d565a 100644
--- a/vp8/common/defaultcoefcounts.h
+++ b/vp8/common/defaultcoefcounts.h
@@ -15,204 +15,204 @@ static const unsigned int default_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_C
{
{
- // Block Type ( 0 )
+ /* Block Type ( 0 ) */
{
- // Coeff Band ( 0 )
+ /* Coeff Band ( 0 ) */
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
},
{
- // Coeff Band ( 1 )
+ /* Coeff Band ( 1 ) */
{30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
{26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
{10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
},
{
- // Coeff Band ( 2 )
+ /* Coeff Band ( 2 ) */
{25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
{9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
{1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
},
{
- // Coeff Band ( 3 )
+ /* Coeff Band ( 3 ) */
{26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
{8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
{ 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
},
{
- // Coeff Band ( 4 )
+ /* Coeff Band ( 4 ) */
{10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
{2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
{ 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
},
{
- // Coeff Band ( 5 )
+ /* Coeff Band ( 5 ) */
{10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
{2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
{ 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
},
{
- // Coeff Band ( 6 )
+ /* Coeff Band ( 6 ) */
{40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
{6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
{ 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
},
{
- // Coeff Band ( 7 )
+ /* Coeff Band ( 7 ) */
{ 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
},
},
{
- // Block Type ( 1 )
+ /* Block Type ( 1 ) */
{
- // Coeff Band ( 0 )
+ /* Coeff Band ( 0 ) */
{3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
{8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
{9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
},
{
- // Coeff Band ( 1 )
+ /* Coeff Band ( 1 ) */
{12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
{11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
{7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
},
{
- // Coeff Band ( 2 )
+ /* Coeff Band ( 2 ) */
{15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
{7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
{1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
},
{
- // Coeff Band ( 3 )
+ /* Coeff Band ( 3 ) */
{19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
{9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
{1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
},
{
- // Coeff Band ( 4 )
+ /* Coeff Band ( 4 ) */
{12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
{4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
{ 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
},
{
- // Coeff Band ( 5 )
+ /* Coeff Band ( 5 ) */
{12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
{4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
{ 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
},
{
- // Coeff Band ( 6 )
+ /* Coeff Band ( 6 ) */
{61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
{15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
{ 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
},
{
- // Coeff Band ( 7 )
+ /* Coeff Band ( 7 ) */
{ 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
{ 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
},
},
{
- // Block Type ( 2 )
+ /* Block Type ( 2 ) */
{
- // Coeff Band ( 0 )
+ /* Coeff Band ( 0 ) */
{ 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
{1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
{1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
},
{
- // Coeff Band ( 1 )
+ /* Coeff Band ( 1 ) */
{1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
{1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
{1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
},
{
- // Coeff Band ( 2 )
+ /* Coeff Band ( 2 ) */
{ 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
{ 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
{ 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
},
{
- // Coeff Band ( 3 )
+ /* Coeff Band ( 3 ) */
{ 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
{ 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
},
{
- // Coeff Band ( 4 )
+ /* Coeff Band ( 4 ) */
{ 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
{ 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
},
{
- // Coeff Band ( 5 )
+ /* Coeff Band ( 5 ) */
{ 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
{ 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
},
{
- // Coeff Band ( 6 )
+ /* Coeff Band ( 6 ) */
{ 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
{ 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
},
{
- // Coeff Band ( 7 )
+ /* Coeff Band ( 7 ) */
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
},
},
{
- // Block Type ( 3 )
+ /* Block Type ( 3 ) */
{
- // Coeff Band ( 0 )
+ /* Coeff Band ( 0 ) */
{2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
{8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
{11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
},
{
- // Coeff Band ( 1 )
+ /* Coeff Band ( 1 ) */
{9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
{12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
{10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
},
{
- // Coeff Band ( 2 )
+ /* Coeff Band ( 2 ) */
{6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
{6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
{3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
},
{
- // Coeff Band ( 3 )
+ /* Coeff Band ( 3 ) */
{11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
{9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
{4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
},
{
- // Coeff Band ( 4 )
+ /* Coeff Band ( 4 ) */
{4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
{3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
{1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
},
{
- // Coeff Band ( 5 )
+ /* Coeff Band ( 5 ) */
{8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
{3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
{1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
},
{
- // Coeff Band ( 6 )
+ /* Coeff Band ( 6 ) */
{27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
{5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
{1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
},
{
- // Coeff Band ( 7 )
+ /* Coeff Band ( 7 ) */
{ 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
{ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},
diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h
index 80b481700..0685cd0ae 100644
--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@@ -17,18 +17,18 @@
/* Coefficient token alphabet */
-#define ZERO_TOKEN 0 //0 Extra Bits 0+0
-#define ONE_TOKEN 1 //1 Extra Bits 0+1
-#define TWO_TOKEN 2 //2 Extra Bits 0+1
-#define THREE_TOKEN 3 //3 Extra Bits 0+1
-#define FOUR_TOKEN 4 //4 Extra Bits 0+1
-#define DCT_VAL_CATEGORY1 5 //5-6 Extra Bits 1+1
-#define DCT_VAL_CATEGORY2 6 //7-10 Extra Bits 2+1
-#define DCT_VAL_CATEGORY3 7 //11-26 Extra Bits 4+1
-#define DCT_VAL_CATEGORY4 8 //11-26 Extra Bits 5+1
-#define DCT_VAL_CATEGORY5 9 //27-58 Extra Bits 5+1
-#define DCT_VAL_CATEGORY6 10 //59+ Extra Bits 11+1
-#define DCT_EOB_TOKEN 11 //EOB Extra Bits 0+0
+#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
+#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */
+#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */
+#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */
+#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
+#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
+#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
+#define DCT_VAL_CATEGORY3 7 /* 11-26 Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY4 8 /* 11-26 Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY5 9 /* 27-58 Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6 10 /* 59+ Extra Bits 11+1 */
+#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
#define vp8_coef_tokens 12
#define MAX_ENTROPY_TOKENS vp8_coef_tokens
@@ -83,7 +83,7 @@ extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
coefficient band (and since zigzag positions 0, 1, and 2 are in
distinct bands). */
-/*# define DC_TOKEN_CONTEXTS 3 // 00, 0!0, !0!0 */
+/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
# define PREV_COEF_CONTEXTS 3
extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[vp8_coef_tokens]);
diff --git a/vp8/common/entropymv.c b/vp8/common/entropymv.c
index 8e72881e9..e5df1f095 100644
--- a/vp8/common/entropymv.c
+++ b/vp8/common/entropymv.c
@@ -29,21 +29,21 @@ const MV_CONTEXT vp8_mv_update_probs[2] =
const MV_CONTEXT vp8_default_mv_context[2] =
{
{{
- // row
- 162, // is short
- 128, // sign
- 225, 146, 172, 147, 214, 39, 156, // short tree
- 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 // long bits
+ /* row */
+ 162, /* is short */
+ 128, /* sign */
+ 225, 146, 172, 147, 214, 39, 156, /* short tree */
+ 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */
}},
{{
- // same for column
- 164, // is short
+ /* same for column */
+ 164, /* is short */
128,
204, 170, 119, 235, 140, 230, 228,
- 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 // long bits
+ 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */
}}
};
diff --git a/vp8/common/extend.c b/vp8/common/extend.c
index 7e06ac30c..47207fa79 100644
--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -15,14 +15,14 @@
static void extend_plane_borders
(
- unsigned char *s, // source
- int sp, // pitch
- int h, // height
- int w, // width
- int et, // extend top border
- int el, // extend left border
- int eb, // extend bottom border
- int er // extend right border
+ unsigned char *s, /* source */
+ int sp, /* pitch */
+ int h, /* height */
+ int w, /* width */
+ int et, /* extend top border */
+ int el, /* extend left border */
+ int eb, /* extend bottom border */
+ int er /* extend right border */
)
{
@@ -31,7 +31,7 @@ static void extend_plane_borders
unsigned char *dest_ptr1, *dest_ptr2;
int linesize;
- // copy the left and right most columns out
+ /* copy the left and right most columns out */
src_ptr1 = s;
src_ptr2 = s + w - 1;
dest_ptr1 = s - el;
@@ -39,8 +39,9 @@ static void extend_plane_borders
for (i = 0; i < h - 0 + 1; i++)
{
- // Some linkers will complain if we call vpx_memset with el set to a
- // constant 0.
+ /* Some linkers will complain if we call vpx_memset with el set to a
+ * constant 0.
+ */
if (el)
vpx_memset(dest_ptr1, src_ptr1[0], el);
vpx_memset(dest_ptr2, src_ptr2[0], er);
@@ -50,7 +51,7 @@ static void extend_plane_borders
dest_ptr2 += sp;
}
- // Now copy the top and bottom source lines into each line of the respective borders
+ /* Now copy the top and bottom source lines into each line of the respective borders */
src_ptr1 = s - el;
src_ptr2 = s + sp * (h - 1) - el;
dest_ptr1 = s + sp * (-et) - el;
@@ -76,12 +77,12 @@ void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height)
int er = 0xf & (16 - (width & 0xf));
int eb = 0xf & (16 - (height & 0xf));
- // check for non multiples of 16
+ /* check for non multiples of 16 */
if (er != 0 || eb != 0)
{
extend_plane_borders(ybf->y_buffer, ybf->y_stride, height, width, 0, 0, eb, er);
- //adjust for uv
+ /* adjust for uv */
height = (height + 1) >> 1;
width = (width + 1) >> 1;
er = 0x7 & (8 - (width & 0x7));
@@ -95,7 +96,7 @@ void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height)
}
}
-// note the extension is only for the last row, for intra prediction purpose
+/* note the extension is only for the last row, for intra prediction purpose */
void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr)
{
int i;
diff --git a/vp8/common/filter_c.c b/vp8/common/filter_c.c
index 3d18d8191..399a847d5 100644
--- a/vp8/common/filter_c.c
+++ b/vp8/common/filter_c.c
@@ -32,13 +32,13 @@ static const int bilinear_filters[8][2] =
static const short sub_pel_filters[8][6] =
{
- { 0, 0, 128, 0, 0, 0 }, // note that 1/8 pel positions are just as per alpha -0.5 bicubic
+ { 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
{ 0, -6, 123, 12, -1, 0 },
- { 2, -11, 108, 36, -8, 1 }, // New 1/4 pel 6 tap filter
+ { 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
{ 0, -9, 93, 50, -6, 0 },
- { 3, -16, 77, 77, -16, 3 }, // New 1/2 pel 6 tap filter
+ { 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */
{ 0, -6, 50, 93, -9, 0 },
- { 1, -8, 36, 108, -11, 2 }, // New 1/4 pel 6 tap filter
+ { 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
{ 0, -1, 12, 123, -6, 0 },
@@ -69,9 +69,9 @@ void vp8_filter_block2d_first_pass
((int)src_ptr[pixel_step] * vp8_filter[3]) +
((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
- (VP8_FILTER_WEIGHT >> 1); // Rounding
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
- // Normalize back to 0-255
+ /* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
@@ -83,7 +83,7 @@ void vp8_filter_block2d_first_pass
src_ptr++;
}
- // Next row...
+ /* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
@@ -108,16 +108,16 @@ void vp8_filter_block2d_second_pass
{
for (j = 0; j < output_width; j++)
{
- // Apply filter
+ /* Apply filter */
Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
((int)src_ptr[0] * vp8_filter[2]) +
((int)src_ptr[pixel_step] * vp8_filter[3]) +
((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
- (VP8_FILTER_WEIGHT >> 1); // Rounding
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
- // Normalize back to 0-255
+ /* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
@@ -129,7 +129,7 @@ void vp8_filter_block2d_second_pass
src_ptr++;
}
- // Start next row
+ /* Start next row */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_pitch;
}
@@ -146,12 +146,12 @@ void vp8_filter_block2d
const short *VFilter
)
{
- int FData[9*4]; // Temp data bufffer used in filtering
+ int FData[9*4]; /* Temp data bufffer used in filtering */
- // First filter 1-D horizontally...
+ /* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
- // then filter verticaly...
+ /* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
}
@@ -195,8 +195,8 @@ void vp8_sixtap_predict_c
const short *HFilter;
const short *VFilter;
- HFilter = sub_pel_filters[xoffset]; // 6 tap
- VFilter = sub_pel_filters[yoffset]; // 6 tap
+ HFilter = sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = sub_pel_filters[yoffset]; /* 6 tap */
vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
}
@@ -212,16 +212,16 @@ void vp8_sixtap_predict8x8_c
{
const short *HFilter;
const short *VFilter;
- int FData[13*16]; // Temp data bufffer used in filtering
+ int FData[13*16]; /* Temp data bufffer used in filtering */
- HFilter = sub_pel_filters[xoffset]; // 6 tap
- VFilter = sub_pel_filters[yoffset]; // 6 tap
+ HFilter = sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = sub_pel_filters[yoffset]; /* 6 tap */
- // First filter 1-D horizontally...
+ /* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
- // then filter verticaly...
+ /* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
}
@@ -238,16 +238,16 @@ void vp8_sixtap_predict8x4_c
{
const short *HFilter;
const short *VFilter;
- int FData[13*16]; // Temp data bufffer used in filtering
+ int FData[13*16]; /* Temp data bufffer used in filtering */
- HFilter = sub_pel_filters[xoffset]; // 6 tap
- VFilter = sub_pel_filters[yoffset]; // 6 tap
+ HFilter = sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = sub_pel_filters[yoffset]; /* 6 tap */
- // First filter 1-D horizontally...
+ /* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
- // then filter verticaly...
+ /* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
}
@@ -264,16 +264,16 @@ void vp8_sixtap_predict16x16_c
{
const short *HFilter;
const short *VFilter;
- int FData[21*24]; // Temp data bufffer used in filtering
+ int FData[21*24]; /* Temp data bufffer used in filtering */
- HFilter = sub_pel_filters[xoffset]; // 6 tap
- VFilter = sub_pel_filters[yoffset]; // 6 tap
+ HFilter = sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = sub_pel_filters[yoffset]; /* 6 tap */
- // First filter 1-D horizontally...
+ /* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
- // then filter verticaly...
+ /* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
}
@@ -324,14 +324,14 @@ void vp8_filter_block2d_bil_first_pass
{
for (j = 0; j < output_width; j++)
{
- // Apply bilinear filter
+ /* Apply bilinear filter */
output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[pixel_step] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
src_ptr++;
}
- // Next row...
+ /* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
@@ -384,7 +384,7 @@ void vp8_filter_block2d_bil_second_pass
{
for (j = 0; j < output_width; j++)
{
- // Apply filter
+ /* Apply filter */
Temp = ((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[pixel_step] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2);
@@ -392,7 +392,7 @@ void vp8_filter_block2d_bil_second_pass
src_ptr++;
}
- // Next row...
+ /* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_pitch;
}
@@ -432,12 +432,12 @@ void vp8_filter_block2d_bil
)
{
- unsigned short FData[17*16]; // Temp data bufffer used in filtering
+ unsigned short FData[17*16]; /* Temp data bufffer used in filtering */
- // First filter 1-D horizontally...
+ /* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter);
- // then 1-D vertically...
+ /* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter);
}
diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c
index 41037f707..e63d4ef8d 100644
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -168,7 +168,7 @@ void vp8_find_near_mvs
vp8_clamp_mv(nearest, xd);
vp8_clamp_mv(nearby, xd);
- vp8_clamp_mv(best_mv, xd); //TODO: move this up before the copy
+ vp8_clamp_mv(best_mv, xd); /*TODO: move this up before the copy*/
}
vp8_prob *vp8_mv_ref_probs(
@@ -179,7 +179,7 @@ vp8_prob *vp8_mv_ref_probs(
p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1];
p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2];
p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3];
- //p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];
+ /*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/
return p;
}
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index c04e31ffe..b3eadaf27 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -18,6 +18,7 @@
#include "onyxc_int.h"
extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
+extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
@@ -39,9 +40,11 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->recon.copy16x16 = vp8_copy_mem16x16_c;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_c;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_c;
- rtcd->recon.recon = vp8_recon_b_c;
+ rtcd->recon.recon = vp8_recon_b_c;
rtcd->recon.recon2 = vp8_recon2b_c;
- rtcd->recon.recon4 = vp8_recon4b_c;
+ rtcd->recon.recon4 = vp8_recon4b_c;
+ rtcd->recon.recon_mb = vp8_recon_mb_c;
+ rtcd->recon.recon_mby = vp8_recon_mby_c;
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
@@ -62,14 +65,17 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c;
#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
- rtcd->postproc.down = vp8_mbpost_proc_down_c;
- rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
- rtcd->postproc.downacross = vp8_post_proc_down_and_across_c;
- rtcd->postproc.addnoise = vp8_plane_add_noise_c;
+ rtcd->postproc.down = vp8_mbpost_proc_down_c;
+ rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
+ rtcd->postproc.downacross = vp8_post_proc_down_and_across_c;
+ rtcd->postproc.addnoise = vp8_plane_add_noise_c;
+ rtcd->postproc.blend_mb_inner = vp8_blend_mb_inner_c;
+ rtcd->postproc.blend_mb_outer = vp8_blend_mb_outer_c;
+ rtcd->postproc.blend_b = vp8_blend_b_c;
#endif
#endif
- // Pure C:
+ /* Pure C: */
vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
@@ -77,4 +83,8 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
vp8_arch_x86_common_init(ctx);
#endif
+#if ARCH_ARM
+ vp8_arch_arm_common_init(ctx);
+#endif
+
}
diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
index 4cb433a80..81a3f2d89 100644
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c
@@ -38,7 +38,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *
{
int i;
- // do 2nd order transform on the dc block
+ /* do 2nd order transform on the dc block */
IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
recon_dcblock(x);
@@ -68,7 +68,7 @@ void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x
if (x->mode_info_context->mbmi.mode != B_PRED &&
x->mode_info_context->mbmi.mode != SPLITMV)
{
- // do 2nd order transform on the dc block
+ /* do 2nd order transform on the dc block */
IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
recon_dcblock(x);
diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c
index da9ca2871..f9d082304 100644
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -23,7 +23,7 @@ prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
-// Horizontal MB filtering
+/* Horizontal MB filtering */
void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -47,7 +47,7 @@ void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
-// Vertical MB Filtering
+/* Vertical MB Filtering */
void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -71,7 +71,7 @@ void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
-// Horizontal B Filtering
+/* Horizontal B Filtering */
void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -99,7 +99,7 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
-// Vertical B Filtering
+/* Vertical B Filtering */
void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -140,7 +140,7 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
const int yhedge_boost = 2;
const int uvhedge_boost = 2;
- // For each possible value for the loop filter fill out a "loop_filter_info" entry.
+ /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
{
int filt_lvl = i;
@@ -166,7 +166,7 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
HEVThresh = 0;
}
- // Set loop filter paramaeters that control sharpness.
+ /* Set loop filter paramaeters that control sharpness. */
block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
@@ -195,7 +195,7 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
}
- // Set up the function pointers depending on the type of loop filtering selected
+ /* Set up the function pointers depending on the type of loop filtering selected */
if (lft == NORMAL_LOOPFILTER)
{
cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v);
@@ -212,14 +212,15 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
}
}
-// Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
-// each frame. Check last_frame_type to skip the function most of times.
+/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
+ * each frame. Check last_frame_type to skip the function most of times.
+ */
void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
{
int HEVThresh;
int i, j;
- // For each possible value for the loop filter fill out a "loop_filter_info" entry.
+ /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
{
int filt_lvl = i;
@@ -247,15 +248,15 @@ void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
for (j = 0; j < 16; j++)
{
- //lfi[i].lim[j] = block_inside_limit;
- //lfi[i].mbflim[j] = filt_lvl+yhedge_boost;
+ /*lfi[i].lim[j] = block_inside_limit;
+ lfi[i].mbflim[j] = filt_lvl+yhedge_boost;*/
lfi[i].mbthr[j] = HEVThresh;
- //lfi[i].flim[j] = filt_lvl;
+ /*lfi[i].flim[j] = filt_lvl;*/
lfi[i].thr[j] = HEVThresh;
- //lfi[i].uvlim[j] = block_inside_limit;
- //lfi[i].uvmbflim[j] = filt_lvl+uvhedge_boost;
+ /*lfi[i].uvlim[j] = block_inside_limit;
+ lfi[i].uvmbflim[j] = filt_lvl+uvhedge_boost;*/
lfi[i].uvmbthr[j] = HEVThresh;
- //lfi[i].uvflim[j] = filt_lvl;
+ /*lfi[i].uvflim[j] = filt_lvl;*/
lfi[i].uvthr[j] = HEVThresh;
}
}
@@ -268,32 +269,32 @@ void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level)
if (mbd->mode_ref_lf_delta_enabled)
{
- // Aplly delta for reference frame
+ /* Apply delta for reference frame */
*filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
- // Apply delta for mode
+ /* Apply delta for mode */
if (mbmi->ref_frame == INTRA_FRAME)
{
- // Only the split mode BPRED has a further special case
+ /* Only the split mode BPRED has a further special case */
if (mbmi->mode == B_PRED)
*filter_level += mbd->mode_lf_deltas[0];
}
else
{
- // Zero motion mode
+ /* Zero motion mode */
if (mbmi->mode == ZEROMV)
*filter_level += mbd->mode_lf_deltas[1];
- // Split MB motion mode
+ /* Split MB motion mode */
else if (mbmi->mode == SPLITMV)
*filter_level += mbd->mode_lf_deltas[3];
- // All other inter motion modes (Nearest, Near, New)
+ /* All other inter motion modes (Nearest, Near, New) */
else
*filter_level += mbd->mode_lf_deltas[2];
}
- // Range check
+ /* Range check */
if (*filter_level > MAX_LOOP_FILTER)
*filter_level = MAX_LOOP_FILTER;
else if (*filter_level < 0)
@@ -311,7 +312,7 @@ void vp8_loop_filter_frame
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
loop_filter_info *lfi = cm->lf_info;
- int frame_type = cm->frame_type;
+ FRAME_TYPE frame_type = cm->frame_type;
int mb_row;
int mb_col;
@@ -324,21 +325,21 @@ void vp8_loop_filter_frame
int i;
unsigned char *y_ptr, *u_ptr, *v_ptr;
- mbd->mode_info_context = cm->mi; // Point at base of Mb MODE_INFO list
+ mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
- // Note the baseline filter values for each segment
+ /* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
- // Abs value
+ /* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
- // Delta Value
+ /* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
- baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range
+ baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
@@ -348,18 +349,18 @@ void vp8_loop_filter_frame
baseline_filter_level[i] = default_filt_lvl;
}
- // Initialize the loop filter for this frame.
+ /* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
- // Set up the buffer pointers
+ /* Set up the buffer pointers */
y_ptr = post->y_buffer;
u_ptr = post->u_buffer;
v_ptr = post->v_buffer;
- // vp8_filter each macro block
+ /* vp8_filter each macro block */
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
@@ -368,9 +369,10 @@ void vp8_loop_filter_frame
filter_level = baseline_filter_level[Segment];
- // Distance of Mb to the various image edges.
- // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
- // Apply any context driven MB level adjustment
+ /* Distance of Mb to the various image edges.
+ * These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ * Apply any context driven MB level adjustment
+ */
vp8_adjust_mb_lf_value(mbd, &filter_level);
if (filter_level)
@@ -381,7 +383,7 @@ void vp8_loop_filter_frame
if (mbd->mode_info_context->mbmi.dc_diff > 0)
cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
- // don't apply across umv border
+ /* don't apply across umv border */
if (mb_row > 0)
cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
@@ -393,14 +395,14 @@ void vp8_loop_filter_frame
u_ptr += 8;
v_ptr += 8;
- mbd->mode_info_context++; // step to next MB
+ mbd->mode_info_context++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
u_ptr += post->uv_stride * 8 - post->uv_width;
v_ptr += post->uv_stride * 8 - post->uv_width;
- mbd->mode_info_context++; // Skip border mb
+ mbd->mode_info_context++; /* Skip border mb */
}
}
@@ -424,26 +426,26 @@ void vp8_loop_filter_frame_yonly
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
- int frame_type = cm->frame_type;
+ FRAME_TYPE frame_type = cm->frame_type;
(void) sharpness_lvl;
- //MODE_INFO * this_mb_mode_info = cm->mi; // Point at base of Mb MODE_INFO list
- mbd->mode_info_context = cm->mi; // Point at base of Mb MODE_INFO list
+ /*MODE_INFO * this_mb_mode_info = cm->mi;*/ /* Point at base of Mb MODE_INFO list */
+ mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
- // Note the baseline filter values for each segment
+ /* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
- // Abs value
+ /* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
- // Delta Value
+ /* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
- baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range
+ baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
@@ -453,16 +455,16 @@ void vp8_loop_filter_frame_yonly
baseline_filter_level[i] = default_filt_lvl;
}
- // Initialize the loop filter for this frame.
+ /* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
- // Set up the buffer pointers
+ /* Set up the buffer pointers */
y_ptr = post->y_buffer;
- // vp8_filter each macro block
+ /* vp8_filter each macro block */
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
@@ -470,7 +472,7 @@ void vp8_loop_filter_frame_yonly
int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
filter_level = baseline_filter_level[Segment];
- // Apply any context driven MB level adjustment
+ /* Apply any context driven MB level adjustment */
vp8_adjust_mb_lf_value(mbd, &filter_level);
if (filter_level)
@@ -481,7 +483,7 @@ void vp8_loop_filter_frame_yonly
if (mbd->mode_info_context->mbmi.dc_diff > 0)
cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
- // don't apply across umv border
+ /* don't apply across umv border */
if (mb_row > 0)
cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
@@ -490,12 +492,12 @@ void vp8_loop_filter_frame_yonly
}
y_ptr += 16;
- mbd->mode_info_context ++; // step to next MB
+ mbd->mode_info_context ++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
- mbd->mode_info_context ++; // Skip border mb
+ mbd->mode_info_context ++; /* Skip border mb */
}
}
@@ -516,7 +518,7 @@ void vp8_loop_filter_partial_frame
unsigned char *y_ptr;
int mb_row;
int mb_col;
- //int mb_rows = post->y_height >> 4;
+ /*int mb_rows = post->y_height >> 4;*/
int mb_cols = post->y_width >> 4;
int linestocopy;
@@ -525,12 +527,12 @@ void vp8_loop_filter_partial_frame
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
- int frame_type = cm->frame_type;
+ FRAME_TYPE frame_type = cm->frame_type;
(void) sharpness_lvl;
- //MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1); // Point at base of Mb MODE_INFO list
- mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); // Point at base of Mb MODE_INFO list
+ /*MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1);*/ /* Point at base of Mb MODE_INFO list */
+ mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); /* Point at base of Mb MODE_INFO list */
linestocopy = (post->y_height >> (4 + Fraction));
@@ -539,19 +541,19 @@ void vp8_loop_filter_partial_frame
linestocopy <<= 4;
- // Note the baseline filter values for each segment
+ /* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
- // Abs value
+ /* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
- // Delta Value
+ /* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
- baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range
+ baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
@@ -561,16 +563,16 @@ void vp8_loop_filter_partial_frame
baseline_filter_level[i] = default_filt_lvl;
}
- // Initialize the loop filter for this frame.
+ /* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
- // Set up the buffer pointers
+ /* Set up the buffer pointers */
y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
- // vp8_filter each macro block
+ /* vp8_filter each macro block */
for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++)
{
for (mb_col = 0; mb_col < mb_cols; mb_col++)
@@ -593,10 +595,10 @@ void vp8_loop_filter_partial_frame
}
y_ptr += 16;
- mbd->mode_info_context += 1; // step to next MB
+ mbd->mode_info_context += 1; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
- mbd->mode_info_context += 1; // Skip border mb
+ mbd->mode_info_context += 1; /* Skip border mb */
}
}
diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h
index a2049bf61..e45683460 100644
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -22,10 +22,10 @@ typedef enum
SIMPLE_LOOPFILTER = 1
} LOOPFILTERTYPE;
-// FRK
-// Need to align this structure so when it is declared and
-// passed it can be loaded into vector registers.
-// FRK
+/* FRK
+ * Need to align this structure so when it is declared and
+ * passed it can be loaded into vector registers.
+ */
typedef struct
{
DECLARE_ALIGNED(16, signed char, lim[16]);
@@ -119,8 +119,8 @@ typedef struct
typedef void loop_filter_uvfunction
(
- unsigned char *u, // source pointer
- int p, // pitch
+ unsigned char *u, /* source pointer */
+ int p, /* pitch */
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c
index ea82e2a07..694052924 100644
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -13,9 +13,6 @@
#include "loopfilter.h"
#include "onyxc_int.h"
-
-#define NEW_LOOPFILTER_MASK
-
typedef unsigned char uc;
static __inline signed char vp8_signed_char_clamp(int t)
@@ -26,7 +23,7 @@ static __inline signed char vp8_signed_char_clamp(int t)
}
-// should we apply any filter at all ( 11111111 yes, 00000000 no)
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_filter_mask(signed char limit, signed char flimit,
uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)
{
@@ -37,16 +34,12 @@ static __inline signed char vp8_filter_mask(signed char limit, signed char flimi
mask |= (abs(q1 - q0) > limit) * -1;
mask |= (abs(q2 - q1) > limit) * -1;
mask |= (abs(q3 - q2) > limit) * -1;
-#ifndef NEW_LOOPFILTER_MASK
- mask |= (abs(p0 - q0) > flimit) * -1;
-#else
mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1;
-#endif
mask = ~mask;
return mask;
}
-// is there high variance internal edge ( 11111111 yes, 00000000 no)
+/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
{
signed char hev = 0;
@@ -68,17 +61,18 @@ static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *
qs0 = (signed char) * oq0 ^ 0x80;
qs1 = (signed char) * oq1 ^ 0x80;
- // add outer taps if we have high edge variance
+ /* add outer taps if we have high edge variance */
vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
vp8_filter &= hev;
- // inner taps
+ /* inner taps */
vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
vp8_filter &= mask;
- // save bottom 3 bits so that we round one side +4 and the other +3
- // if it equals 4 we'll set to adjust by -1 to account for the fact
- // we'd round 3 the other way
+ /* save bottom 3 bits so that we round one side +4 and the other +3
+ * if it equals 4 we'll set to adjust by -1 to account for the fact
+ * we'd round 3 the other way
+ */
Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
Filter1 >>= 3;
@@ -89,7 +83,7 @@ static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *
*op0 = u ^ 0x80;
vp8_filter = Filter1;
- // outer tap adjustments
+ /* outer tap adjustments */
vp8_filter += 1;
vp8_filter >>= 1;
vp8_filter &= ~hev;
@@ -103,19 +97,20 @@ static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *
void vp8_loop_filter_horizontal_edge_c
(
unsigned char *s,
- int p, //pitch
+ int p, /* pitch */
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count
)
{
- int hev = 0; // high edge variance
+ int hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
do
{
mask = vp8_filter_mask(limit[i], flimit[i],
@@ -141,12 +136,13 @@ void vp8_loop_filter_vertical_edge_c
int count
)
{
- int hev = 0; // high edge variance
+ int hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
do
{
mask = vp8_filter_mask(limit[i], flimit[i],
@@ -173,7 +169,7 @@ static __inline void vp8_mbfilter(signed char mask, signed char hev,
signed char qs1 = (signed char) * oq1 ^ 0x80;
signed char qs2 = (signed char) * oq2 ^ 0x80;
- // add outer taps if we have high edge variance
+ /* add outer taps if we have high edge variance */
vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
vp8_filter &= mask;
@@ -181,7 +177,7 @@ static __inline void vp8_mbfilter(signed char mask, signed char hev,
Filter2 = vp8_filter;
Filter2 &= hev;
- // save bottom 3 bits so that we round one side +4 and the other +3
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
Filter1 = vp8_signed_char_clamp(Filter2 + 4);
Filter2 = vp8_signed_char_clamp(Filter2 + 3);
Filter1 >>= 3;
@@ -190,25 +186,25 @@ static __inline void vp8_mbfilter(signed char mask, signed char hev,
ps0 = vp8_signed_char_clamp(ps0 + Filter2);
- // only apply wider filter if not high edge variance
+ /* only apply wider filter if not high edge variance */
vp8_filter &= ~hev;
Filter2 = vp8_filter;
- // roughly 3/7th difference across boundary
+ /* roughly 3/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
s = vp8_signed_char_clamp(qs0 - u);
*oq0 = s ^ 0x80;
s = vp8_signed_char_clamp(ps0 + u);
*op0 = s ^ 0x80;
- // roughly 2/7th difference across boundary
+ /* roughly 2/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
s = vp8_signed_char_clamp(qs1 - u);
*oq1 = s ^ 0x80;
s = vp8_signed_char_clamp(ps1 + u);
*op1 = s ^ 0x80;
- // roughly 1/7th difference across boundary
+ /* roughly 1/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
s = vp8_signed_char_clamp(qs2 - u);
*oq2 = s ^ 0x80;
@@ -226,12 +222,13 @@ void vp8_mbloop_filter_horizontal_edge_c
int count
)
{
- signed char hev = 0; // high edge variance
+ signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
do
{
@@ -260,7 +257,7 @@ void vp8_mbloop_filter_vertical_edge_c
int count
)
{
- signed char hev = 0; // high edge variance
+ signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
@@ -280,17 +277,14 @@ void vp8_mbloop_filter_vertical_edge_c
}
-// should we apply any filter at all ( 11111111 yes, 00000000 no)
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1)
{
-// Why does this cause problems for win32?
-// error C2143: syntax error : missing ';' before 'type'
-// (void) limit;
-#ifndef NEW_LOOPFILTER_MASK
- signed char mask = (abs(p0 - q0) <= flimit) * -1;
-#else
+/* Why does this cause problems for win32?
+ * error C2143: syntax error : missing ';' before 'type'
+ * (void) limit;
+ */
signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1;
-#endif
return mask;
}
@@ -307,7 +301,7 @@ static __inline void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *o
vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0));
vp8_filter &= mask;
- // save bottom 3 bits so that we round one side +4 and the other +3
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
Filter1 >>= 3;
u = vp8_signed_char_clamp(q0 - Filter1);
@@ -335,7 +329,7 @@ void vp8_loop_filter_simple_horizontal_edge_c
do
{
- //mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);
+ /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);*/
mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s;
@@ -359,7 +353,7 @@ void vp8_loop_filter_simple_vertical_edge_c
do
{
- //mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);
+ /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);*/
mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], s[1]);
vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
s += p;
diff --git a/vp8/common/mbpitch.c b/vp8/common/mbpitch.c
index ce40d16ca..af55e2fe0 100644
--- a/vp8/common/mbpitch.c
+++ b/vp8/common/mbpitch.c
@@ -14,7 +14,7 @@
typedef enum
{
PRED = 0,
- DEST = 1,
+ DEST = 1
} BLOCKSET;
void vp8_setup_block
@@ -62,13 +62,13 @@ void vp8_setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
v = &x->pre.v_buffer;
}
- for (block = 0; block < 16; block++) // y blocks
+ for (block = 0; block < 16; block++) /* y blocks */
{
vp8_setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
(block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs);
}
- for (block = 16; block < 20; block++) // U and V blocks
+ for (block = 16; block < 20; block++) /* U and V blocks */
{
vp8_setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
@@ -123,7 +123,7 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
void vp8_build_block_doffsets(MACROBLOCKD *x)
{
- // handle the destination pitch features
+ /* handle the destination pitch features */
vp8_setup_macroblock(x, DEST);
vp8_setup_macroblock(x, PRED);
}
diff --git a/vp8/common/modecont.c b/vp8/common/modecont.c
index 0fa299577..86a74bc0f 100644
--- a/vp8/common/modecont.c
+++ b/vp8/common/modecont.c
@@ -14,27 +14,27 @@
const int vp8_mode_contexts[6][4] =
{
{
- // 0
+ /* 0 */
7, 1, 1, 143,
},
{
- // 1
+ /* 1 */
14, 18, 14, 107,
},
{
- // 2
+ /* 2 */
135, 64, 57, 68,
},
{
- // 3
+ /* 3 */
60, 56, 128, 65,
},
{
- // 4
+ /* 4 */
159, 134, 128, 34,
},
{
- // 5
+ /* 5 */
234, 188, 128, 28,
},
};
diff --git a/vp8/common/modecontext.c b/vp8/common/modecontext.c
index 8e483b800..a31a561c8 100644
--- a/vp8/common/modecontext.c
+++ b/vp8/common/modecontext.c
@@ -14,133 +14,133 @@
const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =
{
{
- //Above Mode : 0
- { 43438, 2195, 470, 316, 615, 171, 217, 412, 124, 160, }, // left_mode 0
- { 5722, 2751, 296, 291, 81, 68, 80, 101, 100, 170, }, // left_mode 1
- { 1629, 201, 307, 25, 47, 16, 34, 72, 19, 28, }, // left_mode 2
- { 332, 266, 36, 500, 20, 65, 23, 14, 154, 106, }, // left_mode 3
- { 450, 97, 10, 24, 117, 10, 2, 12, 8, 71, }, // left_mode 4
- { 384, 49, 29, 44, 12, 162, 51, 5, 87, 42, }, // left_mode 5
- { 495, 53, 157, 27, 14, 57, 180, 17, 17, 34, }, // left_mode 6
- { 695, 64, 62, 9, 27, 5, 3, 147, 10, 26, }, // left_mode 7
- { 230, 54, 20, 124, 16, 125, 29, 12, 283, 37, }, // left_mode 8
- { 260, 87, 21, 120, 32, 16, 33, 16, 33, 203, }, // left_mode 9
+ /*Above Mode : 0*/
+ { 43438, 2195, 470, 316, 615, 171, 217, 412, 124, 160, }, /* left_mode 0 */
+ { 5722, 2751, 296, 291, 81, 68, 80, 101, 100, 170, }, /* left_mode 1 */
+ { 1629, 201, 307, 25, 47, 16, 34, 72, 19, 28, }, /* left_mode 2 */
+ { 332, 266, 36, 500, 20, 65, 23, 14, 154, 106, }, /* left_mode 3 */
+ { 450, 97, 10, 24, 117, 10, 2, 12, 8, 71, }, /* left_mode 4 */
+ { 384, 49, 29, 44, 12, 162, 51, 5, 87, 42, }, /* left_mode 5 */
+ { 495, 53, 157, 27, 14, 57, 180, 17, 17, 34, }, /* left_mode 6 */
+ { 695, 64, 62, 9, 27, 5, 3, 147, 10, 26, }, /* left_mode 7 */
+ { 230, 54, 20, 124, 16, 125, 29, 12, 283, 37, }, /* left_mode 8 */
+ { 260, 87, 21, 120, 32, 16, 33, 16, 33, 203, }, /* left_mode 9 */
},
{
- //Above Mode : 1
- { 3934, 2573, 355, 137, 128, 87, 133, 117, 37, 27, }, // left_mode 0
- { 1036, 1929, 278, 135, 27, 37, 48, 55, 41, 91, }, // left_mode 1
- { 223, 256, 253, 15, 13, 9, 28, 64, 3, 3, }, // left_mode 2
- { 120, 129, 17, 316, 15, 11, 9, 4, 53, 74, }, // left_mode 3
- { 129, 58, 6, 11, 38, 2, 0, 5, 2, 67, }, // left_mode 4
- { 53, 22, 11, 16, 8, 26, 14, 3, 19, 12, }, // left_mode 5
- { 59, 26, 61, 11, 4, 9, 35, 13, 8, 8, }, // left_mode 6
- { 101, 52, 40, 8, 5, 2, 8, 59, 2, 20, }, // left_mode 7
- { 48, 34, 10, 52, 8, 15, 6, 6, 63, 20, }, // left_mode 8
- { 96, 48, 22, 63, 11, 14, 5, 8, 9, 96, }, // left_mode 9
+ /*Above Mode : 1*/
+ { 3934, 2573, 355, 137, 128, 87, 133, 117, 37, 27, }, /* left_mode 0 */
+ { 1036, 1929, 278, 135, 27, 37, 48, 55, 41, 91, }, /* left_mode 1 */
+ { 223, 256, 253, 15, 13, 9, 28, 64, 3, 3, }, /* left_mode 2 */
+ { 120, 129, 17, 316, 15, 11, 9, 4, 53, 74, }, /* left_mode 3 */
+ { 129, 58, 6, 11, 38, 2, 0, 5, 2, 67, }, /* left_mode 4 */
+ { 53, 22, 11, 16, 8, 26, 14, 3, 19, 12, }, /* left_mode 5 */
+ { 59, 26, 61, 11, 4, 9, 35, 13, 8, 8, }, /* left_mode 6 */
+ { 101, 52, 40, 8, 5, 2, 8, 59, 2, 20, }, /* left_mode 7 */
+ { 48, 34, 10, 52, 8, 15, 6, 6, 63, 20, }, /* left_mode 8 */
+ { 96, 48, 22, 63, 11, 14, 5, 8, 9, 96, }, /* left_mode 9 */
},
{
- //Above Mode : 2
- { 709, 461, 506, 36, 27, 33, 151, 98, 24, 6, }, // left_mode 0
- { 201, 375, 442, 27, 13, 8, 46, 58, 6, 19, }, // left_mode 1
- { 122, 140, 417, 4, 13, 3, 33, 59, 4, 2, }, // left_mode 2
- { 36, 17, 22, 16, 6, 8, 12, 17, 9, 21, }, // left_mode 3
- { 51, 15, 7, 1, 14, 0, 4, 5, 3, 22, }, // left_mode 4
- { 18, 11, 30, 9, 7, 20, 11, 5, 2, 6, }, // left_mode 5
- { 38, 21, 103, 9, 4, 12, 79, 13, 2, 5, }, // left_mode 6
- { 64, 17, 66, 2, 12, 4, 2, 65, 4, 5, }, // left_mode 7
- { 14, 7, 7, 16, 3, 11, 4, 13, 15, 16, }, // left_mode 8
- { 36, 8, 32, 9, 9, 4, 14, 7, 6, 24, }, // left_mode 9
+ /*Above Mode : 2*/
+ { 709, 461, 506, 36, 27, 33, 151, 98, 24, 6, }, /* left_mode 0 */
+ { 201, 375, 442, 27, 13, 8, 46, 58, 6, 19, }, /* left_mode 1 */
+ { 122, 140, 417, 4, 13, 3, 33, 59, 4, 2, }, /* left_mode 2 */
+ { 36, 17, 22, 16, 6, 8, 12, 17, 9, 21, }, /* left_mode 3 */
+ { 51, 15, 7, 1, 14, 0, 4, 5, 3, 22, }, /* left_mode 4 */
+ { 18, 11, 30, 9, 7, 20, 11, 5, 2, 6, }, /* left_mode 5 */
+ { 38, 21, 103, 9, 4, 12, 79, 13, 2, 5, }, /* left_mode 6 */
+ { 64, 17, 66, 2, 12, 4, 2, 65, 4, 5, }, /* left_mode 7 */
+ { 14, 7, 7, 16, 3, 11, 4, 13, 15, 16, }, /* left_mode 8 */
+ { 36, 8, 32, 9, 9, 4, 14, 7, 6, 24, }, /* left_mode 9 */
},
{
- //Above Mode : 3
- { 1340, 173, 36, 119, 30, 10, 13, 10, 20, 26, }, // left_mode 0
- { 156, 293, 26, 108, 5, 16, 2, 4, 23, 30, }, // left_mode 1
- { 60, 34, 13, 7, 3, 3, 0, 8, 4, 5, }, // left_mode 2
- { 72, 64, 1, 235, 3, 9, 2, 7, 28, 38, }, // left_mode 3
- { 29, 14, 1, 3, 5, 0, 2, 2, 5, 13, }, // left_mode 4
- { 22, 7, 4, 11, 2, 5, 1, 2, 6, 4, }, // left_mode 5
- { 18, 14, 5, 6, 4, 3, 14, 0, 9, 2, }, // left_mode 6
- { 41, 10, 7, 1, 2, 0, 0, 10, 2, 1, }, // left_mode 7
- { 23, 19, 2, 33, 1, 5, 2, 0, 51, 8, }, // left_mode 8
- { 33, 26, 7, 53, 3, 9, 3, 3, 9, 19, }, // left_mode 9
+ /*Above Mode : 3*/
+ { 1340, 173, 36, 119, 30, 10, 13, 10, 20, 26, }, /* left_mode 0 */
+ { 156, 293, 26, 108, 5, 16, 2, 4, 23, 30, }, /* left_mode 1 */
+ { 60, 34, 13, 7, 3, 3, 0, 8, 4, 5, }, /* left_mode 2 */
+ { 72, 64, 1, 235, 3, 9, 2, 7, 28, 38, }, /* left_mode 3 */
+ { 29, 14, 1, 3, 5, 0, 2, 2, 5, 13, }, /* left_mode 4 */
+ { 22, 7, 4, 11, 2, 5, 1, 2, 6, 4, }, /* left_mode 5 */
+ { 18, 14, 5, 6, 4, 3, 14, 0, 9, 2, }, /* left_mode 6 */
+ { 41, 10, 7, 1, 2, 0, 0, 10, 2, 1, }, /* left_mode 7 */
+ { 23, 19, 2, 33, 1, 5, 2, 0, 51, 8, }, /* left_mode 8 */
+ { 33, 26, 7, 53, 3, 9, 3, 3, 9, 19, }, /* left_mode 9 */
},
{
- //Above Mode : 4
- { 410, 165, 43, 31, 66, 15, 30, 54, 8, 17, }, // left_mode 0
- { 115, 64, 27, 18, 30, 7, 11, 15, 4, 19, }, // left_mode 1
- { 31, 23, 25, 1, 7, 2, 2, 10, 0, 5, }, // left_mode 2
- { 17, 4, 1, 6, 8, 2, 7, 5, 5, 21, }, // left_mode 3
- { 120, 12, 1, 2, 83, 3, 0, 4, 1, 40, }, // left_mode 4
- { 4, 3, 1, 2, 1, 2, 5, 0, 3, 6, }, // left_mode 5
- { 10, 2, 13, 6, 6, 6, 8, 2, 4, 5, }, // left_mode 6
- { 58, 10, 5, 1, 28, 1, 1, 33, 1, 9, }, // left_mode 7
- { 8, 2, 1, 4, 2, 5, 1, 1, 2, 10, }, // left_mode 8
- { 76, 7, 5, 7, 18, 2, 2, 0, 5, 45, }, // left_mode 9
+ /*Above Mode : 4*/
+ { 410, 165, 43, 31, 66, 15, 30, 54, 8, 17, }, /* left_mode 0 */
+ { 115, 64, 27, 18, 30, 7, 11, 15, 4, 19, }, /* left_mode 1 */
+ { 31, 23, 25, 1, 7, 2, 2, 10, 0, 5, }, /* left_mode 2 */
+ { 17, 4, 1, 6, 8, 2, 7, 5, 5, 21, }, /* left_mode 3 */
+ { 120, 12, 1, 2, 83, 3, 0, 4, 1, 40, }, /* left_mode 4 */
+ { 4, 3, 1, 2, 1, 2, 5, 0, 3, 6, }, /* left_mode 5 */
+ { 10, 2, 13, 6, 6, 6, 8, 2, 4, 5, }, /* left_mode 6 */
+ { 58, 10, 5, 1, 28, 1, 1, 33, 1, 9, }, /* left_mode 7 */
+ { 8, 2, 1, 4, 2, 5, 1, 1, 2, 10, }, /* left_mode 8 */
+ { 76, 7, 5, 7, 18, 2, 2, 0, 5, 45, }, /* left_mode 9 */
},
{
- //Above Mode : 5
- { 444, 46, 47, 20, 14, 110, 60, 14, 60, 7, }, // left_mode 0
- { 59, 57, 25, 18, 3, 17, 21, 6, 14, 6, }, // left_mode 1
- { 24, 17, 20, 6, 4, 13, 7, 2, 3, 2, }, // left_mode 2
- { 13, 11, 5, 14, 4, 9, 2, 4, 15, 7, }, // left_mode 3
- { 8, 5, 2, 1, 4, 0, 1, 1, 2, 12, }, // left_mode 4
- { 19, 5, 5, 7, 4, 40, 6, 3, 10, 4, }, // left_mode 5
- { 16, 5, 9, 1, 1, 16, 26, 2, 10, 4, }, // left_mode 6
- { 11, 4, 8, 1, 1, 4, 4, 5, 4, 1, }, // left_mode 7
- { 15, 1, 3, 7, 3, 21, 7, 1, 34, 5, }, // left_mode 8
- { 18, 5, 1, 3, 4, 3, 7, 1, 2, 9, }, // left_mode 9
+ /*Above Mode : 5*/
+ { 444, 46, 47, 20, 14, 110, 60, 14, 60, 7, }, /* left_mode 0 */
+ { 59, 57, 25, 18, 3, 17, 21, 6, 14, 6, }, /* left_mode 1 */
+ { 24, 17, 20, 6, 4, 13, 7, 2, 3, 2, }, /* left_mode 2 */
+ { 13, 11, 5, 14, 4, 9, 2, 4, 15, 7, }, /* left_mode 3 */
+ { 8, 5, 2, 1, 4, 0, 1, 1, 2, 12, }, /* left_mode 4 */
+ { 19, 5, 5, 7, 4, 40, 6, 3, 10, 4, }, /* left_mode 5 */
+ { 16, 5, 9, 1, 1, 16, 26, 2, 10, 4, }, /* left_mode 6 */
+ { 11, 4, 8, 1, 1, 4, 4, 5, 4, 1, }, /* left_mode 7 */
+ { 15, 1, 3, 7, 3, 21, 7, 1, 34, 5, }, /* left_mode 8 */
+ { 18, 5, 1, 3, 4, 3, 7, 1, 2, 9, }, /* left_mode 9 */
},
{
- //Above Mode : 6
- { 476, 149, 94, 13, 14, 77, 291, 27, 23, 3, }, // left_mode 0
- { 79, 83, 42, 14, 2, 12, 63, 2, 4, 14, }, // left_mode 1
- { 43, 36, 55, 1, 3, 8, 42, 11, 5, 1, }, // left_mode 2
- { 9, 9, 6, 16, 1, 5, 6, 3, 11, 10, }, // left_mode 3
- { 10, 3, 1, 3, 10, 1, 0, 1, 1, 4, }, // left_mode 4
- { 14, 6, 15, 5, 1, 20, 25, 2, 5, 0, }, // left_mode 5
- { 28, 7, 51, 1, 0, 8, 127, 6, 2, 5, }, // left_mode 6
- { 13, 3, 3, 2, 3, 1, 2, 8, 1, 2, }, // left_mode 7
- { 10, 3, 3, 3, 3, 8, 2, 2, 9, 3, }, // left_mode 8
- { 13, 7, 11, 4, 0, 4, 6, 2, 5, 8, }, // left_mode 9
+ /*Above Mode : 6*/
+ { 476, 149, 94, 13, 14, 77, 291, 27, 23, 3, }, /* left_mode 0 */
+ { 79, 83, 42, 14, 2, 12, 63, 2, 4, 14, }, /* left_mode 1 */
+ { 43, 36, 55, 1, 3, 8, 42, 11, 5, 1, }, /* left_mode 2 */
+ { 9, 9, 6, 16, 1, 5, 6, 3, 11, 10, }, /* left_mode 3 */
+ { 10, 3, 1, 3, 10, 1, 0, 1, 1, 4, }, /* left_mode 4 */
+ { 14, 6, 15, 5, 1, 20, 25, 2, 5, 0, }, /* left_mode 5 */
+ { 28, 7, 51, 1, 0, 8, 127, 6, 2, 5, }, /* left_mode 6 */
+ { 13, 3, 3, 2, 3, 1, 2, 8, 1, 2, }, /* left_mode 7 */
+ { 10, 3, 3, 3, 3, 8, 2, 2, 9, 3, }, /* left_mode 8 */
+ { 13, 7, 11, 4, 0, 4, 6, 2, 5, 8, }, /* left_mode 9 */
},
{
- //Above Mode : 7
- { 376, 135, 119, 6, 32, 8, 31, 224, 9, 3, }, // left_mode 0
- { 93, 60, 54, 6, 13, 7, 8, 92, 2, 12, }, // left_mode 1
- { 74, 36, 84, 0, 3, 2, 9, 67, 2, 1, }, // left_mode 2
- { 19, 4, 4, 8, 8, 2, 4, 7, 6, 16, }, // left_mode 3
- { 51, 7, 4, 1, 77, 3, 0, 14, 1, 15, }, // left_mode 4
- { 7, 7, 5, 7, 4, 7, 4, 5, 0, 3, }, // left_mode 5
- { 18, 2, 19, 2, 2, 4, 12, 11, 1, 2, }, // left_mode 6
- { 129, 6, 27, 1, 21, 3, 0, 189, 0, 6, }, // left_mode 7
- { 9, 1, 2, 8, 3, 7, 0, 5, 3, 3, }, // left_mode 8
- { 20, 4, 5, 10, 4, 2, 7, 17, 3, 16, }, // left_mode 9
+ /*Above Mode : 7*/
+ { 376, 135, 119, 6, 32, 8, 31, 224, 9, 3, }, /* left_mode 0 */
+ { 93, 60, 54, 6, 13, 7, 8, 92, 2, 12, }, /* left_mode 1 */
+ { 74, 36, 84, 0, 3, 2, 9, 67, 2, 1, }, /* left_mode 2 */
+ { 19, 4, 4, 8, 8, 2, 4, 7, 6, 16, }, /* left_mode 3 */
+ { 51, 7, 4, 1, 77, 3, 0, 14, 1, 15, }, /* left_mode 4 */
+ { 7, 7, 5, 7, 4, 7, 4, 5, 0, 3, }, /* left_mode 5 */
+ { 18, 2, 19, 2, 2, 4, 12, 11, 1, 2, }, /* left_mode 6 */
+ { 129, 6, 27, 1, 21, 3, 0, 189, 0, 6, }, /* left_mode 7 */
+ { 9, 1, 2, 8, 3, 7, 0, 5, 3, 3, }, /* left_mode 8 */
+ { 20, 4, 5, 10, 4, 2, 7, 17, 3, 16, }, /* left_mode 9 */
},
{
- //Above Mode : 8
- { 617, 68, 34, 79, 11, 27, 25, 14, 75, 13, }, // left_mode 0
- { 51, 82, 21, 26, 6, 12, 13, 1, 26, 16, }, // left_mode 1
- { 29, 9, 12, 11, 3, 7, 1, 10, 2, 2, }, // left_mode 2
- { 17, 19, 11, 74, 4, 3, 2, 0, 58, 13, }, // left_mode 3
- { 10, 1, 1, 3, 4, 1, 0, 2, 1, 8, }, // left_mode 4
- { 14, 4, 5, 5, 1, 13, 2, 0, 27, 8, }, // left_mode 5
- { 10, 3, 5, 4, 1, 7, 6, 4, 5, 1, }, // left_mode 6
- { 10, 2, 6, 2, 1, 1, 1, 4, 2, 1, }, // left_mode 7
- { 14, 8, 5, 23, 2, 12, 6, 2, 117, 5, }, // left_mode 8
- { 9, 6, 2, 19, 1, 6, 3, 2, 9, 9, }, // left_mode 9
+ /*Above Mode : 8*/
+ { 617, 68, 34, 79, 11, 27, 25, 14, 75, 13, }, /* left_mode 0 */
+ { 51, 82, 21, 26, 6, 12, 13, 1, 26, 16, }, /* left_mode 1 */
+ { 29, 9, 12, 11, 3, 7, 1, 10, 2, 2, }, /* left_mode 2 */
+ { 17, 19, 11, 74, 4, 3, 2, 0, 58, 13, }, /* left_mode 3 */
+ { 10, 1, 1, 3, 4, 1, 0, 2, 1, 8, }, /* left_mode 4 */
+ { 14, 4, 5, 5, 1, 13, 2, 0, 27, 8, }, /* left_mode 5 */
+ { 10, 3, 5, 4, 1, 7, 6, 4, 5, 1, }, /* left_mode 6 */
+ { 10, 2, 6, 2, 1, 1, 1, 4, 2, 1, }, /* left_mode 7 */
+ { 14, 8, 5, 23, 2, 12, 6, 2, 117, 5, }, /* left_mode 8 */
+ { 9, 6, 2, 19, 1, 6, 3, 2, 9, 9, }, /* left_mode 9 */
},
{
- //Above Mode : 9
- { 680, 73, 22, 38, 42, 5, 11, 9, 6, 28, }, // left_mode 0
- { 113, 112, 21, 22, 10, 2, 8, 4, 6, 42, }, // left_mode 1
- { 44, 20, 24, 6, 5, 4, 3, 3, 1, 2, }, // left_mode 2
- { 40, 23, 7, 71, 5, 2, 4, 1, 7, 22, }, // left_mode 3
- { 85, 9, 4, 4, 17, 2, 0, 3, 2, 23, }, // left_mode 4
- { 13, 4, 2, 6, 1, 7, 0, 1, 7, 6, }, // left_mode 5
- { 26, 6, 8, 3, 2, 3, 8, 1, 5, 4, }, // left_mode 6
- { 54, 8, 9, 6, 7, 0, 1, 11, 1, 3, }, // left_mode 7
- { 9, 10, 4, 13, 2, 5, 4, 2, 14, 8, }, // left_mode 8
- { 92, 9, 5, 19, 15, 3, 3, 1, 6, 58, }, // left_mode 9
+ /*Above Mode : 9*/
+ { 680, 73, 22, 38, 42, 5, 11, 9, 6, 28, }, /* left_mode 0 */
+ { 113, 112, 21, 22, 10, 2, 8, 4, 6, 42, }, /* left_mode 1 */
+ { 44, 20, 24, 6, 5, 4, 3, 3, 1, 2, }, /* left_mode 2 */
+ { 40, 23, 7, 71, 5, 2, 4, 1, 7, 22, }, /* left_mode 3 */
+ { 85, 9, 4, 4, 17, 2, 0, 3, 2, 23, }, /* left_mode 4 */
+ { 13, 4, 2, 6, 1, 7, 0, 1, 7, 6, }, /* left_mode 5 */
+ { 26, 6, 8, 3, 2, 3, 8, 1, 5, 4, }, /* left_mode 6 */
+ { 54, 8, 9, 6, 7, 0, 1, 11, 1, 3, }, /* left_mode 7 */
+ { 9, 10, 4, 13, 2, 5, 4, 2, 14, 8, }, /* left_mode 8 */
+ { 92, 9, 5, 19, 15, 3, 3, 1, 6, 58, }, /* left_mode 9 */
},
};
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 132765d18..7e44c1f0c 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -21,9 +21,9 @@
#include "recon.h"
#include "postproc.h"
-//#ifdef PACKET_TESTING
+/*#ifdef PACKET_TESTING*/
#include "header.h"
-//#endif
+/*#endif*/
/* Create/destroy static data structures. */
@@ -43,7 +43,7 @@ typedef struct frame_contexts
vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1];
MV_CONTEXT mvc[2];
- MV_CONTEXT pre_mvc[2]; //not to caculate the mvcost for the frame if mvc doesn't change.
+ MV_CONTEXT pre_mvc[2]; /* no need to calculate the mvcost for the frame if mvc doesn't change. */
} FRAME_CONTEXT;
typedef enum
@@ -74,6 +74,7 @@ typedef struct VP8_COMMON_RTCD
vp8_subpix_rtcd_vtable_t subpix;
vp8_loopfilter_rtcd_vtable_t loopfilter;
vp8_postproc_rtcd_vtable_t postproc;
+ int flags;
#else
int unused;
#endif
@@ -83,9 +84,9 @@ typedef struct VP8Common
{
struct vpx_internal_error_info error;
- DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
int Width;
int Height;
@@ -104,7 +105,7 @@ typedef struct VP8Common
YV12_BUFFER_CONFIG post_proc_buffer;
YV12_BUFFER_CONFIG temp_scale_frame;
- FRAME_TYPE last_frame_type; //Add to check if vp8_frame_init_loop_filter() can be skipped.
+ FRAME_TYPE last_frame_type; /* Add to check if vp8_frame_init_loop_filter() can be skipped. */
FRAME_TYPE frame_type;
int show_frame;
@@ -115,7 +116,7 @@ typedef struct VP8Common
int mb_cols;
int mode_info_stride;
- // prfile settings
+ /* profile settings */
int mb_no_coeff_skip;
int no_lpf;
int simpler_lpf;
@@ -123,7 +124,7 @@ typedef struct VP8Common
int full_pixel;
int base_qindex;
- int last_kf_gf_q; // Q used on the last GF or KF
+ int last_kf_gf_q; /* Q used on the last GF or KF */
int y1dc_delta_q;
int y2dc_delta_q;
@@ -153,31 +154,31 @@ typedef struct VP8Common
int last_sharpness_level;
int sharpness_level;
- int refresh_last_frame; // Two state 0 = NO, 1 = YES
- int refresh_golden_frame; // Two state 0 = NO, 1 = YES
- int refresh_alt_ref_frame; // Two state 0 = NO, 1 = YES
+ int refresh_last_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */
- int copy_buffer_to_gf; // 0 none, 1 Last to GF, 2 ARF to GF
- int copy_buffer_to_arf; // 0 none, 1 Last to ARF, 2 GF to ARF
+ int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */
+ int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */
- int refresh_entropy_probs; // Two state 0 = NO, 1 = YES
+ int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */
- int ref_frame_sign_bias[MAX_REF_FRAMES]; // Two state 0, 1
+ int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
- // Y,U,V,Y2
- ENTROPY_CONTEXT_PLANES *above_context; // row of context for each plane
- ENTROPY_CONTEXT_PLANES left_context; // (up to) 4 contexts ""
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */
+ ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */
- // keyframe block modes are predicted by their above, left neighbors
+ /* keyframe block modes are predicted by their above, left neighbors */
vp8_prob kf_bmode_prob [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1];
vp8_prob kf_ymode_prob [VP8_YMODES-1]; /* keyframe "" */
vp8_prob kf_uv_mode_prob [VP8_UV_MODES-1];
- FRAME_CONTEXT lfc; // last frame entropy
- FRAME_CONTEXT fc; // this frame entropy
+ FRAME_CONTEXT lfc; /* last frame entropy */
+ FRAME_CONTEXT fc; /* this frame entropy */
unsigned int current_video_frame;
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 0c8cf13bf..e797e1036 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -19,7 +19,53 @@
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
-// global constants
+
+#define RGB_TO_YUV(t) \
+ ( (0.257*(float)(t>>16)) + (0.504*(float)(t>>8&0xff)) + (0.098*(float)(t&0xff)) + 16), \
+ (-(0.148*(float)(t>>16)) - (0.291*(float)(t>>8&0xff)) + (0.439*(float)(t&0xff)) + 128), \
+ ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
+
+/* global constants */
+
+static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
+{
+ { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
+ { RGB_TO_YUV(0x00FF00) }, /* Green */
+ { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */
+ { RGB_TO_YUV(0x228B22) }, /* ForestGreen */
+ { RGB_TO_YUV(0x006400) }, /* DarkGreen */
+ { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */
+ { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */
+ { RGB_TO_YUV(0x00008B) }, /* Dark blue */
+ { RGB_TO_YUV(0x551A8B) }, /* Purple */
+ { RGB_TO_YUV(0xFF0000) } /* Red */
+};
+
+static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] =
+{
+ { RGB_TO_YUV(0x6633ff) }, /* Purple */
+ { RGB_TO_YUV(0xcc33ff) }, /* Magenta */
+ { RGB_TO_YUV(0xff33cc) }, /* Pink */
+ { RGB_TO_YUV(0xff3366) }, /* Coral */
+ { RGB_TO_YUV(0x3366ff) }, /* Blue */
+ { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */
+ { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */
+ { RGB_TO_YUV(0xff6633) }, /* Orange */
+ { RGB_TO_YUV(0x33ccff) }, /* Light Blue */
+ { RGB_TO_YUV(0x8ab800) }, /* Green */
+ { RGB_TO_YUV(0xffcc33) }, /* Light Orange */
+ { RGB_TO_YUV(0x33ffcc) }, /* Aqua */
+ { RGB_TO_YUV(0x66ff33) }, /* Light Green */
+ { RGB_TO_YUV(0xccff33) }, /* Yellow */
+};
+
+static const unsigned char MV_REFERENCE_FRAME_colors[MB_MODE_COUNT][3] =
+{
+ { RGB_TO_YUV(0x00ff00) }, /* Green */
+ { RGB_TO_YUV(0x0000ff) }, /* Blue */
+ { RGB_TO_YUV(0xffff00) }, /* Yellow */
+ { RGB_TO_YUV(0xff0000) }, /* Red */
+};
static const short kernel5[] =
{
@@ -76,7 +122,7 @@ const short vp8_rv[] =
extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
-
+extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
/***********************************************************************************************************
*/
void vp8_post_proc_down_and_across_c
@@ -101,7 +147,7 @@ void vp8_post_proc_down_and_across_c
for (row = 0; row < rows; row++)
{
- // post_proc_down for one row
+ /* post_proc_down for one row */
p_src = src_ptr;
p_dst = dst_ptr;
@@ -124,7 +170,7 @@ void vp8_post_proc_down_and_across_c
p_dst[col] = v;
}
- // now post_proc_across
+ /* now post_proc_across */
p_src = dst_ptr;
p_dst = dst_ptr;
@@ -153,12 +199,12 @@ void vp8_post_proc_down_and_across_c
p_dst[col-2] = d[(col-2)&7];
}
- //handle the last two pixels
+ /* handle the last two pixels */
p_dst[col-2] = d[(col-2)&7];
p_dst[col-1] = d[(col-1)&7];
- //next row
+ /* next row */
src_ptr += pitch;
dst_ptr += pitch;
}
@@ -351,9 +397,9 @@ static void fillrd(struct postproc_state *state, int q, int a)
sigma = ai + .5 + .6 * (63 - qi) / 63.0;
- // set up a lookup table of 256 entries that matches
- // a gaussian distribution with sigma determined by q.
- //
+ /* set up a lookup table of 256 entries that matches
+ * a gaussian distribution with sigma determined by q.
+ */
{
double i;
int next, j;
@@ -444,6 +490,187 @@ void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
}
}
+/* Blend the macro block with a solid colored square. Leave the
+ * edges unblended to give distinction to macro blocks in areas
+ * filled with the same color block.
+ */
+void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride)
+{
+ int i, j;
+ int y1_const = y1*((1<<16)-alpha);
+ int u1_const = u1*((1<<16)-alpha);
+ int v1_const = v1*((1<<16)-alpha);
+
+ y += 2*stride + 2;
+ for (i = 0; i < 12; i++)
+ {
+ for (j = 0; j < 12; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ u += stride + 1;
+ v += stride + 1;
+
+ for (i = 0; i < 6; i++)
+ {
+ for (j = 0; j < 6; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+ u += stride;
+ v += stride;
+ }
+}
+
+/* Blend only the edge of the macro block. Leave center
+ * unblended to allow for other visualizations to be layered.
+ */
+void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride)
+{
+ int i, j;
+ int y1_const = y1*((1<<16)-alpha);
+ int u1_const = u1*((1<<16)-alpha);
+ int v1_const = v1*((1<<16)-alpha);
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 16; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ for (i = 0; i < 12; i++)
+ {
+ y[0] = (y[0]*alpha + y1_const)>>16;
+ y[1] = (y[1]*alpha + y1_const)>>16;
+ y[14] = (y[14]*alpha + y1_const)>>16;
+ y[15] = (y[15]*alpha + y1_const)>>16;
+ y += stride;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 16; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ for (j = 0; j < 8; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+ u += stride;
+ v += stride;
+
+ for (i = 0; i < 6; i++)
+ {
+ u[0] = (u[0]*alpha + u1_const)>>16;
+ v[0] = (v[0]*alpha + v1_const)>>16;
+
+ u[7] = (u[7]*alpha + u1_const)>>16;
+ v[7] = (v[7]*alpha + v1_const)>>16;
+
+ u += stride;
+ v += stride;
+ }
+
+ for (j = 0; j < 8; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+}
+
+void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride)
+{
+ int i, j;
+ int y1_const = y1*((1<<16)-alpha);
+ int u1_const = u1*((1<<16)-alpha);
+ int v1_const = v1*((1<<16)-alpha);
+
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+ u += stride;
+ v += stride;
+ }
+}
+
+static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int height)
+{
+ int dx;
+ int dy;
+
+ if (*x1 > width)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *x1 = width;
+ if (dx)
+ *y1 = ((width-x0)*dy)/dx + y0;
+ }
+ if (*x1 < 0)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *x1 = 0;
+ if (dx)
+ *y1 = ((0-x0)*dy)/dx + y0;
+ }
+ if (*y1 > height)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *y1 = height;
+ if (dy)
+ *x1 = ((height-y0)*dx)/dy + x0;
+ }
+ if (*y1 < 0)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *y1 = 0;
+ if (dy)
+ *x1 = ((0-y0)*dx)/dy + x0;
+ }
+}
+
+
#if CONFIG_RUNTIME_CPU_DETECT
#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
#else
@@ -465,7 +692,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
{
*dest = *oci->frame_to_show;
- // handle problem with extending borders
+ /* handle problem with extending borders */
dest->y_width = oci->Width;
dest->y_height = oci->Height;
dest->uv_height = dest->y_height / 2;
@@ -521,7 +748,8 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
oci->mb_cols, oci->mb_rows);
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
}
- else if (flags & VP8D_DEBUG_LEVEL2)
+
+ if (flags & VP8D_DEBUG_LEVEL2)
{
int i, j;
unsigned char *y_ptr;
@@ -533,7 +761,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
- // vp8_filter each macro block
+ /* vp8_filter each macro block */
for (i = 0; i < mb_rows; i++)
{
for (j = 0; j < mb_cols; j++)
@@ -547,12 +775,13 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr += 16;
}
- mb_index ++; //border
+ mb_index ++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
}
}
- else if (flags & VP8D_DEBUG_LEVEL3)
+
+ if (flags & VP8D_DEBUG_LEVEL3)
{
int i, j;
unsigned char *y_ptr;
@@ -564,7 +793,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
- // vp8_filter each macro block
+ /* vp8_filter each macro block */
for (i = 0; i < mb_rows; i++)
{
for (j = 0; j < mb_cols; j++)
@@ -581,12 +810,13 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr += 16;
}
- mb_index ++; //border
+ mb_index ++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
}
}
- else if (flags & VP8D_DEBUG_LEVEL4)
+
+ if (flags & VP8D_DEBUG_LEVEL4)
{
sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
@@ -601,7 +831,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
- // vp8_filter each macro block
+ /* vp8_filter each macro block */
for (i = 0; i < mb_rows; i++)
{
for (j = 0; j < mb_cols; j++)
@@ -614,7 +844,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr += 16;
}
- mb_index ++; //border
+ mb_index ++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
}
@@ -623,11 +853,261 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
}
+ /* Draw motion vectors */
+ if (flags & VP8D_DEBUG_DRAW_MV)
+ {
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ int mb_cols = width >> 4;
+ unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+ int x0, y0;
+
+ for (y0 = 0; y0 < height; y0 += 16)
+ {
+ for (x0 = 0; x0 < width; x0 += 16)
+ {
+ int x1, y1;
+
+ if (mi->mbmi.mode == SPLITMV)
+ {
+ switch (mi->mbmi.partitioning)
+ {
+ case 0 : /* mv_top_bottom */
+ {
+ B_MODE_INFO *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 8 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line (x0+8, &x1, y0+4, &y1, width, height);
+ vp8_blit_line (x0+8, x1, y0+4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[8];
+
+ x1 = x0 + 8 + (mv->col >> 3);
+ y1 = y0 +12 + (mv->row >> 3);
+
+ constrain_line (x0+8, &x1, y0+12, &y1, width, height);
+ vp8_blit_line (x0+8, x1, y0+12, y1, y_buffer, y_stride);
+
+ break;
+ }
+ case 1 : /* mv_left_right */
+ {
+ B_MODE_INFO *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 + 8 + (mv->row >> 3);
+
+ constrain_line (x0+4, &x1, y0+8, &y1, width, height);
+ vp8_blit_line (x0+4, x1, y0+8, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[2];
+
+ x1 = x0 +12 + (mv->col >> 3);
+ y1 = y0 + 8 + (mv->row >> 3);
+
+ constrain_line (x0+12, &x1, y0+8, &y1, width, height);
+ vp8_blit_line (x0+12, x1, y0+8, y1, y_buffer, y_stride);
+
+ break;
+ }
+ case 2 : /* mv_quarters */
+ {
+ B_MODE_INFO *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line (x0+4, &x1, y0+4, &y1, width, height);
+ vp8_blit_line (x0+4, x1, y0+4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[2];
+
+ x1 = x0 +12 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line (x0+12, &x1, y0+4, &y1, width, height);
+ vp8_blit_line (x0+12, x1, y0+4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[8];
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 +12 + (mv->row >> 3);
+
+ constrain_line (x0+4, &x1, y0+12, &y1, width, height);
+ vp8_blit_line (x0+4, x1, y0+12, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[10];
+
+ x1 = x0 +12 + (mv->col >> 3);
+ y1 = y0 +12 + (mv->row >> 3);
+
+ constrain_line (x0+12, &x1, y0+12, &y1, width, height);
+ vp8_blit_line (x0+12, x1, y0+12, y1, y_buffer, y_stride);
+ break;
+ }
+ default :
+ {
+ B_MODE_INFO *bmi = mi->bmi;
+ int bx0, by0;
+
+ for (by0 = y0; by0 < (y0+16); by0 += 4)
+ {
+ for (bx0 = x0; bx0 < (x0+16); bx0 += 4)
+ {
+ MV *mv = &bmi->mv.as_mv;
+ x1 = bx0 + 2 + (mv->col >> 3);
+ y1 = by0 + 2 + (mv->row >> 3);
+
+ constrain_line (bx0+2, &x1, by0+2, &y1, width, height);
+ vp8_blit_line (bx0+2, x1, by0+2, y1, y_buffer, y_stride);
+
+ bmi++;
+ }
+ }
+ }
+ }
+ }
+ else if (mi->mbmi.mode >= NEARESTMV)
+ {
+ MV *mv = &mi->mbmi.mv.as_mv;
+ const int lx0 = x0 + 8;
+ const int ly0 = y0 + 8;
+
+ x1 = lx0 + (mv->col >> 3);
+ y1 = ly0 + (mv->row >> 3);
+
+ if (x1 != lx0 && y1 != ly0)
+ {
+ constrain_line (lx0, &x1, ly0-1, &y1, width, height);
+ vp8_blit_line (lx0, x1, ly0-1, y1, y_buffer, y_stride);
+
+ constrain_line (lx0, &x1, ly0+1, &y1, width, height);
+ vp8_blit_line (lx0, x1, ly0+1, y1, y_buffer, y_stride);
+ }
+ else
+ vp8_blit_line (lx0, x1, ly0, y1, y_buffer, y_stride);
+ }
+ mi++;
+ }
+ mi++;
+ }
+ }
+
+ /* Color in block modes */
+ if (flags & VP8D_DEBUG_CLR_BLK_MODES)
+ {
+ int y, x;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+ unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+ unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+
+ for (y = 0; y < height; y += 16)
+ {
+ for (x = 0; x < width; x += 16)
+ {
+ int Y = 0, U = 0, V = 0;
+
+ if (mi->mbmi.mode == B_PRED)
+ {
+ int by, bx;
+ unsigned char *yl, *ul, *vl;
+ B_MODE_INFO *bmi = mi->bmi;
+
+ yl = y_ptr + x;
+ ul = u_ptr + (x>>1);
+ vl = v_ptr + (x>>1);
+
+ for (by = 0; by < 16; by += 4)
+ {
+ for (bx = 0; bx < 16; bx += 4)
+ {
+ Y = B_PREDICTION_MODE_colors[bmi->mode][0];
+ U = B_PREDICTION_MODE_colors[bmi->mode][1];
+ V = B_PREDICTION_MODE_colors[bmi->mode][2];
+
+ POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
+ (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
+
+ bmi++;
+ }
+
+ yl += y_stride*4;
+ ul += y_stride*1;
+ vl += y_stride*1;
+ }
+ }
+ else
+ {
+ Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
+ U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
+ V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
+
+ POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)
+ (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+ }
+ mi++;
+ }
+ y_ptr += y_stride*16;
+ u_ptr += y_stride*4;
+ v_ptr += y_stride*4;
+
+ mi++;
+ }
+ }
+
+ /* Color in frame reference blocks */
+ if (flags & VP8D_DEBUG_CLR_FRM_REF_BLKS)
+ {
+ int y, x;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+ unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+ unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+
+ for (y = 0; y < height; y += 16)
+ {
+ for (x = 0; x < width; x +=16)
+ {
+ int Y = 0, U = 0, V = 0;
+
+ Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+ U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+ V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+
+ POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)
+ (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+
+ mi++;
+ }
+ y_ptr += y_stride*16;
+ u_ptr += y_stride*4;
+ v_ptr += y_stride*4;
+
+ mi++;
+ }
+ }
*dest = oci->post_proc_buffer;
- // handle problem with extending borders
+ /* handle problem with extending borders */
dest->y_width = oci->Width;
dest->y_height = oci->Height;
dest->uv_height = dest->y_height / 2;
diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h
index 80337fc68..7485135bf 100644
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -24,6 +24,18 @@
char whiteclamp[16], char bothclamp[16],\
unsigned int w, unsigned int h, int pitch)
+#define prototype_postproc_blend_mb_inner(sym)\
+ void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
+ int y1, int u1, int v1, int alpha, int stride)
+
+#define prototype_postproc_blend_mb_outer(sym)\
+ void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
+ int y1, int u1, int v1, int alpha, int stride)
+
+#define prototype_postproc_blend_b(sym)\
+ void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
+ int y1, int u1, int v1, int alpha, int stride)
+
#if ARCH_X86 || ARCH_X86_64
#include "x86/postproc_x86.h"
#endif
@@ -48,16 +60,36 @@ extern prototype_postproc(vp8_postproc_downacross);
#endif
extern prototype_postproc_addnoise(vp8_postproc_addnoise);
+#ifndef vp8_postproc_blend_mb_inner
+#define vp8_postproc_blend_mb_inner vp8_blend_mb_inner_c
+#endif
+extern prototype_postproc_blend_mb_inner(vp8_postproc_blend_mb_inner);
+
+#ifndef vp8_postproc_blend_mb_outer
+#define vp8_postproc_blend_mb_outer vp8_blend_mb_outer_c
+#endif
+extern prototype_postproc_blend_mb_outer(vp8_postproc_blend_mb_outer);
+
+#ifndef vp8_postproc_blend_b
+#define vp8_postproc_blend_b vp8_blend_b_c
+#endif
+extern prototype_postproc_blend_b(vp8_postproc_blend_b);
typedef prototype_postproc((*vp8_postproc_fn_t));
typedef prototype_postproc_inplace((*vp8_postproc_inplace_fn_t));
typedef prototype_postproc_addnoise((*vp8_postproc_addnoise_fn_t));
+typedef prototype_postproc_blend_mb_inner((*vp8_postproc_blend_mb_inner_fn_t));
+typedef prototype_postproc_blend_mb_outer((*vp8_postproc_blend_mb_outer_fn_t));
+typedef prototype_postproc_blend_b((*vp8_postproc_blend_b_fn_t));
typedef struct
{
- vp8_postproc_inplace_fn_t down;
- vp8_postproc_inplace_fn_t across;
- vp8_postproc_fn_t downacross;
- vp8_postproc_addnoise_fn_t addnoise;
+ vp8_postproc_inplace_fn_t down;
+ vp8_postproc_inplace_fn_t across;
+ vp8_postproc_fn_t downacross;
+ vp8_postproc_addnoise_fn_t addnoise;
+ vp8_postproc_blend_mb_inner_fn_t blend_mb_inner;
+ vp8_postproc_blend_mb_outer_fn_t blend_mb_outer;
+ vp8_postproc_blend_b_fn_t blend_b;
} vp8_postproc_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/common/ppflags.h b/vp8/common/ppflags.h
index b1f925c44..b8d713cf0 100644
--- a/vp8/common/ppflags.h
+++ b/vp8/common/ppflags.h
@@ -13,14 +13,17 @@
#define __INC_PPFLAGS_H
enum
{
- VP8D_NOFILTERING = 0,
- VP8D_DEBLOCK = 1,
- VP8D_DEMACROBLOCK = 2,
- VP8D_ADDNOISE = 4,
- VP8D_DEBUG_LEVEL1 = 8,
- VP8D_DEBUG_LEVEL2 = 16,
- VP8D_DEBUG_LEVEL3 = 32,
- VP8D_DEBUG_LEVEL4 = 64,
+ VP8D_NOFILTERING = 0,
+ VP8D_DEBLOCK = 1<<0,
+ VP8D_DEMACROBLOCK = 1<<1,
+ VP8D_ADDNOISE = 1<<2,
+ VP8D_DEBUG_LEVEL1 = 1<<3,
+ VP8D_DEBUG_LEVEL2 = 1<<4,
+ VP8D_DEBUG_LEVEL3 = 1<<5,
+ VP8D_DEBUG_LEVEL4 = 1<<6,
+ VP8D_DEBUG_DRAW_MV = 1<<7,
+ VP8D_DEBUG_CLR_BLK_MODES = 1<<8,
+ VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
};
#endif
diff --git a/vp8/common/recon.c b/vp8/common/recon.c
index 0b439e054..d72d6e410 100644
--- a/vp8/common/recon.c
+++ b/vp8/common/recon.c
@@ -106,8 +106,24 @@ void vp8_recon2b_c
}
}
-void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
+#if ARCH_ARM
+ BLOCKD *b = &x->block[0];
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+ /*b = &x->block[4];*/
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+ /*b = &x->block[8];*/
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+ /*b = &x->block[12];*/
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+#else
int i;
for (i = 0; i < 16; i += 4)
@@ -116,10 +132,36 @@ void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
}
+#endif
}
-void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+void vp8_recon_mb_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
+#if ARCH_ARM
+ BLOCKD *b = &x->block[0];
+
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 4;
+ RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 4;
+
+ /*b = &x->block[16];*/
+
+ RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b++;
+ b++;
+ RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b++;
+ b++;
+ RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b++;
+ b++;
+ RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+#else
int i;
for (i = 0; i < 16; i += 4)
@@ -135,4 +177,5 @@ void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
}
+#endif
}
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index e34a63c86..1e6e343fc 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -12,11 +12,18 @@
#ifndef __INC_RECON_H
#define __INC_RECON_H
+#include "blockd.h"
+
#define prototype_copy_block(sym) \
void sym(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch)
#define prototype_recon_block(sym) \
- void sym(unsigned char *pred, short *diff, unsigned char *dst, int pitch);
+ void sym(unsigned char *pred, short *diff, unsigned char *dst, int pitch)
+
+#define prototype_recon_macroblock(sym) \
+ void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x)
+
+struct vp8_recon_rtcd_vtable;
#if ARCH_X86 || ARCH_X86_64
#include "x86/recon_x86.h"
@@ -56,9 +63,20 @@ extern prototype_recon_block(vp8_recon_recon2);
#endif
extern prototype_recon_block(vp8_recon_recon4);
+#ifndef vp8_recon_recon_mb
+#define vp8_recon_recon_mb vp8_recon_mb_c
+#endif
+extern prototype_recon_macroblock(vp8_recon_recon_mb);
+
+#ifndef vp8_recon_recon_mby
+#define vp8_recon_recon_mby vp8_recon_mby_c
+#endif
+extern prototype_recon_macroblock(vp8_recon_recon_mby);
+
typedef prototype_copy_block((*vp8_copy_block_fn_t));
typedef prototype_recon_block((*vp8_recon_fn_t));
-typedef struct
+typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
+typedef struct vp8_recon_rtcd_vtable
{
vp8_copy_block_fn_t copy16x16;
vp8_copy_block_fn_t copy8x8;
@@ -66,6 +84,8 @@ typedef struct
vp8_recon_fn_t recon;
vp8_recon_fn_t recon2;
vp8_recon_fn_t recon4;
+ vp8_recon_mb_fn_t recon_mb;
+ vp8_recon_mb_fn_t recon_mby;
} vp8_recon_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
@@ -74,9 +94,6 @@ typedef struct
#define RECON_INVOKE(ctx,fn) vp8_recon_##fn
#endif
-#include "blockd.h"
-void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
-void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
#endif
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index ffdc660c2..74871c0e8 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -18,9 +18,10 @@
#include "onyxc_int.h"
#endif
-// use this define on systems where unaligned int reads and writes are
-// not allowed, i.e. ARM architectures
-//#define MUST_BE_ALIGNED
+/* use this define on systems where unaligned int reads and writes are
+ * not allowed, i.e. ARM architectures
+ */
+/*#define MUST_BE_ALIGNED*/
static const int bbb[4] = {0, 2, 8, 10};
@@ -255,7 +256,7 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
}
}
-//encoder only
+/*encoder only*/
void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
{
@@ -491,15 +492,16 @@ void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
}
-// The following functions are wriiten for skip_recon_mb() to call. Since there is no recon in this
-// situation, we can write the result directly to dst buffer instead of writing it to predictor
-// buffer and then copying it to dst buffer.
+/* The following functions are wriiten for skip_recon_mb() to call. Since there is no recon in this
+ * situation, we can write the result directly to dst buffer instead of writing it to predictor
+ * buffer and then copying it to dst buffer.
+ */
static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp8_subpix_fn_t sppf)
{
int r;
unsigned char *ptr_base;
unsigned char *ptr;
- //unsigned char *pred_ptr = d->predictor;
+ /*unsigned char *pred_ptr = d->predictor;*/
int dst_stride = d->dst_stride;
int pre_stride = d->pre_stride;
@@ -535,8 +537,8 @@ static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp
void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
{
- //unsigned char *pred_ptr = x->block[0].predictor;
- //unsigned char *dst_ptr = *(x->block[0].base_dst) + x->block[0].dst;
+ /*unsigned char *pred_ptr = x->block[0].predictor;
+ unsigned char *dst_ptr = *(x->block[0].base_dst) + x->block[0].dst;*/
unsigned char *pred_ptr = x->predictor;
unsigned char *dst_ptr = x->dst.y_buffer;
@@ -546,26 +548,26 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *uptr, *vptr;
- //unsigned char *pred_ptr = x->predictor;
- //unsigned char *upred_ptr = &x->predictor[256];
- //unsigned char *vpred_ptr = &x->predictor[320];
+ /*unsigned char *pred_ptr = x->predictor;
+ unsigned char *upred_ptr = &x->predictor[256];
+ unsigned char *vpred_ptr = &x->predictor[320];*/
unsigned char *udst_ptr = x->dst.u_buffer;
unsigned char *vdst_ptr = x->dst.v_buffer;
int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
- int pre_stride = x->dst.y_stride; //x->block[0].pre_stride;
+ int pre_stride = x->dst.y_stride; /*x->block[0].pre_stride;*/
ptr_base = x->pre.y_buffer;
ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
if ((mv_row | mv_col) & 7)
{
- x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride);
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
}
else
{
- RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride);
+ RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
}
mv_row = x->block[16].bmi.mv.as_mv.row;
@@ -588,8 +590,9 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
}
else
{
- //note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
- //if sth is wrong, go back to what it is in build_inter_predictors_mb.
+ /* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
+ * if sth is wrong, go back to what it is in build_inter_predictors_mb.
+ */
int i;
if (x->mode_info_context->mbmi.partitioning < 3)
@@ -597,7 +600,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
for (i = 0; i < 4; i++)
{
BLOCKD *d = &x->block[bbb[i]];
- //vp8_build_inter_predictors4b(x, d, 16);
+ /*vp8_build_inter_predictors4b(x, d, 16);*/
{
unsigned char *ptr_base;
@@ -609,11 +612,11 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
- x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride);
+ x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
}
else
{
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); //x->block[0].dst_stride);
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
}
}
}
@@ -627,7 +630,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
{
- //vp8_build_inter_predictors2b(x, d0, 16);
+ /*vp8_build_inter_predictors2b(x, d0, 16);*/
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d0->predictor;
@@ -659,7 +662,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
{
- //vp8_build_inter_predictors2b(x, d0, 8);
+ /*vp8_build_inter_predictors2b(x, d0, 8);*/
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d0->predictor;
@@ -669,11 +672,15 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
{
- x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride);
+ x->subpixel_predict8x4(ptr, d0->pre_stride,
+ d0->bmi.mv.as_mv.col & 7,
+ d0->bmi.mv.as_mv.row & 7,
+ dst_ptr, x->dst.uv_stride);
}
else
{
- RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride);
+ RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr,
+ d0->pre_stride, dst_ptr, x->dst.uv_stride);
}
}
else
diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c
index ce0b1b8ec..9cf5f6a88 100644
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -14,9 +14,9 @@
#include "reconintra.h"
#include "vpx_mem/vpx_mem.h"
-// For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
-// vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
-
+/* For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
+ * vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
+ */
void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
int i;
@@ -42,7 +42,7 @@ void vp8_build_intra_predictors_mby(MACROBLOCKD *x)
yleft_col[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
}
- // for Y
+ /* for Y */
switch (x->mode_info_context->mbmi.mode)
{
case DC_PRED:
@@ -156,14 +156,14 @@ void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x)
int r, c, i;
int y_stride = x->dst.y_stride;
- ypred_ptr = x->dst.y_buffer; //x->predictor;
+ ypred_ptr = x->dst.y_buffer; /*x->predictor;*/
for (i = 0; i < 16; i++)
{
yleft_col[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
}
- // for Y
+ /* for Y */
switch (x->mode_info_context->mbmi.mode)
{
case DC_PRED:
@@ -204,11 +204,11 @@ void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x)
expected_dc = 128;
}
- //vpx_memset(ypred_ptr, expected_dc, 256);
+ /*vpx_memset(ypred_ptr, expected_dc, 256);*/
for (r = 0; r < 16; r++)
{
vpx_memset(ypred_ptr, expected_dc, 16);
- ypred_ptr += y_stride; //16;
+ ypred_ptr += y_stride; /*16;*/
}
}
break;
@@ -222,7 +222,7 @@ void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x)
((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
- ypred_ptr += y_stride; //16;
+ ypred_ptr += y_stride; /*16;*/
}
}
break;
@@ -233,7 +233,7 @@ void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x)
{
vpx_memset(ypred_ptr, yleft_col[r], 16);
- ypred_ptr += y_stride; //16;
+ ypred_ptr += y_stride; /*16;*/
}
}
@@ -256,7 +256,7 @@ void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x)
ypred_ptr[c] = pred;
}
- ypred_ptr += y_stride; //16;
+ ypred_ptr += y_stride; /*16;*/
}
}
@@ -418,8 +418,8 @@ void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x)
unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride;
unsigned char vleft_col[20];
unsigned char vtop_left = vabove_row[-1];
- unsigned char *upred_ptr = x->dst.u_buffer; //&x->predictor[256];
- unsigned char *vpred_ptr = x->dst.v_buffer; //&x->predictor[320];
+ unsigned char *upred_ptr = x->dst.u_buffer; /*&x->predictor[256];*/
+ unsigned char *vpred_ptr = x->dst.v_buffer; /*&x->predictor[320];*/
int uv_stride = x->dst.uv_stride;
int i, j;
@@ -472,14 +472,14 @@ void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x)
}
- //vpx_memset(upred_ptr,expected_udc,64);
- //vpx_memset(vpred_ptr,expected_vdc,64);
+ /*vpx_memset(upred_ptr,expected_udc,64);*/
+ /*vpx_memset(vpred_ptr,expected_vdc,64);*/
for (i = 0; i < 8; i++)
{
vpx_memset(upred_ptr, expected_udc, 8);
vpx_memset(vpred_ptr, expected_vdc, 8);
- upred_ptr += uv_stride; //8;
- vpred_ptr += uv_stride; //8;
+ upred_ptr += uv_stride; /*8;*/
+ vpred_ptr += uv_stride; /*8;*/
}
}
break;
@@ -491,8 +491,8 @@ void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x)
{
vpx_memcpy(upred_ptr, uabove_row, 8);
vpx_memcpy(vpred_ptr, vabove_row, 8);
- upred_ptr += uv_stride; //8;
- vpred_ptr += uv_stride; //8;
+ upred_ptr += uv_stride; /*8;*/
+ vpred_ptr += uv_stride; /*8;*/
}
}
@@ -505,8 +505,8 @@ void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x)
{
vpx_memset(upred_ptr, uleft_col[i], 8);
vpx_memset(vpred_ptr, vleft_col[i], 8);
- upred_ptr += uv_stride; //8;
- vpred_ptr += uv_stride; //8;
+ upred_ptr += uv_stride; /*8;*/
+ vpred_ptr += uv_stride; /*8;*/
}
}
@@ -538,8 +538,8 @@ void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x)
vpred_ptr[j] = predv;
}
- upred_ptr += uv_stride; //8;
- vpred_ptr += uv_stride; //8;
+ upred_ptr += uv_stride; /*8;*/
+ vpred_ptr += uv_stride; /*8;*/
}
}
diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
index c6e5fe7fd..db44fa190 100644
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -56,7 +56,7 @@ void vp8_predict_intra4x4(BLOCKD *x,
break;
case B_TM_PRED:
{
- // prediction similar to true_motion prediction
+ /* prediction similar to true_motion prediction */
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
@@ -295,8 +295,9 @@ void vp8_predict_intra4x4(BLOCKD *x,
}
}
-// copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
-// to the right prediction have filled in pixels to use.
+/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
+ * to the right prediction have filled in pixels to use.
+ */
void vp8_intra_prediction_down_copy(MACROBLOCKD *x)
{
unsigned char *above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16;
@@ -318,6 +319,74 @@ void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
vp8_intra_prediction_down_copy(x);
+#if ARCH_ARM
+ {
+ BLOCKD *b = &x->block[0];
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ b += 1;
+
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+#else
for (i = 0; i < 16; i++)
{
BLOCKD *b = &x->block[i];
@@ -325,6 +394,7 @@ void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
vp8_predict_intra4x4(b, x->block[i].bmi.mode, x->block[i].predictor);
RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
}
+#endif
vp8_recon_intra_mbuv(rtcd, x);
diff --git a/vp8/common/setupintrarecon.c b/vp8/common/setupintrarecon.c
index 8647ae2aa..7976e252b 100644
--- a/vp8/common/setupintrarecon.c
+++ b/vp8/common/setupintrarecon.c
@@ -16,7 +16,7 @@ void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
{
int i;
- // set up frame new frame for intra coded blocks
+ /* set up frame new frame for intra coded blocks */
vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
for (i = 0; i < ybf->y_height; i++)
ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129;
diff --git a/vp8/common/textblit.c b/vp8/common/textblit.c
index da40f9352..1756100a7 100644
--- a/vp8/common/textblit.c
+++ b/vp8/common/textblit.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
+#include <stdlib.h>
void vp8_blit_text(const char *msg, unsigned char *address, const int pitch)
@@ -51,3 +51,80 @@ void vp8_blit_text(const char *msg, unsigned char *address, const int pitch)
colpos++;
}
}
+
+static void plot (const int x, const int y, unsigned char *image, const int pitch)
+{
+ image [x+y*pitch] ^= 255;
+}
+
+/* Bresenham line algorithm */
+void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch)
+{
+ int steep = abs(y1 - y0) > abs(x1 - x0);
+ int deltax, deltay;
+ int error, ystep, y, x;
+
+ if (steep)
+ {
+ int t;
+ t = x0;
+ x0 = y0;
+ y0 = t;
+
+ t = x1;
+ x1 = y1;
+ y1 = t;
+ }
+
+ if (x0 > x1)
+ {
+ int t;
+ t = x0;
+ x0 = x1;
+ x1 = t;
+
+ t = y0;
+ y0 = y1;
+ y1 = t;
+ }
+
+ deltax = x1 - x0;
+ deltay = abs(y1 - y0);
+ error = deltax / 2;
+
+ y = y0;
+
+ if (y0 < y1)
+ ystep = 1;
+ else
+ ystep = -1;
+
+ if (steep)
+ {
+ for (x = x0; x <= x1; x++)
+ {
+ plot(y,x, image, pitch);
+
+ error = error - deltay;
+ if (error < 0)
+ {
+ y = y + ystep;
+ error = error + deltax;
+ }
+ }
+ }
+ else
+ {
+ for (x = x0; x <= x1; x++)
+ {
+ plot(x,y, image, pitch);
+
+ error = error - deltay;
+ if (error < 0)
+ {
+ y = y + ystep;
+ error = error + deltax;
+ }
+ }
+ }
+}
diff --git a/vp8/common/threading.h b/vp8/common/threading.h
index f9a257460..1929f7c4f 100644
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -12,7 +12,7 @@
#ifndef _PTHREAD_EMULATION
#define _PTHREAD_EMULATION
-#define VPXINFINITE 10000 //10second.
+#define VPXINFINITE 10000 /* 10second. */
/* Thread management macros */
#ifdef _WIN32
@@ -72,11 +72,11 @@
#define sem_wait(sem) (semaphore_wait(*sem) )
#define sem_post(sem) semaphore_signal(*sem)
#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem)
-#define thread_sleep(nms) // { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);}
+#define thread_sleep(nms) /* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
#else
#include <unistd.h>
#include <sched.h>
-#define thread_sleep(nms) sched_yield();// {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);}
+#define thread_sleep(nms) sched_yield();/* {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
#endif
/* Not Windows. Assume pthreads */
diff --git a/vp8/common/treecoder.h b/vp8/common/treecoder.h
index 35e5be10c..ebf51c5ed 100644
--- a/vp8/common/treecoder.h
+++ b/vp8/common/treecoder.h
@@ -12,7 +12,7 @@
#ifndef __INC_TREECODER_H
#define __INC_TREECODER_H
-typedef unsigned char vp8bc_index_t; // probability index
+typedef unsigned char vp8bc_index_t; /* probability index */
typedef unsigned char vp8_prob;
diff --git a/vp8/common/type_aliases.h b/vp8/common/type_aliases.h
index f2a370298..22b531a76 100644
--- a/vp8/common/type_aliases.h
+++ b/vp8/common/type_aliases.h
@@ -64,32 +64,32 @@ typedef signed char INT8;
#endif
#ifndef TYPE_INT16
-//#define TYPE_INT16
+/*#define TYPE_INT16*/
typedef signed short INT16;
#endif
#ifndef TYPE_INT32
-//#define TYPE_INT32
+/*#define TYPE_INT32*/
typedef signed int INT32;
#endif
#ifndef TYPE_UINT8
-//#define TYPE_UINT8
+/*#define TYPE_UINT8*/
typedef unsigned char UINT8;
#endif
#ifndef TYPE_UINT32
-//#define TYPE_UINT32
+/*#define TYPE_UINT32*/
typedef unsigned int UINT32;
#endif
#ifndef TYPE_UINT16
-//#define TYPE_UINT16
+/*#define TYPE_UINT16*/
typedef unsigned short UINT16;
#endif
#ifndef TYPE_BOOL
-//#define TYPE_BOOL
+/*#define TYPE_BOOL*/
typedef int BOOL;
#endif
@@ -101,7 +101,7 @@ typedef __int64 INT64;
#ifndef TYPE_INT64
#ifdef _TMS320C6X
-//for now we only have 40bits
+/* for now we only have 40bits */
typedef long INT64;
#else
typedef long long INT64;
diff --git a/vp8/common/x86/idctllm_mmx.asm b/vp8/common/x86/idctllm_mmx.asm
index 99e09a50e..43735bc4b 100644
--- a/vp8/common/x86/idctllm_mmx.asm
+++ b/vp8/common/x86/idctllm_mmx.asm
@@ -58,11 +58,11 @@ sym(vp8_short_idct4x4llm_mmx):
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
- pmulhw mm5, [x_s1sqr2 GLOBAL] ;
+ pmulhw mm5, [GLOBAL(x_s1sqr2)] ;
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
- pmulhw mm7, [x_c1sqr2less1 GLOBAL] ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)] ;
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
@@ -70,10 +70,10 @@ sym(vp8_short_idct4x4llm_mmx):
movq mm5, mm1
movq mm4, mm3
- pmulhw mm5, [x_c1sqr2less1 GLOBAL]
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
- pmulhw mm3, [x_s1sqr2 GLOBAL]
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
@@ -113,11 +113,11 @@ sym(vp8_short_idct4x4llm_mmx):
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
- pmulhw mm5, [x_s1sqr2 GLOBAL] ;
+ pmulhw mm5, [GLOBAL(x_s1sqr2)] ;
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
- pmulhw mm7, [x_c1sqr2less1 GLOBAL] ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)] ;
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
@@ -125,16 +125,16 @@ sym(vp8_short_idct4x4llm_mmx):
movq mm5, mm1
movq mm4, mm3
- pmulhw mm5, [x_c1sqr2less1 GLOBAL]
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
- pmulhw mm3, [x_s1sqr2 GLOBAL]
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
- paddw mm0, [fours GLOBAL]
+ paddw mm0, [GLOBAL(fours)]
- paddw mm2, [fours GLOBAL]
+ paddw mm2, [GLOBAL(fours)]
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
@@ -196,7 +196,7 @@ sym(vp8_short_idct4x4llm_1_mmx):
mov rax, arg(0) ;input
movd mm0, [rax]
- paddw mm0, [fours GLOBAL]
+ paddw mm0, [GLOBAL(fours)]
mov rdx, arg(1) ;output
psraw mm0, 3
@@ -239,7 +239,7 @@ sym(vp8_dc_only_idct_add_mmx):
movd mm5, arg(0) ;input_dc
- paddw mm5, [fours GLOBAL]
+ paddw mm5, [GLOBAL(fours)]
psraw mm5, 3
diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index ac941851b..edee1578e 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -51,7 +51,7 @@ sym(idct_dequant_0_2x_sse2):
pshufhw xmm4, xmm4, 00000000b
mov rax, arg(2) ; pre
- paddw xmm4, [fours GLOBAL]
+ paddw xmm4, [GLOBAL(fours)]
movsxd rcx, dword ptr arg(5) ; blk_stride
psraw xmm4, 3
@@ -160,11 +160,11 @@ sym(idct_dequant_full_2x_sse2):
movdqa xmm5, xmm1
paddw xmm2, xmm0 ; a1 = 0+2
- pmulhw xmm5, [x_s1sqr2 GLOBAL]
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
movdqa xmm7, xmm3
- pmulhw xmm7, [x_c1sqr2less1 GLOBAL]
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw xmm7, xmm5 ; c1
@@ -172,10 +172,10 @@ sym(idct_dequant_full_2x_sse2):
movdqa xmm5, xmm1
movdqa xmm4, xmm3
- pmulhw xmm5, [x_c1sqr2less1 GLOBAL]
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
paddw xmm5, xmm1
- pmulhw xmm3, [x_s1sqr2 GLOBAL]
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
paddw xmm3, xmm4
paddw xmm3, xmm5 ; d1
@@ -229,11 +229,11 @@ sym(idct_dequant_full_2x_sse2):
movdqa xmm5, xmm1
paddw xmm2, xmm0 ; a1 = 0+2
- pmulhw xmm5, [x_s1sqr2 GLOBAL]
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
movdqa xmm7, xmm3
- pmulhw xmm7, [x_c1sqr2less1 GLOBAL]
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw xmm7, xmm5 ; c1
@@ -241,16 +241,16 @@ sym(idct_dequant_full_2x_sse2):
movdqa xmm5, xmm1
movdqa xmm4, xmm3
- pmulhw xmm5, [x_c1sqr2less1 GLOBAL]
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
paddw xmm5, xmm1
- pmulhw xmm3, [x_s1sqr2 GLOBAL]
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
paddw xmm3, xmm4
paddw xmm3, xmm5 ; d1
- paddw xmm0, [fours GLOBAL]
+ paddw xmm0, [GLOBAL(fours)]
- paddw xmm2, [fours GLOBAL]
+ paddw xmm2, [GLOBAL(fours)]
movdqa xmm6, xmm2 ; a1
movdqa xmm4, xmm0 ; b1
@@ -394,7 +394,7 @@ sym(idct_dequant_dc_0_2x_sse2):
punpckldq xmm4, xmm4
; Rounding to dequant and downshift
- paddw xmm4, [fours GLOBAL]
+ paddw xmm4, [GLOBAL(fours)]
psraw xmm4, 3
; Predict buffer needs to be expanded from bytes to words
@@ -505,11 +505,11 @@ sym(idct_dequant_dc_full_2x_sse2):
movdqa xmm5, xmm1
paddw xmm2, xmm0 ; a1 = 0+2
- pmulhw xmm5, [x_s1sqr2 GLOBAL]
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
movdqa xmm7, xmm3
- pmulhw xmm7, [x_c1sqr2less1 GLOBAL]
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw xmm7, xmm5 ; c1
@@ -517,10 +517,10 @@ sym(idct_dequant_dc_full_2x_sse2):
movdqa xmm5, xmm1
movdqa xmm4, xmm3
- pmulhw xmm5, [x_c1sqr2less1 GLOBAL]
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
paddw xmm5, xmm1
- pmulhw xmm3, [x_s1sqr2 GLOBAL]
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
paddw xmm3, xmm4
paddw xmm3, xmm5 ; d1
@@ -574,11 +574,11 @@ sym(idct_dequant_dc_full_2x_sse2):
movdqa xmm5, xmm1
paddw xmm2, xmm0 ; a1 = 0+2
- pmulhw xmm5, [x_s1sqr2 GLOBAL]
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
movdqa xmm7, xmm3
- pmulhw xmm7, [x_c1sqr2less1 GLOBAL]
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw xmm7, xmm5 ; c1
@@ -586,16 +586,16 @@ sym(idct_dequant_dc_full_2x_sse2):
movdqa xmm5, xmm1
movdqa xmm4, xmm3
- pmulhw xmm5, [x_c1sqr2less1 GLOBAL]
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
paddw xmm5, xmm1
- pmulhw xmm3, [x_s1sqr2 GLOBAL]
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
paddw xmm3, xmm4
paddw xmm3, xmm5 ; d1
- paddw xmm0, [fours GLOBAL]
+ paddw xmm0, [GLOBAL(fours)]
- paddw xmm2, [fours GLOBAL]
+ paddw xmm2, [GLOBAL(fours)]
movdqa xmm6, xmm2 ; a1
movdqa xmm4, xmm0 ; b1
diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm
index 3f0671c58..10b5274dc 100644
--- a/vp8/common/x86/iwalsh_mmx.asm
+++ b/vp8/common/x86/iwalsh_mmx.asm
@@ -69,7 +69,7 @@ sym(vp8_short_inv_walsh4x4_mmx):
movq mm2, [rsi + 16] ;ip[8]
movq mm3, [rsi + 24] ;ip[12]
- movd mm7, rax
+ movq mm7, rax
movq mm4, mm0
punpcklwd mm7, mm7 ;0003000300030003h
diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
index 0b39e627d..c6c215c3c 100644
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -111,7 +111,7 @@ next8_h:
psubusb mm3, mm2 ; q1-=p1
psubusb mm2, mm4 ; p1-=q1
por mm2, mm3 ; abs(p1-q1)
- pand mm2, [tfe GLOBAL] ; set lsb of each byte to zero
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw mm2, 1 ; abs(p1-q1)/2
movq mm6, mm5 ; p0
@@ -150,12 +150,12 @@ next8_h:
; start work on filters
movq mm2, [rsi+2*rax] ; p1
movq mm7, [rdi] ; q1
- pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
psubsb mm2, mm7 ; p1 - q1
pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
- pxor mm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor mm0, [t80 GLOBAL] ; offset to convert to signed values
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
movq mm3, mm0 ; q0
psubsb mm0, mm6 ; q0 - p0
paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
@@ -163,8 +163,8 @@ next8_h:
paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
pand mm1, mm2 ; mask filter values we don't care about
movq mm2, mm1
- paddsb mm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb mm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
pxor mm0, mm0 ;
pxor mm5, mm5
@@ -185,29 +185,29 @@ next8_h:
movq mm5, mm0 ; save results
packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw mm5, [ones GLOBAL]
- paddsw mm1, [ones GLOBAL]
+ paddsw mm5, [GLOBAL(ones)]
+ paddsw mm1, [GLOBAL(ones)]
psraw mm5, 1 ; partial shifted one more time for 2nd tap
psraw mm1, 1 ; partial shifted one more time for 2nd tap
packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
pandn mm4, mm5 ; high edge variance additive
paddsb mm6, mm2 ; p0+= p0 add
- pxor mm6, [t80 GLOBAL] ; unoffset
+ pxor mm6, [GLOBAL(t80)] ; unoffset
movq [rsi+rax], mm6 ; write back
movq mm6, [rsi+2*rax] ; p1
- pxor mm6, [t80 GLOBAL] ; reoffset
+ pxor mm6, [GLOBAL(t80)] ; reoffset
paddsb mm6, mm4 ; p1+= p1 add
- pxor mm6, [t80 GLOBAL] ; unoffset
+ pxor mm6, [GLOBAL(t80)] ; unoffset
movq [rsi+2*rax], mm6 ; write back
psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [t80 GLOBAL] ; unoffset
+ pxor mm3, [GLOBAL(t80)] ; unoffset
movq [rsi], mm3 ; write back
psubsb mm7, mm4 ; q1-= q1 add
- pxor mm7, [t80 GLOBAL] ; unoffset
+ pxor mm7, [GLOBAL(t80)] ; unoffset
movq [rdi], mm7 ; write back
add rsi,8
@@ -403,7 +403,7 @@ next8_v:
psubusb mm5, mm1 ; q1-=p1
psubusb mm1, mm2 ; p1-=q1
por mm5, mm1 ; abs(p1-q1)
- pand mm5, [tfe GLOBAL] ; set lsb of each byte to zero
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw mm5, 1 ; abs(p1-q1)/2
mov rdx, arg(2) ;flimit ;
@@ -455,14 +455,14 @@ next8_v:
movq mm6, [rdx+8] ; p0
movq mm0, [rdx+16] ; q0
- pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
psubsb mm2, mm7 ; p1 - q1
pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
- pxor mm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor mm0, [t80 GLOBAL] ; offset to convert to signed values
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
movq mm3, mm0 ; q0
psubsb mm0, mm6 ; q0 - p0
@@ -474,9 +474,9 @@ next8_v:
pand mm1, mm2 ; mask filter values we don't care about
movq mm2, mm1
- paddsb mm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb mm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
pxor mm0, mm0 ;
pxor mm5, mm5
@@ -503,9 +503,9 @@ next8_v:
movq mm5, mm0 ; save results
packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw mm5, [ones GLOBAL]
+ paddsw mm5, [GLOBAL(ones)]
- paddsw mm1, [ones GLOBAL]
+ paddsw mm1, [GLOBAL(ones)]
psraw mm5, 1 ; partial shifted one more time for 2nd tap
psraw mm1, 1 ; partial shifted one more time for 2nd tap
@@ -514,22 +514,22 @@ next8_v:
pandn mm4, mm5 ; high edge variance additive
paddsb mm6, mm2 ; p0+= p0 add
- pxor mm6, [t80 GLOBAL] ; unoffset
+ pxor mm6, [GLOBAL(t80)] ; unoffset
; mm6=p0 ;
movq mm1, [rdx] ; p1
- pxor mm1, [t80 GLOBAL] ; reoffset
+ pxor mm1, [GLOBAL(t80)] ; reoffset
paddsb mm1, mm4 ; p1+= p1 add
- pxor mm1, [t80 GLOBAL] ; unoffset
+ pxor mm1, [GLOBAL(t80)] ; unoffset
; mm6 = p0 mm1 = p1
psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [t80 GLOBAL] ; unoffset
+ pxor mm3, [GLOBAL(t80)] ; unoffset
; mm3 = q0
psubsb mm7, mm4 ; q1-= q1 add
- pxor mm7, [t80 GLOBAL] ; unoffset
+ pxor mm7, [GLOBAL(t80)] ; unoffset
; mm7 = q1
; tranpose and write back
@@ -708,7 +708,7 @@ next8_mbh:
psubusb mm3, mm2 ; q1-=p1
psubusb mm2, mm4 ; p1-=q1
por mm2, mm3 ; abs(p1-q1)
- pand mm2, [tfe GLOBAL] ; set lsb of each byte to zero
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw mm2, 1 ; abs(p1-q1)/2
movq mm6, mm5 ; p0
@@ -753,12 +753,12 @@ next8_mbh:
; start work on filters
movq mm2, [rsi+2*rax] ; p1
movq mm7, [rdi] ; q1
- pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
psubsb mm2, mm7 ; p1 - q1
- pxor mm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor mm0, [t80 GLOBAL] ; offset to convert to signed values
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
movq mm3, mm0 ; q0
psubsb mm0, mm6 ; q0 - p0
paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
@@ -772,7 +772,7 @@ next8_mbh:
pand mm2, mm4; ; Filter2 = vp8_filter & hev
movq mm5, mm2 ;
- paddsb mm5, [t3 GLOBAL];
+ paddsb mm5, [GLOBAL(t3)];
pxor mm0, mm0 ; 0
pxor mm7, mm7 ; 0
@@ -785,7 +785,7 @@ next8_mbh:
movq mm5, mm0 ; Filter2
- paddsb mm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
+ paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
pxor mm0, mm0 ; 0
pxor mm7, mm7 ; 0
@@ -818,10 +818,10 @@ next8_mbh:
pxor mm2, mm2
punpcklbw mm1, mm4
punpckhbw mm2, mm4
- pmulhw mm1, [s27 GLOBAL]
- pmulhw mm2, [s27 GLOBAL]
- paddw mm1, [s63 GLOBAL]
- paddw mm2, [s63 GLOBAL]
+ pmulhw mm1, [GLOBAL(s27)]
+ pmulhw mm2, [GLOBAL(s27)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
psraw mm1, 7
psraw mm2, 7
packsswb mm1, mm2
@@ -829,8 +829,8 @@ next8_mbh:
psubsb mm3, mm1
paddsb mm6, mm1
- pxor mm3, [t80 GLOBAL]
- pxor mm6, [t80 GLOBAL]
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
movq [rsi+rax], mm6
movq [rsi], mm3
@@ -844,10 +844,10 @@ next8_mbh:
pxor mm2, mm2
punpcklbw mm1, mm4
punpckhbw mm2, mm4
- pmulhw mm1, [s18 GLOBAL]
- pmulhw mm2, [s18 GLOBAL]
- paddw mm1, [s63 GLOBAL]
- paddw mm2, [s63 GLOBAL]
+ pmulhw mm1, [GLOBAL(s18)]
+ pmulhw mm2, [GLOBAL(s18)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
psraw mm1, 7
psraw mm2, 7
packsswb mm1, mm2
@@ -855,14 +855,14 @@ next8_mbh:
movq mm3, [rdi]
movq mm6, [rsi+rax*2] ; p1
- pxor mm3, [t80 GLOBAL]
- pxor mm6, [t80 GLOBAL]
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
paddsb mm6, mm1
psubsb mm3, mm1
- pxor mm6, [t80 GLOBAL]
- pxor mm3, [t80 GLOBAL]
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
movq [rdi], mm3
movq [rsi+rax*2], mm6
@@ -876,10 +876,10 @@ next8_mbh:
pxor mm2, mm2
punpcklbw mm1, mm4
punpckhbw mm2, mm4
- pmulhw mm1, [s9 GLOBAL]
- pmulhw mm2, [s9 GLOBAL]
- paddw mm1, [s63 GLOBAL]
- paddw mm2, [s63 GLOBAL]
+ pmulhw mm1, [GLOBAL(s9)]
+ pmulhw mm2, [GLOBAL(s9)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
psraw mm1, 7
psraw mm2, 7
packsswb mm1, mm2
@@ -889,14 +889,14 @@ next8_mbh:
neg rax
movq mm3, [rdi+rax ]
- pxor mm6, [t80 GLOBAL]
- pxor mm3, [t80 GLOBAL]
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
paddsb mm6, mm1
psubsb mm3, mm1
- pxor mm6, [t80 GLOBAL]
- pxor mm3, [t80 GLOBAL]
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
movq [rdi+rax ], mm3
neg rax
movq [rdi+rax*4], mm6
@@ -1105,7 +1105,7 @@ next8_mbv:
psubusb mm5, mm1 ; q1-=p1
psubusb mm1, mm2 ; p1-=q1
por mm5, mm1 ; abs(p1-q1)
- pand mm5, [tfe GLOBAL] ; set lsb of each byte to zero
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw mm5, 1 ; abs(p1-q1)/2
mov rdx, arg(2) ;flimit ;
@@ -1155,14 +1155,14 @@ next8_mbv:
; start work on filters
movq mm2, [rdx+16] ; p1
movq mm7, [rdx+40] ; q1
- pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
psubsb mm2, mm7 ; p1 - q1
movq mm6, [rdx+24] ; p0
movq mm0, [rdx+32] ; q0
- pxor mm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor mm0, [t80 GLOBAL] ; offset to convert to signed values
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
movq mm3, mm0 ; q0
psubsb mm0, mm6 ; q0 - p0
@@ -1176,7 +1176,7 @@ next8_mbv:
pand mm2, mm4; ; Filter2 = vp8_filter & hev
movq mm5, mm2 ;
- paddsb mm5, [t3 GLOBAL];
+ paddsb mm5, [GLOBAL(t3)];
pxor mm0, mm0 ; 0
pxor mm7, mm7 ; 0
@@ -1189,7 +1189,7 @@ next8_mbv:
movq mm5, mm0 ; Filter2
- paddsb mm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
+ paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
pxor mm0, mm0 ; 0
pxor mm7, mm7 ; 0
@@ -1222,10 +1222,10 @@ next8_mbv:
pxor mm2, mm2
punpcklbw mm1, mm4
punpckhbw mm2, mm4
- pmulhw mm1, [s27 GLOBAL]
- pmulhw mm2, [s27 GLOBAL]
- paddw mm1, [s63 GLOBAL]
- paddw mm2, [s63 GLOBAL]
+ pmulhw mm1, [GLOBAL(s27)]
+ pmulhw mm2, [GLOBAL(s27)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
psraw mm1, 7
psraw mm2, 7
packsswb mm1, mm2
@@ -1233,8 +1233,8 @@ next8_mbv:
psubsb mm3, mm1
paddsb mm6, mm1
- pxor mm3, [t80 GLOBAL]
- pxor mm6, [t80 GLOBAL]
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
movq [rdx+24], mm6
movq [rdx+32], mm3
@@ -1248,24 +1248,24 @@ next8_mbv:
pxor mm2, mm2
punpcklbw mm1, mm4
punpckhbw mm2, mm4
- pmulhw mm1, [s18 GLOBAL]
- pmulhw mm2, [s18 GLOBAL]
- paddw mm1, [s63 GLOBAL]
- paddw mm2, [s63 GLOBAL]
+ pmulhw mm1, [GLOBAL(s18)]
+ pmulhw mm2, [GLOBAL(s18)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
psraw mm1, 7
psraw mm2, 7
packsswb mm1, mm2
movq mm3, [rdx + 40]
movq mm6, [rdx + 16] ; p1
- pxor mm3, [t80 GLOBAL]
- pxor mm6, [t80 GLOBAL]
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
paddsb mm6, mm1
psubsb mm3, mm1
- pxor mm6, [t80 GLOBAL]
- pxor mm3, [t80 GLOBAL]
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
movq [rdx + 40], mm3
movq [rdx + 16], mm6
@@ -1279,10 +1279,10 @@ next8_mbv:
pxor mm2, mm2
punpcklbw mm1, mm4
punpckhbw mm2, mm4
- pmulhw mm1, [s9 GLOBAL]
- pmulhw mm2, [s9 GLOBAL]
- paddw mm1, [s63 GLOBAL]
- paddw mm2, [s63 GLOBAL]
+ pmulhw mm1, [GLOBAL(s9)]
+ pmulhw mm2, [GLOBAL(s9)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
psraw mm1, 7
psraw mm2, 7
packsswb mm1, mm2
@@ -1290,14 +1290,14 @@ next8_mbv:
movq mm6, [rdx+ 8]
movq mm3, [rdx+48]
- pxor mm6, [t80 GLOBAL]
- pxor mm3, [t80 GLOBAL]
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
paddsb mm6, mm1
psubsb mm3, mm1
- pxor mm6, [t80 GLOBAL] ; mm6 = 71 61 51 41 31 21 11 01
- pxor mm3, [t80 GLOBAL] ; mm3 = 76 66 56 46 36 26 15 06
+ pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
+ pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
; tranpose and write back
movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
@@ -1432,7 +1432,7 @@ nexts8_h:
psubusb mm0, mm1 ; q1-=p1
psubusb mm1, mm4 ; p1-=q1
por mm1, mm0 ; abs(p1-q1)
- pand mm1, [tfe GLOBAL] ; set lsb of each byte to zero
+ pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw mm1, 1 ; abs(p1-q1)/2
movq mm5, [rsi+rax] ; p0
@@ -1450,12 +1450,12 @@ nexts8_h:
pcmpeqb mm5, mm3
; start work on filters
- pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
psubsb mm2, mm7 ; p1 - q1
- pxor mm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor mm0, [t80 GLOBAL] ; offset to convert to signed values
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
movq mm3, mm0 ; q0
psubsb mm0, mm6 ; q0 - p0
paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
@@ -1464,7 +1464,7 @@ nexts8_h:
pand mm5, mm2 ; mask filter values we don't care about
; do + 4 side
- paddsb mm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movq mm0, mm5 ; get a copy of filters
psllw mm0, 8 ; shift left 8
@@ -1477,12 +1477,12 @@ nexts8_h:
por mm0, mm1 ; put the two together to get result
psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [t80 GLOBAL] ; unoffset
+ pxor mm3, [GLOBAL(t80)] ; unoffset
movq [rsi], mm3 ; write back
; now do +3 side
- psubsb mm5, [t1s GLOBAL] ; +3 instead of +4
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
movq mm0, mm5 ; get a copy of filters
psllw mm0, 8 ; shift left 8
@@ -1494,7 +1494,7 @@ nexts8_h:
paddsb mm6, mm0 ; p0+= p0 add
- pxor mm6, [t80 GLOBAL] ; unoffset
+ pxor mm6, [GLOBAL(t80)] ; unoffset
movq [rsi+rax], mm6 ; write back
add rsi,8
@@ -1589,7 +1589,7 @@ nexts8_v:
psubusb mm7, mm6 ; q1-=p1
psubusb mm6, mm3 ; p1-=q1
por mm6, mm7 ; abs(p1-q1)
- pand mm6, [tfe GLOBAL] ; set lsb of each byte to zero
+ pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw mm6, 1 ; abs(p1-q1)/2
movq mm5, mm1 ; p0
@@ -1617,16 +1617,16 @@ nexts8_v:
movq t0, mm0
movq t1, mm3
- pxor mm0, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor mm3, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
psubsb mm0, mm3 ; p1 - q1
movq mm6, mm1 ; p0
movq mm7, mm2 ; q0
- pxor mm6, [t80 GLOBAL] ; offset to convert to signed values
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm7, [t80 GLOBAL] ; offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values
movq mm3, mm7 ; offseted ; q0
psubsb mm7, mm6 ; q0 - p0
@@ -1637,7 +1637,7 @@ nexts8_v:
pand mm5, mm0 ; mask filter values we don't care about
- paddsb mm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movq mm0, mm5 ; get a copy of filters
psllw mm0, 8 ; shift left 8
@@ -1651,10 +1651,10 @@ nexts8_v:
por mm0, mm7 ; put the two together to get result
psubsb mm3, mm0 ; q0-= q0sz add
- pxor mm3, [t80 GLOBAL] ; unoffset
+ pxor mm3, [GLOBAL(t80)] ; unoffset
; now do +3 side
- psubsb mm5, [t1s GLOBAL] ; +3 instead of +4
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
movq mm0, mm5 ; get a copy of filters
psllw mm0, 8 ; shift left 8
@@ -1666,7 +1666,7 @@ nexts8_v:
por mm0, mm5 ; put the two together to get result
paddsb mm6, mm0 ; p0+= p0 add
- pxor mm6, [t80 GLOBAL] ; unoffset
+ pxor mm6, [GLOBAL(t80)] ; unoffset
movq mm0, t0
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 57276b661..849133dc4 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -11,280 +11,266 @@
%include "vpx_ports/x86_abi_support.asm"
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
-%macro LFH_FILTER_MASK 1
+%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
movdqa xmm2, [rdi+2*rax] ; q3
movdqa xmm1, [rsi+2*rax] ; q2
+ movdqa xmm4, [rsi+rax] ; q1
+ movdqa xmm5, [rsi] ; q0
+ neg rax ; negate pitch to deal with above border
%else
- movq xmm0, [rsi + rcx*2] ; q3
- movq xmm2, [rdi + rcx*2]
- pslldq xmm2, 8
- por xmm2, xmm0
- movq xmm1, [rsi + rcx] ; q2
- movq xmm3, [rdi + rcx]
- pslldq xmm3, 8
- por xmm1, xmm3
- movdqa XMMWORD PTR [rsp], xmm1 ; store q2
-%endif
+ movlps xmm2, [rsi + rcx*2] ; q3
+ movlps xmm1, [rsi + rcx] ; q2
+ movlps xmm4, [rsi] ; q1
+ movlps xmm5, [rsi + rax] ; q0
- movdqa xmm6, xmm1 ; q2
- psubusb xmm1, xmm2 ; q2-=q3
- psubusb xmm2, xmm6 ; q3-=q2
- por xmm1, xmm2 ; abs(q3-q2)
+ movhps xmm2, [rdi + rcx*2]
+ movhps xmm1, [rdi + rcx]
+ movhps xmm4, [rdi]
+ movhps xmm5, [rdi + rax]
- psubusb xmm1, xmm7
+ lea rsi, [rsi + rax*4]
+ lea rdi, [rdi + rax*4]
-%if %1
- movdqa xmm4, [rsi+rax] ; q1
-%else
- movq xmm0, [rsi] ; q1
- movq xmm4, [rdi]
- pslldq xmm4, 8
- por xmm4, xmm0
+ movdqa XMMWORD PTR [rsp], xmm1 ; store q2
movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1
%endif
+ movdqa xmm6, xmm1 ; q2
movdqa xmm3, xmm4 ; q1
+
+ psubusb xmm1, xmm2 ; q2-=q3
+ psubusb xmm2, xmm6 ; q3-=q2
+
psubusb xmm4, xmm6 ; q1-=q2
psubusb xmm6, xmm3 ; q2-=q1
- por xmm4, xmm6 ; abs(q2-q1)
- psubusb xmm4, xmm7
- por xmm1, xmm4
+ por xmm4, xmm6 ; abs(q2-q1)
+ por xmm1, xmm2 ; abs(q3-q2)
-%if %1
- movdqa xmm4, [rsi] ; q0
-%else
- movq xmm4, [rsi + rax] ; q0
- movq xmm0, [rdi + rax]
- pslldq xmm0, 8
- por xmm4, xmm0
-%endif
+ movdqa xmm0, xmm5 ; q0
+ pmaxub xmm1, xmm4
- movdqa xmm0, xmm4 ; q0
- psubusb xmm4, xmm3 ; q0-=q1
+ psubusb xmm5, xmm3 ; q0-=q1
psubusb xmm3, xmm0 ; q1-=q0
- por xmm4, xmm3 ; abs(q0-q1)
- movdqa t0, xmm4 ; save to t0
- psubusb xmm4, xmm7
- por xmm1, xmm4
+ por xmm5, xmm3 ; abs(q0-q1)
+ movdqa t0, xmm5 ; save to t0
-%if %1
- neg rax ; negate pitch to deal with above border
+ pmaxub xmm1, xmm5
+%if %1
movdqa xmm2, [rsi+4*rax] ; p3
movdqa xmm4, [rdi+4*rax] ; p2
+ movdqa xmm6, [rsi+2*rax] ; p1
%else
- lea rsi, [rsi + rax*4]
- lea rdi, [rdi + rax*4]
+ movlps xmm2, [rsi + rax] ; p3
+ movlps xmm4, [rsi] ; p2
+ movlps xmm6, [rsi + rcx] ; p1
+
+ movhps xmm2, [rdi + rax]
+ movhps xmm4, [rdi]
+ movhps xmm6, [rdi + rcx]
- movq xmm2, [rsi + rax] ; p3
- movq xmm3, [rdi + rax]
- pslldq xmm3, 8
- por xmm2, xmm3
- movq xmm4, [rsi] ; p2
- movq xmm5, [rdi]
- pslldq xmm5, 8
- por xmm4, xmm5
movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2
+ movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1
%endif
movdqa xmm5, xmm4 ; p2
+ movdqa xmm3, xmm6 ; p1
+
psubusb xmm4, xmm2 ; p2-=p3
psubusb xmm2, xmm5 ; p3-=p2
- por xmm4, xmm2 ; abs(p3 - p2)
- psubusb xmm4, xmm7
- por xmm1, xmm4
+ psubusb xmm3, xmm5 ; p1-=p2
+ pmaxub xmm1, xmm4 ; abs(p3 - p2)
-%if %1
- movdqa xmm4, [rsi+2*rax] ; p1
-%else
- movq xmm4, [rsi + rcx] ; p1
- movq xmm3, [rdi + rcx]
- pslldq xmm3, 8
- por xmm4, xmm3
- movdqa XMMWORD PTR [rsp + 48], xmm4 ; store p1
-%endif
-
- movdqa xmm3, xmm4 ; p1
- psubusb xmm4, xmm5 ; p1-=p2
- psubusb xmm5, xmm3 ; p2-=p1
- por xmm4, xmm5 ; abs(p2 - p1)
- psubusb xmm4, xmm7
+ psubusb xmm5, xmm6 ; p2-=p1
+ pmaxub xmm1, xmm2 ; abs(p3 - p2)
- por xmm1, xmm4
- movdqa xmm2, xmm3 ; p1
+ pmaxub xmm1, xmm5 ; abs(p2 - p1)
+ movdqa xmm2, xmm6 ; p1
+ pmaxub xmm1, xmm3 ; abs(p2 - p1)
%if %1
movdqa xmm4, [rsi+rax] ; p0
+ movdqa xmm3, [rdi] ; q1
%else
- movq xmm4, [rsi + rcx*2] ; p0
- movq xmm5, [rdi + rcx*2]
- pslldq xmm5, 8
- por xmm4, xmm5
+ movlps xmm4, [rsi + rcx*2] ; p0
+ movhps xmm4, [rdi + rcx*2]
+ movdqa xmm3, q1 ; q1
%endif
movdqa xmm5, xmm4 ; p0
- psubusb xmm4, xmm3 ; p0-=p1
- psubusb xmm3, xmm5 ; p1-=p0
- por xmm4, xmm3 ; abs(p1 - p0)
- movdqa t1, xmm4 ; save to t1
+ psubusb xmm4, xmm6 ; p0-=p1
- psubusb xmm4, xmm7
- por xmm1, xmm4
+ psubusb xmm6, xmm5 ; p1-=p0
-%if %1
- movdqa xmm3, [rdi] ; q1
-%else
- movdqa xmm3, q1 ; q1
-%endif
+ por xmm6, xmm4 ; abs(p1 - p0)
+ mov rdx, arg(2) ; get flimit
+
+ movdqa t1, xmm6 ; save to t1
movdqa xmm4, xmm3 ; q1
+ pmaxub xmm1, xmm6
+
psubusb xmm3, xmm2 ; q1-=p1
psubusb xmm2, xmm4 ; p1-=q1
+
+ psubusb xmm1, xmm7
por xmm2, xmm3 ; abs(p1-q1)
- pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero
- psrlw xmm2, 1 ; abs(p1-q1)/2
- movdqa xmm6, xmm5 ; p0
+ movdqa xmm4, XMMWORD PTR [rdx] ; flimit
+
movdqa xmm3, xmm0 ; q0
+ pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+
+ mov rdx, arg(4) ; hev get thresh
+
+ movdqa xmm6, xmm5 ; p0
+ psrlw xmm2, 1 ; abs(p1-q1)/2
+
psubusb xmm5, xmm3 ; p0-=q0
+ paddb xmm4, xmm4 ; flimit*2 (less than 255)
+
psubusb xmm3, xmm6 ; q0-=p0
por xmm5, xmm3 ; abs(p0 - q0)
+
paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddb xmm7, xmm4 ; flimit * 2 + limit (less than 255)
+
+ movdqa xmm4, t0 ; hev get abs (q1 - q0)
+
+ movdqa xmm3, t1 ; get abs (p1 - p0)
+
paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
- mov rdx, arg(2) ; get flimit
- movdqa xmm2, XMMWORD PTR [rdx]
- paddb xmm2, xmm2 ; flimit*2 (less than 255)
- paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255)
+ movdqa xmm2, XMMWORD PTR [rdx] ; hev
psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+ psubusb xmm4, xmm2 ; hev
+
+ psubusb xmm3, xmm2 ; hev
por xmm1, xmm5
- pxor xmm5, xmm5
- pcmpeqb xmm1, xmm5 ; mask mm1
-%endmacro
-%macro LFH_HEV_MASK 0
- mov rdx, arg(4) ; get thresh
- movdqa xmm7, XMMWORD PTR [rdx]
+ pxor xmm7, xmm7
+ paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- movdqa xmm4, t0 ; get abs (q1 - q0)
- psubusb xmm4, xmm7
- movdqa xmm3, t1 ; get abs (p1 - p0)
- psubusb xmm3, xmm7
- paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb xmm4, xmm5
+ pcmpeqb xmm4, xmm5 ; hev
+ pcmpeqb xmm3, xmm3 ; hev
- pcmpeqb xmm5, xmm5
- pxor xmm4, xmm5
+ pcmpeqb xmm1, xmm7 ; mask xmm1
+ pxor xmm4, xmm3 ; hev
%endmacro
-%macro BH_FILTER 1
-%if %1
- movdqa xmm2, [rsi+2*rax] ; p1
- movdqa xmm7, [rdi] ; q1
-%else
+%macro B_FILTER 1
+%if %1 == 0
movdqa xmm2, p1 ; p1
movdqa xmm7, q1 ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%elif %1 == 2
+ lea rdx, srct
+
+ movdqa xmm2, [rdx] ; p1
+ movdqa xmm7, [rdx+48] ; q1
+ movdqa xmm6, [rdx+16] ; p0
+ movdqa xmm0, [rdx+32] ; q0
%endif
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
psubsb xmm2, xmm7 ; p1 - q1
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
movdqa xmm3, xmm0 ; q0
psubsb xmm0, xmm6 ; q0 - p0
+
paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+
paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+
paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+
pand xmm1, xmm2 ; mask filter values we don't care about
+
movdqa xmm2, xmm1
- paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
punpckhbw xmm5, xmm2 ; axbxcxdx
punpcklbw xmm2, xmm2 ; exfxgxhx
+ punpcklbw xmm0, xmm1 ; exfxgxhx
psraw xmm5, 11 ; sign extended shift right by 3
- psraw xmm2, 11 ; sign extended shift right by 3
- packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
- punpcklbw xmm0, xmm1 ; exfxgxhx
punpckhbw xmm1, xmm1 ; axbxcxdx
+ psraw xmm2, 11 ; sign extended shift right by 3
+ packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
psraw xmm0, 11 ; sign extended shift right by 3
- psraw xmm1, 11 ; sign extended shift right by 3
+ psraw xmm1, 11 ; sign extended shift right by 3
movdqa xmm5, xmm0 ; save results
- packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw xmm5, [ones GLOBAL]
- paddsw xmm1, [ones GLOBAL]
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw xmm5, [GLOBAL(ones)]
+ paddsw xmm1, [GLOBAL(ones)]
psraw xmm5, 1 ; partial shifted one more time for 2nd tap
- psraw xmm1, 1 ; partial shifted one more time for 2nd tap
- packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
- pandn xmm4, xmm5 ; high edge variance additive
-%endmacro
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
-%macro BH_WRITEBACK 1
paddsb xmm6, xmm2 ; p0+= p0 add
- pxor xmm6, [t80 GLOBAL] ; unoffset
-%if %1
- movdqa [rsi+rax], xmm6 ; write back
-%else
- lea rsi, [rsi + rcx*2]
- lea rdi, [rdi + rcx*2]
- movq MMWORD PTR [rsi], xmm6 ; p0
- psrldq xmm6, 8
- movq MMWORD PTR [rdi], xmm6
-%endif
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-%if %1
- movdqa xmm6, [rsi+2*rax] ; p1
-%else
- movdqa xmm6, p1 ; p1
-%endif
- pxor xmm6, [t80 GLOBAL] ; reoffset
- paddsb xmm6, xmm4 ; p1+= p1 add
- pxor xmm6, [t80 GLOBAL] ; unoffset
-%if %1
- movdqa [rsi+2*rax], xmm6 ; write back
-%else
- movq MMWORD PTR [rsi + rax], xmm6 ; p1
- psrldq xmm6, 8
- movq MMWORD PTR [rdi + rax], xmm6
+%if %1 == 0
+ movdqa xmm1, p1 ; p1
+%elif %1 == 1
+ movdqa xmm1, [rsi+2*rax] ; p1
+%elif %1 == 2
+ movdqa xmm1, [rdx] ; p1
%endif
+ pandn xmm4, xmm5 ; high edge variance additive
+ pxor xmm6, [GLOBAL(t80)] ; unoffset
+ pxor xmm1, [GLOBAL(t80)] ; reoffset
psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [t80 GLOBAL] ; unoffset
-%if %1
- movdqa [rsi], xmm3 ; write back
-%else
- movq MMWORD PTR [rsi + rcx], xmm3 ; q0
- psrldq xmm3, 8
- movq MMWORD PTR [rdi + rcx], xmm3
-%endif
+ paddsb xmm1, xmm4 ; p1+= p1 add
+ pxor xmm3, [GLOBAL(t80)] ; unoffset
+
+ pxor xmm1, [GLOBAL(t80)] ; unoffset
psubsb xmm7, xmm4 ; q1-= q1 add
- pxor xmm7, [t80 GLOBAL] ; unoffset
-%if %1
- movdqa [rdi], xmm7 ; write back
-%else
+
+ pxor xmm7, [GLOBAL(t80)] ; unoffset
+%if %1 == 0
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
+ movq MMWORD PTR [rsi], xmm6 ; p0
+ movhps MMWORD PTR [rdi], xmm6
+ movq MMWORD PTR [rsi + rax], xmm1 ; p1
+ movhps MMWORD PTR [rdi + rax], xmm1
+ movq MMWORD PTR [rsi + rcx], xmm3 ; q0
+ movhps MMWORD PTR [rdi + rcx], xmm3
movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1
- psrldq xmm7, 8
- movq MMWORD PTR [rdi + rcx*2],xmm7
+ movhps MMWORD PTR [rdi + rcx*2],xmm7
+%elif %1 == 1
+ movdqa [rsi+rax], xmm6 ; write back
+ movdqa [rsi+2*rax], xmm1 ; write back
+ movdqa [rsi], xmm3 ; write back
+ movdqa [rdi], xmm7 ; write back
%endif
+
%endmacro
@@ -321,16 +307,10 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
- ; calculate breakout conditions
- LFH_FILTER_MASK 1
-
- ; calculate high edge variance
- LFH_HEV_MASK
-
- ; start work on filters
- BH_FILTER 1
- ; write back the result
- BH_WRITEBACK 1
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the result
+ B_FILTER 1
add rsp, 32
pop rsp
@@ -385,15 +365,10 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
lea rsi, [rsi + rcx]
lea rdi, [rdi + rcx]
- ; calculate breakout conditions
- LFH_FILTER_MASK 0
- ; calculate high edge variance
- LFH_HEV_MASK
-
- ; start work on filters
- BH_FILTER 0
- ; write back the result
- BH_WRITEBACK 0
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the result
+ B_FILTER 0
add rsp, 96
pop rsp
@@ -407,208 +382,191 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
ret
-%macro MBH_FILTER 1
-%if %1
- movdqa xmm2, [rsi+2*rax] ; p1
- movdqa xmm7, [rdi] ; q1
-%else
- movdqa xmm2, p1 ; p1
- movdqa xmm7, q1 ; q1
+%macro MB_FILTER_AND_WRITEBACK 1
+%if %1 == 0
+ movdqa xmm2, p1 ; p1
+ movdqa xmm7, q1 ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+
+ mov rcx, rax
+ neg rcx
+%elif %1 == 2
+ lea rdx, srct
+
+ movdqa xmm2, [rdx+32] ; p1
+ movdqa xmm7, [rdx+80] ; q1
+ movdqa xmm6, [rdx+48] ; p0
+ movdqa xmm0, [rdx+64] ; q0
%endif
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
- psubsb xmm2, xmm7 ; p1 - q1
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
- paddsb xmm2, xmm0 ; 2 * (q0 - p0)
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
- pand xmm1, xmm2 ; mask filter values we don't care about
- movdqa xmm2, xmm1 ; vp8_filter
- pand xmm2, xmm4; ; Filter2 = vp8_filter & hev
+ psubsb xmm2, xmm7 ; p1 - q1
+ movdqa xmm3, xmm0 ; q0
- movdqa xmm5, xmm2
- paddsb xmm5, [t3 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 3)
+ psubsb xmm0, xmm6 ; q0 - p0
- punpckhbw xmm7, xmm5 ; axbxcxdx
- punpcklbw xmm5, xmm5 ; exfxgxhx
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
- psraw xmm7, 11 ; sign extended shift right by 3
- psraw xmm5, 11 ; sign extended shift right by 3
-
- packsswb xmm5, xmm7 ; Filter2 >>=3;
- paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0)
- punpckhbw xmm7, xmm2 ; axbxcxdx
- punpcklbw xmm0, xmm2 ; exfxgxhx
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
- psraw xmm7, 11 ; sign extended shift right by 3
- psraw xmm0, 11 ; sign extended shift right by 3
+ pand xmm1, xmm2 ; mask filter values we don't care about
- packsswb xmm0, xmm7 ; Filter2 >>=3;
- paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2
+ movdqa xmm2, xmm1 ; vp8_filter
- psubsb xmm3, xmm0 ; qs0 =qs0 - filter1
- pandn xmm4, xmm1 ; vp8_filter&=~hev
-%endmacro
+ pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
+ pxor xmm0, xmm0
-%macro MBH_WRITEBACK 1
- ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
- ; s = vp8_signed_char_clamp(qs0 - u);
- ; *oq0 = s^0x80;
- ; s = vp8_signed_char_clamp(ps0 + u);
- ; *op0 = s^0x80;
+ pandn xmm4, xmm1 ; vp8_filter&=~hev
pxor xmm1, xmm1
- pxor xmm2, xmm2
- punpcklbw xmm1, xmm4
+ punpcklbw xmm0, xmm4 ; Filter 2 (hi)
+ movdqa xmm5, xmm2
- punpckhbw xmm2, xmm4
- pmulhw xmm1, [s27 GLOBAL]
+ punpckhbw xmm1, xmm4 ; Filter 2 (lo)
+ paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
- pmulhw xmm2, [s27 GLOBAL]
- paddw xmm1, [s63 GLOBAL]
+ pmulhw xmm1, [GLOBAL(s9)] ; Filter 2 (lo) * 9
- paddw xmm2, [s63 GLOBAL]
- psraw xmm1, 7
+ pmulhw xmm0, [GLOBAL(s9)] ; Filter 2 (hi) * 9
- psraw xmm2, 7
- packsswb xmm1, xmm2
+ punpckhbw xmm7, xmm5 ; axbxcxdx
+ paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
- psubsb xmm3, xmm1
- paddsb xmm6, xmm1
+ punpcklbw xmm5, xmm5 ; exfxgxhx
+ psraw xmm7, 11 ; sign extended shift right by 3
- pxor xmm3, [t80 GLOBAL]
- pxor xmm6, [t80 GLOBAL]
+ psraw xmm5, 11 ; sign extended shift right by 3
+ punpckhbw xmm4, xmm2 ; axbxcxdx
-%if %1
- movdqa XMMWORD PTR [rsi+rax], xmm6
- movdqa XMMWORD PTR [rsi], xmm3
-%else
- lea rsi, [rsi + rcx*2]
- lea rdi, [rdi + rcx*2]
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+ psraw xmm4, 11 ; sign extended shift right by 3
- movq MMWORD PTR [rsi], xmm6 ; p0
- psrldq xmm6, 8
- movq MMWORD PTR [rdi], xmm6
- movq MMWORD PTR [rsi + rcx], xmm3 ; q0
- psrldq xmm3, 8
- movq MMWORD PTR [rdi + rcx], xmm3
-%endif
+ packsswb xmm5, xmm7 ; Filter2 >>=3;
+ psraw xmm2, 11 ; sign extended shift right by 3
- ; roughly 2/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
- ; s = vp8_signed_char_clamp(qs1 - u);
- ; *oq1 = s^0x80;
- ; s = vp8_signed_char_clamp(ps1 + u);
- ; *op1 = s^0x80;
- pxor xmm1, xmm1
- pxor xmm2, xmm2
+ packsswb xmm2, xmm4 ; Filter1 >>=3;
+ movdqa xmm7, xmm1
- punpcklbw xmm1, xmm4
- punpckhbw xmm2, xmm4
+ paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2
+ movdqa xmm4, xmm1
- pmulhw xmm1, [s18 GLOBAL]
- pmulhw xmm2, [s18 GLOBAL]
+ psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1
+ movdqa xmm5, xmm0
- paddw xmm1, [s63 GLOBAL]
- paddw xmm2, [s63 GLOBAL]
+ movdqa xmm2, xmm5
+ paddw xmm0, [GLOBAL(s63)] ; Filter 2 (hi) * 9 + 63
- psraw xmm1, 7
- psraw xmm2, 7
+ paddw xmm1, [GLOBAL(s63)] ; Filter 2 (lo) * 9 + 63
+ paddw xmm5, xmm5 ; Filter 2 (hi) * 18
- packsswb xmm1, xmm2
+ paddw xmm7, xmm7 ; Filter 2 (lo) * 18
+ paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63
-%if %1
- movdqa xmm3, XMMWORD PTR [rdi]
- movdqa xmm6, XMMWORD PTR [rsi+rax*2] ; p1
-%else
- movdqa xmm3, q1 ; q1
- movdqa xmm6, p1 ; p1
-%endif
+ paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
+ paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
- pxor xmm3, [t80 GLOBAL]
- pxor xmm6, [t80 GLOBAL]
+ paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
+ psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
- paddsb xmm6, xmm1
- psubsb xmm3, xmm1
+ psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
+ psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
- pxor xmm6, [t80 GLOBAL]
- pxor xmm3, [t80 GLOBAL]
+ packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+ psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
-%if %1
- movdqa XMMWORD PTR [rdi], xmm3
- movdqa XMMWORD PTR [rsi+rax*2],xmm6
-%else
- movq MMWORD PTR [rsi + rcx*2],xmm3 ; q1
- psrldq xmm3, 8
- movq MMWORD PTR [rdi + rcx*2],xmm3
+ psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
+ packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
- movq MMWORD PTR [rsi + rax], xmm6 ; p1
- psrldq xmm6, 8
- movq MMWORD PTR [rdi + rax], xmm6
+ packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
+ paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3)
+
+%if %1 == 0
+ movdqa xmm5, q2 ; q2
+ movdqa xmm1, q1 ; q1
+ movdqa xmm4, p1 ; p1
+ movdqa xmm7, p2 ; p2
+
+%elif %1 == 1
+ movdqa xmm5, XMMWORD PTR [rdi+rcx] ; q2
+ movdqa xmm1, XMMWORD PTR [rdi] ; q1
+ movdqa xmm4, XMMWORD PTR [rsi+rax*2] ; p1
+ movdqa xmm7, XMMWORD PTR [rdi+rax*4] ; p2
+%elif %1 == 2
+ movdqa xmm5, XMMWORD PTR [rdx+96] ; q2
+ movdqa xmm1, XMMWORD PTR [rdx+80] ; q1
+ movdqa xmm4, XMMWORD PTR [rdx+32] ; p1
+ movdqa xmm7, XMMWORD PTR [rdx+16] ; p2
%endif
- ; roughly 1/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
- ; s = vp8_signed_char_clamp(qs2 - u);
- ; *oq2 = s^0x80;
- ; s = vp8_signed_char_clamp(ps2 + u);
- ; *op2 = s^0x80;
- pxor xmm1, xmm1
- pxor xmm2, xmm2
- punpcklbw xmm1, xmm4
- punpckhbw xmm2, xmm4
+ pxor xmm3, [GLOBAL(t80)] ; *oq0 = sq^0x80
+ pxor xmm6, [GLOBAL(t80)] ; *oq0 = sp^0x80
- pmulhw xmm1, [s9 GLOBAL]
- pmulhw xmm2, [s9 GLOBAL]
+ pxor xmm1, [GLOBAL(t80)]
+ pxor xmm4, [GLOBAL(t80)]
- paddw xmm1, [s63 GLOBAL]
- paddw xmm2, [s63 GLOBAL]
+ psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
+ paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2)
- psraw xmm1, 7
- psraw xmm2, 7
+ pxor xmm1, [GLOBAL(t80)] ; *oq1 = sq^0x80;
+ pxor xmm4, [GLOBAL(t80)] ; *op1 = sp^0x80;
- packsswb xmm1, xmm2
+ pxor xmm7, [GLOBAL(t80)]
+ pxor xmm5, [GLOBAL(t80)]
-%if %1
- movdqa xmm6, XMMWORD PTR [rdi+rax*4]
- neg rax
+ paddsb xmm7, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u)
+ psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u)
- movdqa xmm3, XMMWORD PTR [rdi+rax]
-%else
- movdqa xmm6, p2 ; p2
- movdqa xmm3, q2 ; q2
-%endif
+ pxor xmm7, [GLOBAL(t80)] ; *op2 = sp^0x80;
+ pxor xmm5, [GLOBAL(t80)] ; *oq2 = sq^0x80;
- pxor xmm6, [t80 GLOBAL]
- pxor xmm3, [t80 GLOBAL]
+%if %1 == 0
+ lea rsi, [rsi+rcx*2]
+ lea rdi, [rdi+rcx*2]
- paddsb xmm6, xmm1
- psubsb xmm3, xmm1
+ movq MMWORD PTR [rsi], xmm6 ; p0
+ movhps MMWORD PTR [rdi], xmm6
+ movq MMWORD PTR [rsi + rcx], xmm3 ; q0
+ movhps MMWORD PTR [rdi + rcx], xmm3
- pxor xmm6, [t80 GLOBAL]
- pxor xmm3, [t80 GLOBAL]
-%if %1
- movdqa XMMWORD PTR [rdi+rax ],xmm3
- neg rax
+ movq MMWORD PTR [rsi+rcx*2], xmm1 ; q1
+ movhps MMWORD PTR [rdi+rcx*2], xmm1
- movdqa XMMWORD PTR [rdi+rax*4],xmm6
-%else
- movq MMWORD PTR [rsi+rax*2], xmm6 ; p2
- psrldq xmm6, 8
- movq MMWORD PTR [rdi+rax*2], xmm6
+ movq MMWORD PTR [rsi + rax], xmm4 ; p1
+ movhps MMWORD PTR [rdi + rax], xmm4
+
+ movq MMWORD PTR [rsi+rax*2], xmm7 ; p2
+ movhps MMWORD PTR [rdi+rax*2], xmm7
lea rsi, [rsi + rcx]
lea rdi, [rdi + rcx]
- movq MMWORD PTR [rsi+rcx*2 ],xmm3 ; q2
- psrldq xmm3, 8
- movq MMWORD PTR [rdi+rcx*2 ],xmm3
+ movq MMWORD PTR [rsi+rcx*2], xmm5 ; q2
+ movhps MMWORD PTR [rdi+rcx*2], xmm5
+%elif %1 == 1
+ movdqa XMMWORD PTR [rdi+rcx], xmm5 ; q2
+ movdqa XMMWORD PTR [rdi], xmm1 ; q1
+ movdqa XMMWORD PTR [rsi], xmm3 ; q0
+ movdqa XMMWORD PTR [rsi+rax ],xmm6 ; p0
+ movdqa XMMWORD PTR [rsi+rax*2],xmm4 ; p1
+ movdqa XMMWORD PTR [rdi+rax*4],xmm7 ; p2
+%elif %1 == 2
+ movdqa XMMWORD PTR [rdx+80], xmm1 ; q1
+ movdqa XMMWORD PTR [rdx+64], xmm3 ; q0
+ movdqa XMMWORD PTR [rdx+48], xmm6 ; p0
+ movdqa XMMWORD PTR [rdx+32], xmm4 ; p1
%endif
+
%endmacro
@@ -645,16 +603,10 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2):
lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
- ; calculate breakout conditions
- LFH_FILTER_MASK 1
-
- ; calculate high edge variance
- LFH_HEV_MASK
-
- ; start work on filters
- MBH_FILTER 1
- ; write back the result
- MBH_WRITEBACK 1
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 1
add rsp, 32
pop rsp
@@ -709,16 +661,10 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
lea rsi, [rsi + rcx]
lea rdi, [rdi + rcx]
- ; calculate breakout conditions
- LFH_FILTER_MASK 0
-
- ; calculate high edge variance
- LFH_HEV_MASK
-
- ; start work on filters
- MBH_FILTER 0
- ; write back the result
- MBH_WRITEBACK 0
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 0
add rsp, 96
pop rsp
@@ -732,64 +678,80 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
ret
-%macro TRANSPOSE_16X8_1 0
- movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
- movq xmm7, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
-
- punpcklbw xmm4, xmm7 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+%macro TRANSPOSE_16X8 2
+ movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+ movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+ movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
- movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+ punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
- movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+ movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
- movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
- movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+ movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
- movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+%if %1
+ lea rsi, [rsi+rax*8]
+%else
+ mov rsi, arg(5) ; v_ptr
+%endif
- movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-
punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+
punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+%if %1
+ lea rdi, [rdi+rax*8]
+%else
+ lea rsi, [rsi - 4]
+%endif
punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+%if %1
+ lea rdx, srct
+%else
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+%endif
+
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
- movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
movdqa t0, xmm2 ; save to free XMM2
-%endmacro
+ movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+ movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+ movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
-%macro TRANSPOSE_16X8_2 1
- movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
- movq xmm5, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+ punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
- punpcklbw xmm2, xmm5 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
- movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
- movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
- movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
- movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+ movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
- movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
movdqa xmm6, xmm1 ;
@@ -799,92 +761,97 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
movdqa xmm0, xmm5
punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
- punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
-%if %1
+%if %2
movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
movdqa [rdx], xmm2 ; save 2
movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
movdqa [rdx+16], xmm3 ; save 3
+
punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
movdqa [rdx+32], xmm4 ; save 4
movdqa [rdx+48], xmm5 ; save 5
-
movdqa xmm1, t0 ; get
- movdqa xmm2, xmm1 ;
+ movdqa xmm2, xmm1 ;
punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+
punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
movdqa [rdx+112], xmm7 ; save 7
- movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
movdqa [rdx+96], xmm6 ; save 6
- punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
movdqa [rdx+32], xmm2 ; save 2
movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
movdqa [rdx+48], xmm3 ; save 3
+
punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
movdqa [rdx+64], xmm4 ; save 4
movdqa [rdx+80], xmm5 ; save 5
-
movdqa xmm1, t0 ; get
- movdqa xmm2, xmm1
+ movdqa xmm2, xmm1
punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+
punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
movdqa [rdx+16], xmm1
+
movdqa [rdx], xmm2
%endif
%endmacro
-%macro LFV_FILTER_MASK 1
+%macro LFV_FILTER_MASK_HEV_MASK 1
movdqa xmm0, xmm6 ; q2
psubusb xmm0, xmm7 ; q2-q3
psubusb xmm7, xmm6 ; q3-q2
- por xmm7, xmm0 ; abs (q3-q2)
-
movdqa xmm4, xmm5 ; q1
- psubusb xmm4, xmm6 ; q1-q2
- psubusb xmm6, xmm5 ; q2-q1
- por xmm6, xmm4 ; abs (q2-q1)
+ por xmm7, xmm0 ; abs (q3-q2)
+ psubusb xmm4, xmm6 ; q1-q2
movdqa xmm0, xmm1
+ psubusb xmm6, xmm5 ; q2-q1
+ por xmm6, xmm4 ; abs (q2-q1)
psubusb xmm0, xmm2 ; p2 - p3;
- psubusb xmm2, xmm1 ; p3 - p2;
+ psubusb xmm2, xmm1 ; p3 - p2;
por xmm0, xmm2 ; abs(p2-p3)
%if %1
movdqa xmm2, [rdx] ; p1
@@ -892,39 +859,29 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
movdqa xmm2, [rdx+32] ; p1
%endif
movdqa xmm5, xmm2 ; p1
+ pmaxub xmm0, xmm7
psubusb xmm5, xmm1 ; p1-p2
psubusb xmm1, xmm2 ; p2-p1
- por xmm1, xmm5 ; abs(p2-p1)
-
- mov rdx, arg(3) ; limit
- movdqa xmm4, [rdx] ; limit
-
- psubusb xmm7, xmm4
-
- psubusb xmm0, xmm4 ; abs(p3-p2) > limit
- psubusb xmm1, xmm4 ; abs(p2-p1) > limit
-
- psubusb xmm6, xmm4 ; abs(q2-q1) > limit
- por xmm7, xmm6 ; or
+ movdqa xmm7, xmm3 ; p0
+ psubusb xmm7, xmm2 ; p0-p1
- por xmm0, xmm1
- por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+ por xmm1, xmm5 ; abs(p2-p1)
+ pmaxub xmm0, xmm6
+ pmaxub xmm0, xmm1
movdqa xmm1, xmm2 ; p1
- movdqa xmm7, xmm3 ; p0
- psubusb xmm7, xmm2 ; p0-p1
-
psubusb xmm2, xmm3 ; p1-p0
+ lea rdx, srct
+
por xmm2, xmm7 ; abs(p1-p0)
movdqa t0, xmm2 ; save abs(p1-p0)
- lea rdx, srct
- psubusb xmm2, xmm4 ; abs(p1-p0)>limit
- por xmm0, xmm2 ; mask
+ pmaxub xmm0, xmm2
+
%if %1
movdqa xmm5, [rdx+32] ; q0
movdqa xmm7, [rdx+48] ; q1
@@ -932,133 +889,70 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
movdqa xmm5, [rdx+64] ; q0
movdqa xmm7, [rdx+80] ; q1
%endif
+ mov rdx, arg(3) ; limit
+
movdqa xmm6, xmm5 ; q0
movdqa xmm2, xmm7 ; q1
- psubusb xmm5, xmm7 ; q0-q1
+ psubusb xmm5, xmm7 ; q0-q1
psubusb xmm7, xmm6 ; q1-q0
+
por xmm7, xmm5 ; abs(q1-q0)
movdqa t1, xmm7 ; save abs(q1-q0)
- psubusb xmm7, xmm4 ; abs(q1-q0)> limit
- por xmm0, xmm7 ; mask
+ movdqa xmm4, XMMWORD PTR [rdx]; limit
+
+ pmaxub xmm0, xmm7
+ mov rdx, arg(2) ; flimit
+ psubusb xmm0, xmm4
movdqa xmm5, xmm2 ; q1
+
psubusb xmm5, xmm1 ; q1-=p1
psubusb xmm1, xmm2 ; p1-=q1
- por xmm5, xmm1 ; abs(p1-q1)
- pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero
- psrlw xmm5, 1 ; abs(p1-q1)/2
-
- mov rdx, arg(2) ; flimit
- movdqa xmm2, [rdx] ; flimit
+ por xmm5, xmm1 ; abs(p1-q1)
movdqa xmm1, xmm3 ; p0
- movdqa xmm7, xmm6 ; q0
- psubusb xmm1, xmm7 ; p0-q0
- psubusb xmm7, xmm3 ; q0-p0
- por xmm1, xmm7 ; abs(q0-p0)
- paddusb xmm1, xmm1 ; abs(q0-p0)*2
- paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- paddb xmm2, xmm2 ; flimit*2 (less than 255)
- paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255)
-
- psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
- por xmm1, xmm0; ; mask
- pxor xmm0, xmm0
- pcmpeqb xmm1, xmm0
-%endmacro
-
-%macro LFV_HEV_MASK 0
- mov rdx, arg(4) ; get thresh
- movdqa xmm7, XMMWORD PTR [rdx]
- movdqa xmm4, t0 ; get abs (q1 - q0)
- psubusb xmm4, xmm7 ; abs(q1 - q0) > thresh
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
- psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
+ pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psubusb xmm1, xmm6 ; p0-q0
- por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb xmm4, xmm0
-
- pcmpeqb xmm0, xmm0
- pxor xmm4, xmm0
-%endmacro
-
-%macro BV_FILTER 0
- lea rdx, srct
-
- movdqa xmm2, [rdx] ; p1 lea rsi, [rsi+rcx*8]
- movdqa xmm7, [rdx+48] ; q1
- movdqa xmm6, [rdx+16] ; p0
- movdqa xmm0, [rdx+32] ; q0
-
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
- pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
-
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
-
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
-
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
-
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- movdqa xmm2, xmm1
- paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-
- paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
- punpckhbw xmm5, xmm2
- punpcklbw xmm2, xmm2
+ psrlw xmm5, 1 ; abs(p1-q1)/2
+ psubusb xmm6, xmm3 ; q0-p0
- psraw xmm5, 11
- psraw xmm2, 11
+ movdqa xmm2, XMMWORD PTR [rdx]; flimit
- packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
- punpcklbw xmm0, xmm1 ; exfxgxhx
+ mov rdx, arg(4) ; get thresh
- punpckhbw xmm1, xmm1 ; axbxcxdx
- psraw xmm0, 11 ; sign extended shift right by 3
+ por xmm1, xmm6 ; abs(q0-p0)
+ paddb xmm2, xmm2 ; flimit*2 (less than 255)
- psraw xmm1, 11 ; sign extended shift right by 3
- movdqa xmm5, xmm0 ; save results
+ movdqa xmm6, t0 ; get abs (q1 - q0)
- packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw xmm5, [ones GLOBAL]
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2
- paddsw xmm1, [ones GLOBAL]
- psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+ movdqa xmm3, t1 ; get abs (p1 - p0)
- psraw xmm1, 1 ; partial shifted one more time for 2nd tap
- packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ movdqa xmm7, XMMWORD PTR [rdx]
- pandn xmm4, xmm5 ; high edge variance additive
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh
- paddsb xmm6, xmm2 ; p0+= p0 add
- pxor xmm6, [t80 GLOBAL] ; unoffset
+ paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255)
+ psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
- movdqa xmm1, [rdx] ; p1
- pxor xmm1, [t80 GLOBAL] ; reoffset
+ psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+ por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- paddsb xmm1, xmm4 ; p1+= p1 add
- pxor xmm1, [t80 GLOBAL] ; unoffset
+ por xmm1, xmm0 ; mask
+ pcmpeqb xmm6, xmm0
- psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [t80 GLOBAL] ; unoffset
+ pxor xmm0, xmm0
+ pcmpeqb xmm4, xmm4
- psubsb xmm7, xmm4 ; q1-= q1 add
- pxor xmm7, [t80 GLOBAL] ; unoffset
+ pcmpeqb xmm1, xmm0
+ pxor xmm4, xmm6
%endmacro
%macro BV_TRANSPOSE 0
@@ -1073,6 +967,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
@@ -1082,6 +977,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
@@ -1148,20 +1044,13 @@ sym(vp8_loop_filter_vertical_edge_sse2):
lea rcx, [rax*2+rax]
;transpose 16x8 to 8x16, and store the 8-line result on stack.
- TRANSPOSE_16X8_1
+ TRANSPOSE_16X8 1, 1
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
- lea rdx, srct
- TRANSPOSE_16X8_2 1
-
- ; calculate filter mask
- LFV_FILTER_MASK 1
- ; calculate high edge variance
- LFV_HEV_MASK
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK 1
; start work on filters
- BV_FILTER
+ B_FILTER 2
; tranpose and write back - only work on q1, q0, p0, p1
BV_TRANSPOSE
@@ -1221,23 +1110,16 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
lea rcx, [rax+2*rax]
- ;transpose 16x8 to 8x16, and store the 8-line result on stack.
- TRANSPOSE_16X8_1
-
- mov rsi, arg(5) ; v_ptr
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
-
lea rdx, srct
- TRANSPOSE_16X8_2 1
- ; calculate filter mask
- LFV_FILTER_MASK 1
- ; calculate high edge variance
- LFV_HEV_MASK
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 0, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK 1
; start work on filters
- BV_FILTER
+ B_FILTER 2
; tranpose and write back - only work on q1, q0, p0, p1
BV_TRANSPOSE
@@ -1263,174 +1145,12 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
pop rbp
ret
-
-%macro MBV_FILTER 0
- lea rdx, srct
-
- movdqa xmm2, [rdx+32] ; p1
- movdqa xmm7, [rdx+80] ; q1
- movdqa xmm6, [rdx+48] ; p0
- movdqa xmm0, [rdx+64] ; q0
-
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
-
- movdqa xmm3, xmm0 ; q0
-
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
-
- paddsb xmm2, xmm0 ; 2 * (q0 - p0)
- paddsb xmm2, xmm0 ; 3 * (q0 - p0)+ (p1 - q1)
-
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- movdqa xmm2, xmm1 ; vp8_filter
- pand xmm2, xmm4; ; Filter2 = vp8_filter & hev
-
- movdqa xmm5, xmm2
- paddsb xmm5, [t3 GLOBAL]
-
- punpckhbw xmm7, xmm5 ; axbxcxdx
- punpcklbw xmm5, xmm5 ; exfxgxhx
-
- psraw xmm7, 11 ; sign extended shift right by 3
- psraw xmm5, 11 ; sign extended shift right by 3
-
- packsswb xmm5, xmm7 ; Filter2 >>=3;
-
- paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
-
- punpcklbw xmm0, xmm2 ; exfxgxhx
- punpckhbw xmm7, xmm2 ; axbxcxdx
-
- psraw xmm0, 11 ; sign extended shift right by 3
- psraw xmm7, 11 ; sign extended shift right by 3
-
- packsswb xmm0, xmm7 ; Filter2 >>=3;
-
- psubsb xmm3, xmm0 ; qs0 =qs0 - filter1
- paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2
-
- ; vp8_filter &= ~hev;
- ; Filter2 = vp8_filter;
- pandn xmm4, xmm1 ; vp8_filter&=~hev
-
- ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
- ; s = vp8_signed_char_clamp(qs0 - u);
- ; *oq0 = s^0x80;
- ; s = vp8_signed_char_clamp(ps0 + u);
- ; *op0 = s^0x80;
- pxor xmm1, xmm1
-
- pxor xmm2, xmm2
- punpcklbw xmm1, xmm4
-
- punpckhbw xmm2, xmm4
- pmulhw xmm1, [s27 GLOBAL]
-
- pmulhw xmm2, [s27 GLOBAL]
- paddw xmm1, [s63 GLOBAL]
-
- paddw xmm2, [s63 GLOBAL]
- psraw xmm1, 7
-
- psraw xmm2, 7
- packsswb xmm1, xmm2
-
- psubsb xmm3, xmm1
- paddsb xmm6, xmm1
-
- pxor xmm3, [t80 GLOBAL]
- pxor xmm6, [t80 GLOBAL]
-
- movdqa [rdx+48], xmm6
- movdqa [rdx+64], xmm3
-
- ; roughly 2/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
- ; s = vp8_signed_char_clamp(qs1 - u);
- ; *oq1 = s^0x80;
- ; s = vp8_signed_char_clamp(ps1 + u);
- ; *op1 = s^0x80;
- pxor xmm1, xmm1
- pxor xmm2, xmm2
-
- punpcklbw xmm1, xmm4
- punpckhbw xmm2, xmm4
-
- pmulhw xmm1, [s18 GLOBAL]
- pmulhw xmm2, [s18 GLOBAL]
-
- paddw xmm1, [s63 GLOBAL]
- paddw xmm2, [s63 GLOBAL]
-
- psraw xmm1, 7
- psraw xmm2, 7
-
- packsswb xmm1, xmm2
-
- movdqa xmm3, [rdx + 80] ; q1
- movdqa xmm6, [rdx + 32] ; p1
-
- pxor xmm3, [t80 GLOBAL]
- pxor xmm6, [t80 GLOBAL]
-
- paddsb xmm6, xmm1
- psubsb xmm3, xmm1
-
- pxor xmm6, [t80 GLOBAL]
- pxor xmm3, [t80 GLOBAL]
-
- movdqa [rdx + 80], xmm3
- movdqa [rdx + 32], xmm6
-
- ; roughly 1/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
- ; s = vp8_signed_char_clamp(qs2 - u);
- ; *oq2 = s^0x80;
- ; s = vp8_signed_char_clamp(ps2 + u);
- ; *op2 = s^0x80;
- pxor xmm1, xmm1
- pxor xmm2, xmm2
-
- punpcklbw xmm1, xmm4
- punpckhbw xmm2, xmm4
-
- pmulhw xmm1, [s9 GLOBAL]
- pmulhw xmm2, [s9 GLOBAL]
-
- paddw xmm1, [s63 GLOBAL]
- paddw xmm2, [s63 GLOBAL]
-
- psraw xmm1, 7
- psraw xmm2, 7
-
- packsswb xmm1, xmm2
-
- movdqa xmm6, [rdx+16]
- movdqa xmm3, [rdx+96]
-
- pxor xmm6, [t80 GLOBAL]
- pxor xmm3, [t80 GLOBAL]
-
- paddsb xmm6, xmm1
- psubsb xmm3, xmm1
-
- pxor xmm6, [t80 GLOBAL] ; xmm6 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- pxor xmm3, [t80 GLOBAL] ; xmm3 = f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 15 06
-%endmacro
-
%macro MBV_TRANSPOSE 0
movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+ punpcklbw xmm0, xmm7 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm7 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
@@ -1438,10 +1158,10 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
- movdqa xmm5, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
- punpckhwd xmm5, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ punpckhwd xmm3, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
@@ -1450,7 +1170,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
- movdqa xmm6, xmm3 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+ movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
@@ -1464,71 +1184,54 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
%endmacro
%macro MBV_WRITEBACK_1 0
- movq QWORD PTR [rsi], xmm0
- psrldq xmm0, 8
+ movq QWORD PTR [rsi], xmm0
+ movhps MMWORD PTR [rdi], xmm0
- movq QWORD PTR [rdi], xmm0
+ movq QWORD PTR [rsi+2*rax], xmm6
+ movhps MMWORD PTR [rdi+2*rax], xmm6
- movq QWORD PTR [rsi+2*rax], xmm6
- psrldq xmm6, 8
-
- movq QWORD PTR [rdi+2*rax], xmm6
-
- movdqa xmm0, xmm5 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
- punpckhdq xmm5, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
-
- movq QWORD PTR [rsi+4*rax], xmm0
- psrldq xmm0, 8
+ punpckhdq xmm3, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
- movq QWORD PTR [rdi+4*rax], xmm0
+ movq QWORD PTR [rsi+4*rax], xmm0
+ movhps MMWORD PTR [rdi+4*rax], xmm0
- movq QWORD PTR [rsi+2*rcx], xmm5
- psrldq xmm5, 8
-
- movq QWORD PTR [rdi+2*rcx], xmm5
+ movq QWORD PTR [rsi+2*rcx], xmm3
+ movhps MMWORD PTR [rdi+2*rcx], xmm3
movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
- punpckhbw xmm3, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+ punpckhbw xmm5, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
movdqa xmm0, xmm2
- punpcklwd xmm0, xmm3 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
- punpckhwd xmm2, xmm3 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+ punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+ punpckhwd xmm2, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
- movdqa xmm3, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80
- punpckhdq xmm3, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+ punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro
%macro MBV_WRITEBACK_2 0
- movq QWORD PTR [rsi], xmm1
- psrldq xmm1, 8
-
- movq QWORD PTR [rdi], xmm1
-
- movq QWORD PTR [rsi+2*rax], xmm3
- psrldq xmm3, 8
+ movq QWORD PTR [rsi], xmm1
+ movhps MMWORD PTR [rdi], xmm1
- movq QWORD PTR [rdi+2*rax], xmm3
+ movq QWORD PTR [rsi+2*rax], xmm5
+ movhps MMWORD PTR [rdi+2*rax], xmm5
movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
-
punpckhdq xmm4, xmm2 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
- movq QWORD PTR [rsi+4*rax], xmm1
-
- psrldq xmm1, 8
- movq QWORD PTR [rdi+4*rax], xmm1
+ movq QWORD PTR [rsi+4*rax], xmm1
+ movhps MMWORD PTR [rdi+4*rax], xmm1
- movq QWORD PTR [rsi+2*rcx], xmm4
- psrldq xmm4, 8
-
- movq QWORD PTR [rdi+2*rcx], xmm4
+ movq QWORD PTR [rsi+2*rcx], xmm4
+ movhps MMWORD PTR [rdi+2*rcx], xmm4
%endmacro
@@ -1566,21 +1269,14 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
lea rcx, [rax*2+rax]
; Transpose
- TRANSPOSE_16X8_1
+ TRANSPOSE_16X8 1, 0
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
- lea rdx, srct
- TRANSPOSE_16X8_2 0
-
- ; calculate filter mask
- LFV_FILTER_MASK 0
- ; calculate high edge variance
- LFV_HEV_MASK
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK 0
neg rax
; start work on filters
- MBV_FILTER
+ MB_FILTER_AND_WRITEBACK 2
lea rsi, [rsi+rax*8]
lea rdi, [rdi+rax*8]
@@ -1641,23 +1337,16 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
lea rcx, [rax+2*rax]
- ; Transpose
- TRANSPOSE_16X8_1
-
- ; XMM3 XMM4 XMM7 in use
- mov rsi, arg(5) ; v_ptr
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax]
lea rdx, srct
- TRANSPOSE_16X8_2 0
- ; calculate filter mask
- LFV_FILTER_MASK 0
- ; calculate high edge variance
- LFV_HEV_MASK
+ ; Transpose
+ TRANSPOSE_16X8 0, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK 0
; start work on filters
- MBV_FILTER
+ MB_FILTER_AND_WRITEBACK 2
; transpose and write back
MBV_TRANSPOSE
@@ -1726,7 +1415,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
psubusb xmm0, xmm1 ; q1-=p1
psubusb xmm1, xmm4 ; p1-=q1
por xmm1, xmm0 ; abs(p1-q1)
- pand xmm1, [tfe GLOBAL] ; set lsb of each byte to zero
+ pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw xmm1, 1 ; abs(p1-q1)/2
movdqu xmm5, [rsi+rax] ; p0
@@ -1744,12 +1433,12 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
pcmpeqb xmm5, xmm3
; start work on filters
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
psubsb xmm2, xmm7 ; p1 - q1
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
movdqa xmm3, xmm0 ; q0
psubsb xmm0, xmm6 ; q0 - p0
paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
@@ -1758,7 +1447,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
pand xmm5, xmm2 ; mask filter values we don't care about
; do + 4 side
- paddsb xmm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4
+ paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
@@ -1771,11 +1460,11 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
por xmm0, xmm1 ; put the two together to get result
psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [t80 GLOBAL] ; unoffset
+ pxor xmm3, [GLOBAL(t80)] ; unoffset
movdqu [rsi], xmm3 ; write back
; now do +3 side
- psubsb xmm5, [t1s GLOBAL] ; +3 instead of +4
+ psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
@@ -1787,7 +1476,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
paddsb xmm6, xmm0 ; p0+= p0 add
- pxor xmm6, [t80 GLOBAL] ; unoffset
+ pxor xmm6, [GLOBAL(t80)] ; unoffset
movdqu [rsi+rax], xmm6 ; write back
; begin epilog
@@ -1907,7 +1596,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
psubusb xmm7, xmm0 ; q1-=p1
psubusb xmm6, xmm3 ; p1-=q1
por xmm6, xmm7 ; abs(p1-q1)
- pand xmm6, [tfe GLOBAL] ; set lsb of each byte to zero
+ pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw xmm6, 1 ; abs(p1-q1)/2
movdqa xmm5, xmm1 ; p0
@@ -1933,16 +1622,16 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
movdqa t0, xmm0
movdqa t1, xmm3
- pxor xmm0, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm3, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
psubsb xmm0, xmm3 ; p1 - q1
movdqa xmm6, xmm1 ; p0
movdqa xmm7, xmm2 ; q0
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values
movdqa xmm3, xmm7 ; offseted ; q0
psubsb xmm7, xmm6 ; q0 - p0
@@ -1954,7 +1643,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
pand xmm5, xmm0 ; mask filter values we don't care about
- paddsb xmm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4
+ paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
@@ -1969,10 +1658,10 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
por xmm0, xmm7 ; put the two together to get result
psubsb xmm3, xmm0 ; q0-= q0sz add
- pxor xmm3, [t80 GLOBAL] ; unoffset q0
+ pxor xmm3, [GLOBAL(t80)] ; unoffset q0
; now do +3 side
- psubsb xmm5, [t1s GLOBAL] ; +3 instead of +4
+ psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
movdqa xmm0, xmm5 ; get a copy of filters
psllw xmm0, 8 ; shift left 8
@@ -1985,7 +1674,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
por xmm0, xmm5 ; put the two together to get result
paddsb xmm6, xmm0 ; p0+= p0 add
- pxor xmm6, [t80 GLOBAL] ; unoffset p0
+ pxor xmm6, [GLOBAL(t80)] ; unoffset p0
movdqa xmm0, t0 ; p1
movdqa xmm4, t1 ; q1
@@ -2084,12 +1773,6 @@ align 16
ones:
times 8 dw 0x0001
align 16
-s27:
- times 8 dw 0x1b00
-align 16
-s18:
- times 8 dw 0x1200
-align 16
s9:
times 8 dw 0x0900
align 16
diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c
index 3ff8c4e12..93107e179 100644
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -40,7 +40,7 @@ extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
#if HAVE_MMX
-// Horizontal MB filtering
+/* Horizontal MB filtering */
void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -66,7 +66,7 @@ void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign
}
-// Vertical MB Filtering
+/* Vertical MB Filtering */
void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -92,7 +92,7 @@ void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign
}
-// Horizontal B Filtering
+/* Horizontal B Filtering */
void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -122,7 +122,7 @@ void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
}
-// Vertical B Filtering
+/* Vertical B Filtering */
void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -153,7 +153,7 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
#endif
-// Horizontal MB filtering
+/* Horizontal MB filtering */
#if HAVE_SSE2
void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
@@ -177,7 +177,7 @@ void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig
}
-// Vertical MB Filtering
+/* Vertical MB Filtering */
void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -200,7 +200,7 @@ void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig
}
-// Horizontal B Filtering
+/* Horizontal B Filtering */
void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@@ -227,7 +227,7 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
}
-// Vertical B Filtering
+/* Vertical B Filtering */
void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
index 349ac0d3b..787e83268 100644
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -37,16 +37,16 @@ sym(vp8_post_proc_down_and_across_mmx):
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
; move the global rd onto the stack, since we don't have enough registers
; to do PIC addressing
- movq mm0, [rd GLOBAL]
+ movq mm0, [GLOBAL(rd)]
sub rsp, 8
movq [rsp], mm0
%define RD [rsp]
%else
-%define RD [rd GLOBAL]
+%define RD [GLOBAL(rd)]
%endif
push rbx
- lea rbx, [Blur GLOBAL]
+ lea rbx, [GLOBAL(Blur)]
movd mm2, dword ptr arg(6) ;flimit
punpcklwd mm2, mm2
punpckldq mm2, mm2
@@ -286,7 +286,7 @@ sym(vp8_mbpost_proc_down_mmx):
%define flimit2 [rsp+128]
%if ABI_IS_32BIT=0
- lea r8, [sym(vp8_rv) GLOBAL]
+ lea r8, [GLOBAL(sym(vp8_rv))]
%endif
;rows +=8;
@@ -404,7 +404,7 @@ loop_row:
and rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
push rax
- lea rax, [sym(vp8_rv) GLOBAL]
+ lea rax, [GLOBAL(sym(vp8_rv))]
movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2]
pop rax
%elif ABI_IS_32BIT=0
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index 276f208ff..30b4bf53a 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -36,12 +36,12 @@ sym(vp8_post_proc_down_and_across_xmm):
ALIGN_STACK 16, rax
; move the global rd onto the stack, since we don't have enough registers
; to do PIC addressing
- movdqa xmm0, [rd42 GLOBAL]
+ movdqa xmm0, [GLOBAL(rd42)]
sub rsp, 16
movdqa [rsp], xmm0
%define RD42 [rsp]
%else
-%define RD42 [rd42 GLOBAL]
+%define RD42 [GLOBAL(rd42)]
%endif
@@ -275,7 +275,7 @@ sym(vp8_mbpost_proc_down_xmm):
%define flimit4 [rsp+128]
%if ABI_IS_32BIT=0
- lea r8, [sym(vp8_rv) GLOBAL]
+ lea r8, [GLOBAL(sym(vp8_rv))]
%endif
;rows +=8;
@@ -393,7 +393,7 @@ loop_row:
and rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
push rax
- lea rax, [sym(vp8_rv) GLOBAL]
+ lea rax, [GLOBAL(sym(vp8_rv))]
movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
pop rax
%elif ABI_IS_32BIT=0
@@ -579,7 +579,7 @@ nextcol4:
punpcklwd xmm1, xmm0
paddd xmm1, xmm6
- paddd xmm1, [four8s GLOBAL]
+ paddd xmm1, [GLOBAL(four8s)]
psrad xmm1, 4
packssdw xmm1, xmm0
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index 06db0c6a0..23ed4e208 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -84,7 +84,7 @@ nextrow:
pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers
paddsw mm3, mm5 ; mm3 += mm5
- paddsw mm3, [rd GLOBAL] ; mm3 += round value
+ paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
packuswb mm3, mm0 ; pack and unpack to saturate
punpcklbw mm3, mm0 ;
@@ -136,7 +136,7 @@ sym(vp8_filter_block1d_v6_mmx):
push rdi
; end prolog
- movq mm5, [rd GLOBAL]
+ movq mm5, [GLOBAL(rd)]
push rbx
mov rbx, arg(6) ;vp8_filter
movq mm1, [rbx + 16] ; do both the negative taps first!!!
@@ -225,7 +225,7 @@ sym(vp8_filter_block1dc_v6_mmx):
push rdi
; end prolog
- movq mm5, [rd GLOBAL]
+ movq mm5, [GLOBAL(rd)]
push rbx
mov rbx, arg(7) ;vp8_filter
movq mm1, [rbx + 16] ; do both the negative taps first!!!
@@ -320,7 +320,7 @@ sym(vp8_bilinear_predict8x8_mmx):
mov rdi, arg(4) ;dst_ptr ;
shl rax, 5 ; offset * 32
- lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
add rax, rcx ; HFilter
mov rsi, arg(0) ;src_ptr ;
@@ -363,10 +363,10 @@ sym(vp8_bilinear_predict8x8_mmx):
paddw mm3, mm5 ;
paddw mm4, mm6 ;
- paddw mm3, [rd GLOBAL] ; xmm3 += round value
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw mm4, [rd GLOBAL] ;
+ paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
movq mm7, mm3 ;
@@ -404,10 +404,10 @@ next_row_8x8:
pmullw mm5, [rax] ;
pmullw mm6, [rax] ;
- paddw mm3, [rd GLOBAL] ; xmm3 += round value
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw mm4, [rd GLOBAL] ;
+ paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
movq mm7, mm3 ;
@@ -421,10 +421,10 @@ next_row_8x8:
paddw mm4, mm6 ;
- paddw mm3, [rd GLOBAL] ; xmm3 += round value
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw mm4, [rd GLOBAL] ;
+ paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
packuswb mm3, mm4
@@ -476,7 +476,7 @@ sym(vp8_bilinear_predict8x4_mmx):
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
- lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
shl rax, 5
mov rsi, arg(0) ;src_ptr ;
@@ -518,10 +518,10 @@ sym(vp8_bilinear_predict8x4_mmx):
paddw mm3, mm5 ;
paddw mm4, mm6 ;
- paddw mm3, [rd GLOBAL] ; xmm3 += round value
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw mm4, [rd GLOBAL] ;
+ paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
movq mm7, mm3 ;
@@ -559,10 +559,10 @@ next_row_8x4:
pmullw mm5, [rax] ;
pmullw mm6, [rax] ;
- paddw mm3, [rd GLOBAL] ; xmm3 += round value
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw mm4, [rd GLOBAL] ;
+ paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
movq mm7, mm3 ;
@@ -576,10 +576,10 @@ next_row_8x4:
paddw mm4, mm6 ;
- paddw mm3, [rd GLOBAL] ; xmm3 += round value
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw mm4, [rd GLOBAL] ;
+ paddw mm4, [GLOBAL(rd)] ;
psraw mm4, VP8_FILTER_SHIFT ;
packuswb mm3, mm4
@@ -631,7 +631,7 @@ sym(vp8_bilinear_predict4x4_mmx):
movsxd rax, dword ptr arg(2) ;xoffset
mov rdi, arg(4) ;dst_ptr ;
- lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
shl rax, 5
add rax, rcx ; HFilter
@@ -662,7 +662,7 @@ sym(vp8_bilinear_predict4x4_mmx):
pmullw mm5, mm2 ;
paddw mm3, mm5 ;
- paddw mm3, [rd GLOBAL] ; xmm3 += round value
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
@@ -686,7 +686,7 @@ next_row_4x4:
punpcklbw mm5, mm0 ;
pmullw mm5, [rax] ;
- paddw mm3, [rd GLOBAL] ; xmm3 += round value
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
movq mm7, mm3 ;
@@ -697,7 +697,7 @@ next_row_4x4:
paddw mm3, mm5 ;
- paddw mm3, [rd GLOBAL] ; xmm3 += round value
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
packuswb mm3, mm0
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index 2385abfd0..b87cad259 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -107,7 +107,7 @@ filter_block1d8_h6_rowloop:
paddsw xmm4, xmm6
paddsw xmm4, xmm1
- paddsw xmm4, [rd GLOBAL]
+ paddsw xmm4, [GLOBAL(rd)]
psraw xmm4, 7
@@ -231,7 +231,7 @@ filter_block1d16_h6_sse2_rowloop:
paddsw xmm4, xmm6
paddsw xmm4, xmm1
- paddsw xmm4, [rd GLOBAL]
+ paddsw xmm4, [GLOBAL(rd)]
psraw xmm4, 7
@@ -284,7 +284,7 @@ filter_block1d16_h6_sse2_rowloop:
paddsw xmm4, xmm6
paddsw xmm4, xmm2
- paddsw xmm4, [rd GLOBAL]
+ paddsw xmm4, [GLOBAL(rd)]
psraw xmm4, 7
@@ -351,7 +351,7 @@ sym(vp8_filter_block1d8_v6_sse2):
movsxd rcx, DWORD PTR arg(5) ;[output_height]
pxor xmm0, xmm0 ; clear xmm0
- movdqa xmm7, XMMWORD PTR [rd GLOBAL]
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
%if ABI_IS_32BIT=0
movsxd r8, dword ptr arg(2) ; dst_ptich
%endif
@@ -489,7 +489,7 @@ vp8_filter_block1d16_v6_sse2_loop:
pmullw xmm5, [rax + 80]
pmullw xmm6, [rax + 80]
- movdqa xmm7, XMMWORD PTR [rd GLOBAL]
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
pxor xmm0, xmm0 ; clear xmm0
paddsw xmm1, xmm3
@@ -608,7 +608,7 @@ filter_block1d8_h6_only_rowloop:
paddsw xmm4, xmm6
paddsw xmm4, xmm1
- paddsw xmm4, [rd GLOBAL]
+ paddsw xmm4, [GLOBAL(rd)]
psraw xmm4, 7
@@ -723,7 +723,7 @@ filter_block1d16_h6_only_sse2_rowloop:
paddsw xmm4, xmm6
paddsw xmm4, xmm1
- paddsw xmm4, [rd GLOBAL]
+ paddsw xmm4, [GLOBAL(rd)]
psraw xmm4, 7
@@ -773,7 +773,7 @@ filter_block1d16_h6_only_sse2_rowloop:
paddsw xmm4, xmm6
paddsw xmm4, xmm2
- paddsw xmm4, [rd GLOBAL]
+ paddsw xmm4, [GLOBAL(rd)]
psraw xmm4, 7
@@ -832,7 +832,7 @@ sym(vp8_filter_block1d8_v6_only_sse2):
pxor xmm0, xmm0 ; clear xmm0
- movdqa xmm7, XMMWORD PTR [rd GLOBAL]
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
%if ABI_IS_32BIT=0
movsxd r8, dword ptr arg(3) ; dst_ptich
%endif
@@ -978,7 +978,7 @@ sym(vp8_bilinear_predict16x16_sse2):
;const short *HFilter = bilinear_filters_mmx[xoffset]
;const short *VFilter = bilinear_filters_mmx[yoffset]
- lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
movsxd rax, dword ptr arg(2) ;xoffset
cmp rax, 0 ;skip first_pass filter if xoffset=0
@@ -1033,10 +1033,10 @@ sym(vp8_bilinear_predict16x16_sse2):
paddw xmm3, xmm5
paddw xmm4, xmm6
- paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw xmm4, [rd GLOBAL]
+ paddw xmm4, [GLOBAL(rd)]
psraw xmm4, VP8_FILTER_SHIFT
movdqa xmm7, xmm3
@@ -1074,10 +1074,10 @@ next_row:
pmullw xmm5, [rax]
pmullw xmm6, [rax]
- paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw xmm4, [rd GLOBAL]
+ paddw xmm4, [GLOBAL(rd)]
psraw xmm4, VP8_FILTER_SHIFT
movdqa xmm7, xmm3
@@ -1089,10 +1089,10 @@ next_row:
paddw xmm3, xmm5
paddw xmm4, xmm6
- paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw xmm4, [rd GLOBAL]
+ paddw xmm4, [GLOBAL(rd)]
psraw xmm4, VP8_FILTER_SHIFT
packuswb xmm3, xmm4
@@ -1154,10 +1154,10 @@ next_row_spo:
paddw xmm3, xmm5
paddw xmm4, xmm6
- paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw xmm4, [rd GLOBAL]
+ paddw xmm4, [GLOBAL(rd)]
psraw xmm4, VP8_FILTER_SHIFT
packuswb xmm3, xmm4
@@ -1198,10 +1198,10 @@ next_row_fpo:
paddw xmm3, xmm5
paddw xmm4, xmm6
- paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw xmm4, [rd GLOBAL]
+ paddw xmm4, [GLOBAL(rd)]
psraw xmm4, VP8_FILTER_SHIFT
packuswb xmm3, xmm4
@@ -1249,7 +1249,7 @@ sym(vp8_bilinear_predict8x8_sse2):
;const short *HFilter = bilinear_filters_mmx[xoffset]
;const short *VFilter = bilinear_filters_mmx[yoffset]
- lea rcx, [sym(vp8_bilinear_filters_mmx) GLOBAL]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
mov rsi, arg(0) ;src_ptr
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
@@ -1315,7 +1315,7 @@ sym(vp8_bilinear_predict8x8_sse2):
paddw xmm3, xmm4
- paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
movdqa xmm7, xmm3
@@ -1334,7 +1334,7 @@ next_row8x8:
paddw xmm3, xmm4
pmullw xmm7, xmm5
- paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
movdqa xmm4, xmm3
@@ -1344,7 +1344,7 @@ next_row8x8:
movdqa xmm7, xmm4
- paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
packuswb xmm3, xmm0
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index f7209ccc0..7f6fd93e4 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -48,9 +48,9 @@ sym(vp8_filter_block1d8_h6_ssse3):
xor rsi, rsi
shl rdx, 4
- movdqa xmm7, [rd GLOBAL]
+ movdqa xmm7, [GLOBAL(rd)]
- lea rax, [k0_k5 GLOBAL]
+ lea rax, [GLOBAL(k0_k5)]
add rax, rdx
mov rdi, arg(2) ;output_ptr
@@ -70,27 +70,35 @@ sym(vp8_filter_block1d8_h6_ssse3):
sub rdi, rdx
;xmm3 free
filter_block1d8_h6_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
- movdqa xmm1, xmm0
- pshufb xmm0, [shuf1b GLOBAL]
+ movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
- movdqa xmm2, xmm1
- pshufb xmm1, [shuf2b GLOBAL]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm5
+ punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
- pshufb xmm2, [shuf3b GLOBAL]
- add rdi, rdx
- pmaddubsw xmm2, xmm6
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ pmaddubsw xmm1, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
lea rsi, [rsi + rax]
dec rcx
- paddsw xmm0, xmm1
- paddsw xmm0, xmm7
- paddsw xmm0, xmm2
- psraw xmm0, 7
- packuswb xmm0, xmm0
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm7
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
movq MMWORD Ptr [rdi], xmm0
jnz filter_block1d8_h6_rowloop_ssse3
@@ -107,8 +115,8 @@ vp8_filter_block1d8_h4_ssse3:
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm3, XMMWORD PTR [shuf2b GLOBAL]
- movdqa xmm4, XMMWORD PTR [shuf3b GLOBAL]
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
+ movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
mov rsi, arg(0) ;src_ptr
@@ -118,24 +126,33 @@ vp8_filter_block1d8_h4_ssse3:
movsxd rdx, dword ptr arg(3) ;output_pitch
sub rdi, rdx
-;xmm3 free
+
filter_block1d8_h4_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
- movdqa xmm2, xmm0
- pshufb xmm0, xmm3 ;[shuf2b GLOBAL]
- pshufb xmm2, xmm4 ;[shuf3b GLOBAL]
+ movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
- pmaddubsw xmm0, xmm5
- add rdi, rdx
- pmaddubsw xmm2, xmm6
+ punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm2, xmm0
+ pshufb xmm0, xmm3
+
+ pshufb xmm2, xmm4
+ pmaddubsw xmm0, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
lea rsi, [rsi + rax]
dec rcx
- paddsw xmm0, xmm7
- paddsw xmm0, xmm2
- psraw xmm0, 7
- packuswb xmm0, xmm0
+
+ paddsw xmm0, xmm7
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
movq MMWORD Ptr [rdi], xmm0
@@ -168,74 +185,88 @@ sym(vp8_filter_block1d16_h6_ssse3):
push rdi
; end prolog
- movsxd rdx, DWORD PTR arg(5) ;table index
+ movsxd rdx, DWORD PTR arg(5) ;table index
xor rsi, rsi
shl rdx, 4 ;
- lea rax, [k0_k5 GLOBAL]
+ lea rax, [GLOBAL(k0_k5)]
add rax, rdx
- mov rdi, arg(2) ;output_ptr
- movdqa xmm7, [rd GLOBAL]
+ mov rdi, arg(2) ;output_ptr
;;
;; cmp esi, DWORD PTR [rax]
;; je vp8_filter_block1d16_h4_ssse3
- mov rsi, arg(0) ;src_ptr
+ mov rsi, arg(0) ;src_ptr
movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(3) ;output_pitch
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(3) ;output_pitch
filter_block1d16_h6_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
- movdqa xmm1, xmm0
- pshufb xmm0, [shuf1b GLOBAL]
- movdqa xmm2, xmm1
- pmaddubsw xmm0, xmm4
- pshufb xmm1, [shuf2b GLOBAL]
- pshufb xmm2, [shuf3b GLOBAL]
- pmaddubsw xmm1, xmm5
+ movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
- movdqu xmm3, XMMWORD PTR [rsi + 6]
+ punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
- pmaddubsw xmm2, xmm6
- paddsw xmm0, xmm1
- movdqa xmm1, xmm3
- pshufb xmm3, [shuf1b GLOBAL]
- paddsw xmm0, xmm7
- pmaddubsw xmm3, xmm4
- paddsw xmm0, xmm2
- movdqa xmm2, xmm1
- pshufb xmm1, [shuf2b GLOBAL]
- pshufb xmm2, [shuf3b GLOBAL]
- pmaddubsw xmm1, xmm5
- pmaddubsw xmm2, xmm6
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ movq xmm3, MMWORD PTR [rsi + 6]
+
+ pmaddubsw xmm1, xmm5
+ movq xmm7, MMWORD PTR [rsi + 11]
+
+ pmaddubsw xmm2, xmm6
+ punpcklbw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ movdqa xmm1, xmm3
+
+ pmaddubsw xmm3, xmm4
+ paddsw xmm0, xmm2
+
+ movdqa xmm2, xmm1
+ paddsw xmm0, [GLOBAL(rd)]
+
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+
+ psraw xmm0, 7
+ pmaddubsw xmm1, xmm5
+
+ pmaddubsw xmm2, xmm6
+ packuswb xmm0, xmm0
- psraw xmm0, 7
- packuswb xmm0, xmm0
lea rsi, [rsi + rax]
- paddsw xmm3, xmm1
- paddsw xmm3, xmm7
- paddsw xmm3, xmm2
- psraw xmm3, 7
- packuswb xmm3, xmm3
+ paddsw xmm3, xmm1
- punpcklqdq xmm0, xmm3
+ paddsw xmm3, xmm2
+
+ paddsw xmm3, [GLOBAL(rd)]
+
+ psraw xmm3, 7
+
+ packuswb xmm3, xmm3
+
+ punpcklqdq xmm0, xmm3
movdqa XMMWORD Ptr [rdi], xmm0
- add rdi, rdx
+ lea rdi, [rdi + rdx]
dec rcx
jnz filter_block1d16_h6_rowloop_ssse3
-
; begin epilog
pop rdi
pop rsi
@@ -257,18 +288,18 @@ filter_block1d16_h4_rowloop_ssse3:
movdqu xmm1, XMMWORD PTR [rsi - 2]
movdqa xmm2, xmm1
- pshufb xmm1, [shuf2b GLOBAL]
- pshufb xmm2, [shuf3b GLOBAL]
+ pshufb xmm1, [GLOBAL(shuf2b)]
+ pshufb xmm2, [GLOBAL(shuf3b)]
pmaddubsw xmm1, xmm5
movdqu xmm3, XMMWORD PTR [rsi + 6]
pmaddubsw xmm2, xmm6
movdqa xmm0, xmm3
- pshufb xmm3, [shuf3b GLOBAL]
- pshufb xmm0, [shuf2b GLOBAL]
+ pshufb xmm3, [GLOBAL(shuf3b)]
+ pshufb xmm0, [GLOBAL(shuf2b)]
- paddsw xmm1, xmm7
+ paddsw xmm1, [GLOBAL(rd)]
paddsw xmm1, xmm2
pmaddubsw xmm0, xmm5
@@ -278,7 +309,7 @@ filter_block1d16_h4_rowloop_ssse3:
packuswb xmm1, xmm1
lea rsi, [rsi + rax]
paddsw xmm3, xmm0
- paddsw xmm3, xmm7
+ paddsw xmm3, [GLOBAL(rd)]
psraw xmm3, 7
packuswb xmm3, xmm3
@@ -322,9 +353,9 @@ sym(vp8_filter_block1d4_h6_ssse3):
xor rsi, rsi
shl rdx, 4 ;
- lea rax, [k0_k5 GLOBAL]
+ lea rax, [GLOBAL(k0_k5)]
add rax, rdx
- movdqa xmm7, [rd GLOBAL]
+ movdqa xmm7, [GLOBAL(rd)]
cmp esi, DWORD PTR [rax]
je vp8_filter_block1d4_h4_ssse3
@@ -345,12 +376,12 @@ filter_block1d4_h6_rowloop_ssse3:
movdqu xmm0, XMMWORD PTR [rsi - 2]
movdqa xmm1, xmm0
- pshufb xmm0, [shuf1b GLOBAL]
+ pshufb xmm0, [GLOBAL(shuf1b)]
movdqa xmm2, xmm1
- pshufb xmm1, [shuf2b GLOBAL]
+ pshufb xmm1, [GLOBAL(shuf2b)]
pmaddubsw xmm0, xmm4
- pshufb xmm2, [shuf3b GLOBAL]
+ pshufb xmm2, [GLOBAL(shuf3b)]
pmaddubsw xmm1, xmm5
;--
@@ -382,8 +413,8 @@ filter_block1d4_h6_rowloop_ssse3:
vp8_filter_block1d4_h4_ssse3:
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm0, XMMWORD PTR [shuf2b GLOBAL]
- movdqa xmm3, XMMWORD PTR [shuf3b GLOBAL]
+ movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
@@ -396,8 +427,8 @@ filter_block1d4_h4_rowloop_ssse3:
movdqu xmm1, XMMWORD PTR [rsi - 2]
movdqa xmm2, xmm1
- pshufb xmm1, xmm0 ;;[shuf2b GLOBAL]
- pshufb xmm2, xmm3 ;;[shuf3b GLOBAL]
+ pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+ pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
pmaddubsw xmm1, xmm5
;--
@@ -449,7 +480,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
xor rsi, rsi
shl rdx, 4 ;
- lea rax, [k0_k5 GLOBAL]
+ lea rax, [GLOBAL(k0_k5)]
add rax, rdx
cmp esi, DWORD PTR [rax]
@@ -490,7 +521,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
paddsw xmm2, xmm3
paddsw xmm2, xmm1
- paddsw xmm2, [rd GLOBAL]
+ paddsw xmm2, [GLOBAL(rd)]
psraw xmm2, 7
packuswb xmm2, xmm2
@@ -517,7 +548,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
;--
paddsw xmm2, xmm3
paddsw xmm2, xmm1
- paddsw xmm2, [rd GLOBAL]
+ paddsw xmm2, [GLOBAL(rd)]
psraw xmm2, 7
packuswb xmm2, xmm2
@@ -570,7 +601,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
- paddsw xmm2, [rd GLOBAL]
+ paddsw xmm2, [GLOBAL(rd)]
paddsw xmm2, xmm3
psraw xmm2, 7
packuswb xmm2, xmm2
@@ -581,7 +612,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
pmaddubsw xmm1, xmm6
pmaddubsw xmm5, xmm7
- movdqa xmm4, [rd GLOBAL]
+ movdqa xmm4, [GLOBAL(rd)]
add rsi, rdx
add rax, rdx
;--
@@ -634,7 +665,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
xor rsi, rsi
shl rdx, 4 ;
- lea rax, [k0_k5 GLOBAL]
+ lea rax, [GLOBAL(k0_k5)]
add rax, rdx
movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
@@ -667,7 +698,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
punpcklbw xmm3, xmm0 ;C E
movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
- movdqa xmm4, [rd GLOBAL]
+ movdqa xmm4, [GLOBAL(rd)]
pmaddubsw xmm3, xmm6
punpcklbw xmm1, xmm0 ;A F
@@ -704,7 +735,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
vp8_filter_block1d8_v4_ssse3:
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm5, [rd GLOBAL]
+ movdqa xmm5, [GLOBAL(rd)]
mov rsi, arg(0) ;src_ptr
@@ -771,7 +802,7 @@ sym(vp8_filter_block1d4_v6_ssse3):
xor rsi, rsi
shl rdx, 4 ;
- lea rax, [k0_k5 GLOBAL]
+ lea rax, [GLOBAL(k0_k5)]
add rax, rdx
movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
@@ -805,7 +836,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
movd mm0, DWORD PTR [rax + rdx * 4] ;F
- movq mm4, [rd GLOBAL]
+ movq mm4, [GLOBAL(rd)]
pmaddubsw mm3, mm6
punpcklbw mm1, mm0 ;A F
@@ -842,7 +873,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
vp8_filter_block1d4_v4_ssse3:
movq mm6, MMWORD PTR [rax+256] ;k2_k4
movq mm7, MMWORD PTR [rax+128] ;k1_k3
- movq mm5, MMWORD PTR [rd GLOBAL]
+ movq mm5, MMWORD PTR [GLOBAL(rd)]
mov rsi, arg(0) ;src_ptr
@@ -907,7 +938,7 @@ sym(vp8_bilinear_predict16x16_ssse3):
push rdi
; end prolog
- lea rcx, [vp8_bilinear_filters_ssse3 GLOBAL]
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
movsxd rax, dword ptr arg(2) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
@@ -939,46 +970,49 @@ sym(vp8_bilinear_predict16x16_ssse3):
%if ABI_IS_32BIT=0
movsxd r8, dword ptr arg(5) ; dst_pitch
%endif
- movdqu xmm3, [rsi] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
- movdqa xmm4, xmm3
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
- movdqu xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
lea rsi, [rsi + rdx] ; next line
- punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
- punpckhbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+ punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
- paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- paddw xmm4, [rd GLOBAL] ; xmm4 += round value
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
movdqa xmm7, xmm3
packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
.next_row:
- movdqu xmm6, [rsi] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
- movdqa xmm4, xmm6
+ punpcklbw xmm6, xmm5
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
- movdqu xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
lea rsi, [rsi + rdx] ; next line
- punpcklbw xmm6, xmm5
pmaddubsw xmm6, xmm1
- punpckhbw xmm4, xmm5
+ punpcklbw xmm4, xmm5
pmaddubsw xmm4, xmm1
- paddw xmm6, [rd GLOBAL] ; xmm6 += round value
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
- paddw xmm4, [rd GLOBAL] ; xmm4 += round value
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
packuswb xmm6, xmm4
@@ -990,10 +1024,10 @@ sym(vp8_bilinear_predict16x16_ssse3):
punpckhbw xmm7, xmm6
pmaddubsw xmm7, xmm2
- paddw xmm5, [rd GLOBAL] ; xmm5 += round value
+ paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128
- paddw xmm7, [rd GLOBAL] ; xmm7 += round value
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
packuswb xmm5, xmm7
@@ -1027,49 +1061,51 @@ b16x16_sp_only:
movsxd rax, dword ptr arg(1) ; src_pixels_per_line
; get the first horizontal line done
- movdqu xmm2, [rsi] ; load row 0
+ movq xmm4, [rsi] ; load row 0
+ movq xmm2, [rsi + 8] ; load row 0
lea rsi, [rsi + rax] ; next line
.next_row:
- movdqu xmm3, [rsi] ; load row + 1
+ movq xmm3, [rsi] ; load row + 1
+ movq xmm5, [rsi + 8] ; load row + 1
- movdqu xmm4, xmm2
punpcklbw xmm4, xmm3
+ punpcklbw xmm2, xmm5
pmaddubsw xmm4, xmm1
- movdqu xmm7, [rsi + rax] ; load row + 2
-
- punpckhbw xmm2, xmm3
- movdqu xmm6, xmm3
+ movq xmm7, [rsi + rax] ; load row + 2
pmaddubsw xmm2, xmm1
- punpcklbw xmm6, xmm7
+ movq xmm6, [rsi + rax + 8] ; load row + 2
- paddw xmm4, [rd GLOBAL]
- pmaddubsw xmm6, xmm1
-
- psraw xmm4, VP8_FILTER_SHIFT
- punpckhbw xmm3, xmm7
+ punpcklbw xmm3, xmm7
+ punpcklbw xmm5, xmm6
- paddw xmm2, [rd GLOBAL]
pmaddubsw xmm3, xmm1
+ paddw xmm4, [GLOBAL(rd)]
+ pmaddubsw xmm5, xmm1
+ paddw xmm2, [GLOBAL(rd)]
+
+ psraw xmm4, VP8_FILTER_SHIFT
psraw xmm2, VP8_FILTER_SHIFT
- paddw xmm6, [rd GLOBAL]
packuswb xmm4, xmm2
- psraw xmm6, VP8_FILTER_SHIFT
+ paddw xmm3, [GLOBAL(rd)]
movdqa [rdi], xmm4 ; store row 0
- paddw xmm3, [rd GLOBAL]
+ paddw xmm5, [GLOBAL(rd)]
psraw xmm3, VP8_FILTER_SHIFT
- lea rsi, [rsi + 2*rax]
+ psraw xmm5, VP8_FILTER_SHIFT
- packuswb xmm6, xmm3
- movdqa xmm2, xmm7
+ packuswb xmm3, xmm5
+ movdqa xmm4, xmm7
- movdqa [rdi + rdx],xmm6 ; store row 1
+ movdqa [rdi + rdx],xmm3 ; store row 1
+ lea rsi, [rsi + 2*rax]
+
+ movdqa xmm2, xmm6
lea rdi, [rdi + 2*rdx]
cmp rdi, rcx
@@ -1083,43 +1119,46 @@ b16x16_fp_only:
movsxd rax, dword ptr arg(1) ; src_pixels_per_line
.next_row:
- movdqu xmm2, [rsi] ; row 0
- movdqa xmm3, xmm2
-
- movdqu xmm4, [rsi + 1] ; row 0 + 1
- lea rsi, [rsi + rax] ; next line
+ movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
punpcklbw xmm2, xmm4
- movdqu xmm5, [rsi] ; row 1
+ movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
pmaddubsw xmm2, xmm1
- movdqa xmm6, xmm5
+ movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
- punpckhbw xmm3, xmm4
- movdqu xmm7, [rsi + 1] ; row 1 + 1
+ lea rsi, [rsi + rax] ; next line
+ punpcklbw xmm3, xmm4
pmaddubsw xmm3, xmm1
- paddw xmm2, [rd GLOBAL]
+ movq xmm5, [rsi]
+ paddw xmm2, [GLOBAL(rd)]
+ movq xmm7, [rsi+1]
+
+ movq xmm6, [rsi+8]
psraw xmm2, VP8_FILTER_SHIFT
+
punpcklbw xmm5, xmm7
+ movq xmm7, [rsi+9]
- paddw xmm3, [rd GLOBAL]
+ paddw xmm3, [GLOBAL(rd)]
pmaddubsw xmm5, xmm1
psraw xmm3, VP8_FILTER_SHIFT
- punpckhbw xmm6, xmm7
+ punpcklbw xmm6, xmm7
packuswb xmm2, xmm3
pmaddubsw xmm6, xmm1
movdqa [rdi], xmm2 ; store the results in the destination
- paddw xmm5, [rd GLOBAL]
+ paddw xmm5, [GLOBAL(rd)]
lea rdi, [rdi + rdx] ; dst_pitch
psraw xmm5, VP8_FILTER_SHIFT
- paddw xmm6, [rd GLOBAL]
+ paddw xmm6, [GLOBAL(rd)]
psraw xmm6, VP8_FILTER_SHIFT
packuswb xmm5, xmm6
@@ -1165,7 +1204,7 @@ sym(vp8_bilinear_predict8x8_ssse3):
ALIGN_STACK 16, rax
sub rsp, 144 ; reserve 144 bytes
- lea rcx, [vp8_bilinear_filters_ssse3 GLOBAL]
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
mov rsi, arg(0) ;src_ptr
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
@@ -1230,7 +1269,7 @@ sym(vp8_bilinear_predict8x8_ssse3):
punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
- paddw xmm3, [rd GLOBAL] ; xmm3 += round value
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
movdqa xmm7, xmm3
@@ -1247,7 +1286,7 @@ sym(vp8_bilinear_predict8x8_ssse3):
punpcklbw xmm6, xmm5
pmaddubsw xmm6, xmm0
- paddw xmm6, [rd GLOBAL] ; xmm6 += round value
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
packuswb xmm6, xmm6
@@ -1255,7 +1294,7 @@ sym(vp8_bilinear_predict8x8_ssse3):
punpcklbw xmm7, xmm6
pmaddubsw xmm7, xmm1
- paddw xmm7, [rd GLOBAL] ; xmm7 += round value
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
packuswb xmm7, xmm7
@@ -1308,21 +1347,21 @@ b8x8_sp_only:
punpcklbw xmm6, xmm7
pmaddubsw xmm6, xmm0
- paddw xmm1, [rd GLOBAL]
+ paddw xmm1, [GLOBAL(rd)]
- paddw xmm2, [rd GLOBAL]
+ paddw xmm2, [GLOBAL(rd)]
psraw xmm1, VP8_FILTER_SHIFT
- paddw xmm3, [rd GLOBAL]
+ paddw xmm3, [GLOBAL(rd)]
psraw xmm2, VP8_FILTER_SHIFT
- paddw xmm4, [rd GLOBAL]
+ paddw xmm4, [GLOBAL(rd)]
psraw xmm3, VP8_FILTER_SHIFT
- paddw xmm5, [rd GLOBAL]
+ paddw xmm5, [GLOBAL(rd)]
psraw xmm4, VP8_FILTER_SHIFT
- paddw xmm6, [rd GLOBAL]
+ paddw xmm6, [GLOBAL(rd)]
psraw xmm5, VP8_FILTER_SHIFT
psraw xmm6, VP8_FILTER_SHIFT
@@ -1356,10 +1395,10 @@ b8x8_sp_only:
punpcklbw xmm1, xmm2
pmaddubsw xmm1, xmm0
- paddw xmm7, [rd GLOBAL]
+ paddw xmm7, [GLOBAL(rd)]
psraw xmm7, VP8_FILTER_SHIFT
- paddw xmm1, [rd GLOBAL]
+ paddw xmm1, [GLOBAL(rd)]
psraw xmm1, VP8_FILTER_SHIFT
packuswb xmm7, xmm7
@@ -1408,16 +1447,16 @@ b8x8_fp_only:
punpcklbw xmm7, xmm2
pmaddubsw xmm7, xmm0
- paddw xmm1, [rd GLOBAL]
+ paddw xmm1, [GLOBAL(rd)]
psraw xmm1, VP8_FILTER_SHIFT
- paddw xmm3, [rd GLOBAL]
+ paddw xmm3, [GLOBAL(rd)]
psraw xmm3, VP8_FILTER_SHIFT
- paddw xmm5, [rd GLOBAL]
+ paddw xmm5, [GLOBAL(rd)]
psraw xmm5, VP8_FILTER_SHIFT
- paddw xmm7, [rd GLOBAL]
+ paddw xmm7, [GLOBAL(rd)]
psraw xmm7, VP8_FILTER_SHIFT
packuswb xmm1, xmm1
@@ -1464,6 +1503,13 @@ shuf3b:
db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
align 16
+shuf2bfrom1:
+ db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+ db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
+align 16
rd:
times 8 dw 0x40
diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c
index 950d96262..8dd07c90d 100644
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -128,7 +128,7 @@ void vp8_sixtap_predict4x4_mmx
int dst_pitch
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); // Temp data bufffer used in filtering
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data bufffer used in filtering */
const short *HFilter, *VFilter;
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
@@ -149,7 +149,7 @@ void vp8_sixtap_predict16x16_mmx
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); // Temp data bufffer used in filtering
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */
const short *HFilter, *VFilter;
@@ -181,7 +181,7 @@ void vp8_sixtap_predict8x8_mmx
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); // Temp data bufffer used in filtering
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
const short *HFilter, *VFilter;
@@ -207,7 +207,7 @@ void vp8_sixtap_predict8x4_mmx
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); // Temp data bufffer used in filtering
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
const short *HFilter, *VFilter;
@@ -253,7 +253,7 @@ void vp8_sixtap_predict16x16_sse2
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); // Temp data bufffer used in filtering
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */
const short *HFilter, *VFilter;
@@ -268,14 +268,14 @@ void vp8_sixtap_predict16x16_sse2
}
else
{
- // First-pass only
+ /* First-pass only */
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
}
}
else
{
- // Second-pass only
+ /* Second-pass only */
VFilter = vp8_six_tap_mmx[yoffset];
vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32);
vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
@@ -293,7 +293,7 @@ void vp8_sixtap_predict8x8_sse2
int dst_pitch
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); // Temp data bufffer used in filtering
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
const short *HFilter, *VFilter;
if (xoffset)
@@ -307,14 +307,14 @@ void vp8_sixtap_predict8x8_sse2
}
else
{
- // First-pass only
+ /* First-pass only */
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
}
}
else
{
- // Second-pass only
+ /* Second-pass only */
VFilter = vp8_six_tap_mmx[yoffset];
vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
}
@@ -331,7 +331,7 @@ void vp8_sixtap_predict8x4_sse2
int dst_pitch
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); // Temp data bufffer used in filtering
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
const short *HFilter, *VFilter;
if (xoffset)
@@ -345,14 +345,14 @@ void vp8_sixtap_predict8x4_sse2
}
else
{
- // First-pass only
+ /* First-pass only */
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
}
}
else
{
- // Second-pass only
+ /* Second-pass only */
VFilter = vp8_six_tap_mmx[yoffset];
vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
}
@@ -444,13 +444,13 @@ void vp8_sixtap_predict16x16_ssse3
}
else
{
- // First-pass only
+ /* First-pass only */
vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, xoffset);
}
}
else
{
- // Second-pass only
+ /* Second-pass only */
vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line) , src_pixels_per_line, dst_ptr, dst_pitch, 16, yoffset);
}
}
@@ -481,7 +481,7 @@ void vp8_sixtap_predict8x8_ssse3
}
else
{
- // Second-pass only
+ /* Second-pass only */
vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, yoffset);
}
}
@@ -508,13 +508,13 @@ void vp8_sixtap_predict8x4_ssse3
}
else
{
- // First-pass only
+ /* First-pass only */
vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
}
}
else
{
- // Second-pass only
+ /* Second-pass only */
vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
}
}
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index b983bc27e..38500fd01 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -74,7 +74,7 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
#if CONFIG_POSTPROC
rtcd->postproc.down = vp8_mbpost_proc_down_mmx;
- //rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
+ /*rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;*/
rtcd->postproc.downacross = vp8_post_proc_down_and_across_mmx;
rtcd->postproc.addnoise = vp8_plane_add_noise_mmx;
#endif
diff --git a/vp8/decoder/arm/arm_dsystemdependent.c b/vp8/decoder/arm/arm_dsystemdependent.c
new file mode 100644
index 000000000..e9741e286
--- /dev/null
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "blockd.h"
+#include "pragmas.h"
+#include "postproc.h"
+#include "dboolhuff.h"
+#include "dequantize.h"
+#include "onyxd_int.h"
+
+void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ int flags = pbi->common.rtcd.flags;
+ int has_edsp = flags & HAS_EDSP;
+ int has_media = flags & HAS_MEDIA;
+ int has_neon = flags & HAS_NEON;
+
+#if HAVE_ARMV6
+ if (has_media)
+ {
+ pbi->dequant.block = vp8_dequantize_b_v6;
+ pbi->dequant.idct_add = vp8_dequant_idct_add_v6;
+ pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_v6;
+ pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
+ pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
+ pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
+#if 0 /*For use with RTCD, when implemented*/
+ pbi->dboolhuff.start = vp8dx_start_decode_c;
+ pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
+ pbi->dboolhuff.debool = vp8dx_decode_bool_c;
+ pbi->dboolhuff.devalue = vp8dx_decode_value_c;
+#endif
+ }
+#endif
+
+#if HAVE_ARMV7
+ if (has_neon)
+ {
+ pbi->dequant.block = vp8_dequantize_b_neon;
+ pbi->dequant.idct_add = vp8_dequant_idct_add_neon;
+ /*This is not used: NEON always dequants two blocks at once.
+ pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_neon;*/
+ pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
+ pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
+ pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
+#if 0 /*For use with RTCD, when implemented*/
+ pbi->dboolhuff.start = vp8dx_start_decode_c;
+ pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
+ pbi->dboolhuff.debool = vp8dx_decode_bool_c;
+ pbi->dboolhuff.devalue = vp8dx_decode_value_c;
+#endif
+ }
+#endif
+#endif
+}
diff --git a/vp8/decoder/arm/dboolhuff_arm.h b/vp8/decoder/arm/dboolhuff_arm.h
index d2ebc71e8..985951c7c 100644
--- a/vp8/decoder/arm/dboolhuff_arm.h
+++ b/vp8/decoder/arm/dboolhuff_arm.h
@@ -11,7 +11,7 @@
* to be useless. However, its been left (for now)
* for reference.
*/
-/*
+#if 0
#if HAVE_ARMV6
#undef vp8_dbool_start
#define vp8_dbool_start vp8dx_start_decode_v6
@@ -24,7 +24,7 @@
#undef vp8_dbool_devalue
#define vp8_dbool_devalue vp8_decode_value_v6
-#endif // HAVE_ARMV6
+#endif /* HAVE_ARMV6 */
#if HAVE_ARMV7
#undef vp8_dbool_start
@@ -38,6 +38,6 @@
#undef vp8_dbool_devalue
#define vp8_dbool_devalue vp8_decode_value_neon
-#endif // HAVE_ARMV7
-*/
-#endif // DBOOLHUFF_ARM_H
+#endif /* HAVE_ARMV7 */
+#endif
+#endif /* DBOOLHUFF_ARM_H */
diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/decoder/arm/dequantize_arm.c
index 39265879b..b3e14b793 100644
--- a/vp8/decoder/arm/dequantize_arm.c
+++ b/vp8/decoder/arm/dequantize_arm.c
@@ -30,7 +30,7 @@ void vp8_dequantize_b_neon(BLOCKD *d)
int i;
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
- short *DQC = &d->dequant[0][0];
+ short *DQC = d->dequant;
vp8_dequantize_b_loop_neon(Q, DQC, DQ);
}
@@ -42,7 +42,7 @@ void vp8_dequantize_b_v6(BLOCKD *d)
int i;
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
- short *DQC = &d->dequant[0][0];
+ short *DQC = d->dequant;
vp8_dequantize_b_loop_v6(Q, DQC, DQ);
}
diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h
index 40151e01a..b7d800d26 100644
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -20,6 +20,7 @@ extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6)
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_v6
@@ -38,6 +39,7 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
#undef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
#endif
+#endif
#if HAVE_ARMV7
extern prototype_dequant_block(vp8_dequantize_b_neon);
@@ -47,6 +49,7 @@ extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neo
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_neon
@@ -65,5 +68,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
#undef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
#endif
+#endif
#endif
diff --git a/vp8/decoder/arm/dsystemdependent.c b/vp8/decoder/arm/dsystemdependent.c
deleted file mode 100644
index 9dcf7b657..000000000
--- a/vp8/decoder/arm/dsystemdependent.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "blockd.h"
-#include "pragmas.h"
-#include "postproc.h"
-#include "dboolhuff.h"
-#include "dequantize.h"
-#include "onyxd_int.h"
-
-void vp8_dmachine_specific_config(VP8D_COMP *pbi)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
- pbi->mb.rtcd = &pbi->common.rtcd;
-#if HAVE_ARMV7
- pbi->dequant.block = vp8_dequantize_b_neon;
- pbi->dboolhuff.start = vp8dx_start_decode_c;
- pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
- pbi->dboolhuff.debool = vp8dx_decode_bool_c;
- pbi->dboolhuff.devalue = vp8dx_decode_value_c;
-
-#elif HAVE_ARMV6
- pbi->dequant.block = vp8_dequantize_b_v6;
- pbi->dboolhuff.start = vp8dx_start_decode_c;
- pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
- pbi->dboolhuff.debool = vp8dx_decode_bool_c;
- pbi->dboolhuff.devalue = vp8dx_decode_value_c;
-#endif
-#endif
-}
diff --git a/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm b/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
deleted file mode 100644
index f68a78095..000000000
--- a/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
+++ /dev/null
@@ -1,136 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequant_dc_idct_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride,
-; int Dc);
-; r0 short *input,
-; r1 short *dq,
-; r2 unsigned char *pred
-; r3 unsigned char *dest
-; sp int pitch
-; sp+4 int stride
-; sp+8 int Dc
-|vp8_dequant_dc_idct_add_neon| PROC
- vld1.16 {q3, q4}, [r0]
- vld1.16 {q5, q6}, [r1]
-
- ldr r1, [sp, #8] ;load Dc from stack
-
- ldr r12, _CONSTANTS_
-
- vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
- vmul.i16 q2, q4, q6
-
- vmov.16 d2[0], r1
-
- ldr r1, [sp] ; pitch
- vld1.32 {d14[0]}, [r2], r1
- vld1.32 {d14[1]}, [r2], r1
- vld1.32 {d15[0]}, [r2], r1
- vld1.32 {d15[1]}, [r2]
-
- ldr r1, [sp, #4] ; stride
-
-;|short_idct4x4llm_neon| PROC
- vld1.16 {d0}, [r12]
- vswp d3, d4 ;q2(vp[4] vp[12])
-
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2
- vqadd.s16 q4, q4, q2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
-; memset(input, 0, 32) -- 32bytes
- vmov.i16 q14, #0
-
- vswp d3, d4
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vmov q15, q14
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2
- vqadd.s16 q4, q4, q2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vst1.16 {q14, q15}, [r0]
-
- vrshr.s16 d2, d2, #3
- vrshr.s16 d3, d3, #3
- vrshr.s16 d4, d4, #3
- vrshr.s16 d5, d5, #3
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
- vaddw.u8 q1, q1, d14
- vaddw.u8 q2, q2, d15
-
- vqmovun.s16 d0, q1
- vqmovun.s16 d1, q2
-
- vst1.32 {d0[0]}, [r3], r1
- vst1.32 {d0[1]}, [r3], r1
- vst1.32 {d1[0]}, [r3], r1
- vst1.32 {d1[1]}, [r3]
-
- bx lr
-
- ENDP ; |vp8_dequant_dc_idct_add_neon|
-
-; Constant Pool
-_CONSTANTS_ DCD cospi8sqrt2minus1
-cospi8sqrt2minus1 DCD 0x4e7b4e7b
-sinpi8sqrt2 DCD 0x8a8c8a8c
-
- END
diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c
index 4725e6240..fe4f2e0d4 100644
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@@ -12,6 +12,21 @@
#include "idct.h"
#include "dequantize.h"
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_dc_full_2x_neon
+ (short *input, short *dq, unsigned char *pre, unsigned char *dst,
+ int stride, short *dc);
+void idct_dequant_dc_0_2x_neon
+ (short *dc, unsigned char *pre, unsigned char *dst, int stride);
+void idct_dequant_full_2x_neon
+ (short *q, short *dq, unsigned char *pre, unsigned char *dst,
+ int pitch, int stride);
+void idct_dequant_0_2x_neon
+ (short *q, short dq, unsigned char *pre, int pitch,
+ unsigned char *dst, int stride);
+
void vp8_dequant_dc_idct_add_y_block_neon
(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride, char *eobs, short *dc)
@@ -20,25 +35,15 @@ void vp8_dequant_dc_idct_add_y_block_neon
for (i = 0; i < 4; i++)
{
- if (eobs[0] > 1)
- vp8_dequant_dc_idct_add_neon (q, dq, pre, dst, 16, stride, dc[0]);
- else
- vp8_dc_only_idct_add_neon (dc[0], pre, dst, 16, stride);
-
- if (eobs[1] > 1)
- vp8_dequant_dc_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
- else
- vp8_dc_only_idct_add_neon (dc[1], pre+4, dst+4, 16, stride);
-
- if (eobs[2] > 1)
- vp8_dequant_dc_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc);
else
- vp8_dc_only_idct_add_neon (dc[2], pre+8, dst+8, 16, stride);
+ idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
- if (eobs[3] > 1)
- vp8_dequant_dc_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2);
else
- vp8_dc_only_idct_add_neon (dc[3], pre+12, dst+12, 16, stride);
+ idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride);
q += 64;
dc += 4;
@@ -56,37 +61,15 @@ void vp8_dequant_idct_add_y_block_neon
for (i = 0; i < 4; i++)
{
- if (eobs[0] > 1)
- vp8_dequant_idct_add_neon (q, dq, pre, dst, 16, stride);
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride);
else
- {
- vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dst, 16, stride);
- ((int *)q)[0] = 0;
- }
+ idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride);
- if (eobs[1] > 1)
- vp8_dequant_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride);
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride);
else
- {
- vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dst+4, 16, stride);
- ((int *)(q+16))[0] = 0;
- }
-
- if (eobs[2] > 1)
- vp8_dequant_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[32]*dq[0], pre+8, dst+8, 16, stride);
- ((int *)(q+32))[0] = 0;
- }
-
- if (eobs[3] > 1)
- vp8_dequant_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[48]*dq[0], pre+12, dst+12, 16, stride);
- ((int *)(q+48))[0] = 0;
- }
+ idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride);
q += 64;
pre += 64;
@@ -99,53 +82,34 @@ void vp8_dequant_idct_add_uv_block_neon
(short *q, short *dq, unsigned char *pre,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
- int i;
-
- for (i = 0; i < 2; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_neon (q, dq, pre, dstu, 8, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstu, 8, stride);
- ((int *)q)[0] = 0;
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstu+4, 8, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstu+4, 8, stride);
- ((int *)(q+16))[0] = 0;
- }
-
- q += 32;
- pre += 32;
- dstu += 4*stride;
- eobs += 2;
- }
-
- for (i = 0; i < 2; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_neon (q, dq, pre, dstv, 8, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstv, 8, stride);
- ((int *)q)[0] = 0;
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstv+4, 8, stride);
- else
- {
- vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstv+4, 8, stride);
- ((int *)(q+16))[0] = 0;
- }
-
- q += 32;
- pre += 32;
- dstv += 4*stride;
- eobs += 2;
- }
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
+
+ q += 32;
+ pre += 32;
+ dstu += 4*stride;
+
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
+
+ q += 32;
+ pre += 32;
+
+ if (((short *)eobs)[2] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
+
+ q += 32;
+ pre += 32;
+ dstv += 4*stride;
+
+ if (((short *)eobs)[3] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
}
diff --git a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
new file mode 100644
index 000000000..456f8e1d4
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
@@ -0,0 +1,79 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_0_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
+; int pitch, unsigned char *dst, int stride);
+; r0 *q
+; r1 dq
+; r2 *pre
+; r3 pitch
+; sp *dst
+; sp+4 stride
+|idct_dequant_0_2x_neon| PROC
+ add r12, r2, #4
+ vld1.32 {d2[0]}, [r2], r3
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d4[0]}, [r2], r3
+ vld1.32 {d4[1]}, [r2]
+ vld1.32 {d8[0]}, [r12], r3
+ vld1.32 {d8[1]}, [r12], r3
+ vld1.32 {d10[0]}, [r12], r3
+ vld1.32 {d10[1]}, [r12]
+
+ ldrh r12, [r0] ; lo q
+ ldrh r2, [r0, #32] ; hi q
+ mov r3, #0
+ strh r3, [r0]
+ strh r3, [r0, #32]
+
+ sxth r12, r12 ; lo
+ mul r0, r12, r1
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q0, r0
+ sxth r2, r2 ; hi
+ mul r0, r2, r1
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q3, r0
+
+ vaddw.u8 q1, q0, d2 ; lo
+ vaddw.u8 q2, q0, d4
+ vaddw.u8 q4, q3, d8 ; hi
+ vaddw.u8 q5, q3, d10
+
+ ldr r2, [sp] ; dst
+ ldr r3, [sp, #4] ; stride
+
+ vqmovun.s16 d2, q1 ; lo
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d8, q4 ; hi
+ vqmovun.s16 d10, q5
+
+ add r0, r2, #4
+ vst1.32 {d2[0]}, [r2], r3 ; lo
+ vst1.32 {d2[1]}, [r2], r3
+ vst1.32 {d4[0]}, [r2], r3
+ vst1.32 {d4[1]}, [r2]
+ vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d10[1]}, [r0]
+
+ bx lr
+
+ ENDP ; |idct_dequant_0_2x_neon|
+ END
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
new file mode 100644
index 000000000..0dc036acb
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
@@ -0,0 +1,69 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_dc_0_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
+; unsigned char *dst, int stride);
+; r0 *dc
+; r1 *pre
+; r2 *dst
+; r3 stride
+|idct_dequant_dc_0_2x_neon| PROC
+ ldr r0, [r0] ; *dc
+ mov r12, #16
+
+ vld1.32 {d2[0]}, [r1], r12 ; lo
+ vld1.32 {d2[1]}, [r1], r12
+ vld1.32 {d4[0]}, [r1], r12
+ vld1.32 {d4[1]}, [r1]
+ sub r1, r1, #44
+ vld1.32 {d8[0]}, [r1], r12 ; hi
+ vld1.32 {d8[1]}, [r1], r12
+ vld1.32 {d10[0]}, [r1], r12
+ vld1.32 {d10[1]}, [r1]
+
+ sxth r1, r0 ; lo *dc
+ add r1, r1, #4
+ asr r1, r1, #3
+ vdup.16 q0, r1
+ sxth r0, r0, ror #16 ; hi *dc
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q3, r0
+
+ vaddw.u8 q1, q0, d2 ; lo
+ vaddw.u8 q2, q0, d4
+ vaddw.u8 q4, q3, d8 ; hi
+ vaddw.u8 q5, q3, d10
+
+ vqmovun.s16 d2, q1 ; lo
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d8, q4 ; hi
+ vqmovun.s16 d10, q5
+
+ add r0, r2, #4
+ vst1.32 {d2[0]}, [r2], r3 ; lo
+ vst1.32 {d2[1]}, [r2], r3
+ vst1.32 {d4[0]}, [r2], r3
+ vst1.32 {d4[1]}, [r2]
+ vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d10[1]}, [r0]
+
+ bx lr
+
+ ENDP ;|idct_dequant_dc_0_2x_neon|
+ END
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
new file mode 100644
index 000000000..ad4364adc
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
@@ -0,0 +1,206 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_dc_full_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
+; unsigned char *dst, int stride, short *dc);
+; r0 *q,
+; r1 *dq,
+; r2 *pre
+; r3 *dst
+; sp stride
+; sp+4 *dc
+|idct_dequant_dc_full_2x_neon| PROC
+ vld1.16 {q0, q1}, [r1] ; dq (same l/r)
+ vld1.16 {q2, q3}, [r0] ; l q
+ mov r1, #16 ; pitch
+ add r0, r0, #32
+ vld1.16 {q4, q5}, [r0] ; r q
+ add r12, r2, #4
+ ; interleave the predictors
+ vld1.32 {d28[0]}, [r2], r1 ; l pre
+ vld1.32 {d28[1]}, [r12], r1 ; r pre
+ vld1.32 {d29[0]}, [r2], r1
+ vld1.32 {d29[1]}, [r12], r1
+ vld1.32 {d30[0]}, [r2], r1
+ vld1.32 {d30[1]}, [r12], r1
+ vld1.32 {d31[0]}, [r2]
+ ldr r1, [sp, #4]
+ vld1.32 {d31[1]}, [r12]
+
+ ldr r2, _CONSTANTS_
+
+ ldrh r12, [r1], #2 ; lo *dc
+ ldrh r1, [r1] ; hi *dc
+
+ ; dequant: q[i] = q[i] * dq[i]
+ vmul.i16 q2, q2, q0
+ vmul.i16 q3, q3, q1
+ vmul.i16 q4, q4, q0
+ vmul.i16 q5, q5, q1
+
+ ; move dc up to neon and overwrite first element
+ vmov.16 d4[0], r12
+ vmov.16 d8[0], r1
+
+ vld1.16 {d0}, [r2]
+
+ ; q2: l0r0 q3: l8r8
+ ; q4: l4r4 q5: l12r12
+ vswp d5, d8
+ vswp d7, d10
+
+ ; _CONSTANTS_ * 4,12 >> 16
+ ; q6: 4 * sinpi : c1/temp1
+ ; q7: 12 * sinpi : d1/temp2
+ ; q8: 4 * cospi
+ ; q9: 12 * cospi
+ vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q7, q5, d0[2]
+ vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q9, q5, d0[0]
+
+ vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
+ vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
+
+ ; vqdmulh only accepts signed values. this was a problem because
+ ; our constant had the high bit set, and was treated as a negative value.
+ ; vqdmulh also doubles the value before it shifts by 16. we need to
+ ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+ ; so we can shift the constant without losing precision. this avoids
+ ; shift again afterward, but also avoids the sign issue. win win!
+ ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+ ; pre-shift it
+ vshr.s16 q8, q8, #1
+ vshr.s16 q9, q9, #1
+
+ ; q4: 4 + 4 * cospi : d1/temp1
+ ; q5: 12 + 12 * cospi : c1/temp2
+ vqadd.s16 q4, q4, q8
+ vqadd.s16 q5, q5, q9
+
+ ; c1 = temp1 - temp2
+ ; d1 = temp1 + temp2
+ vqsub.s16 q2, q6, q5
+ vqadd.s16 q3, q4, q7
+
+ ; [0]: a1+d1
+ ; [1]: b1+c1
+ ; [2]: b1-c1
+ ; [3]: a1-d1
+ vqadd.s16 q4, q10, q3
+ vqadd.s16 q5, q11, q2
+ vqsub.s16 q6, q11, q2
+ vqsub.s16 q7, q10, q3
+
+ ; rotate
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+ ; idct loop 2
+ ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+ ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+ ; q6: l 2, 6,10,14 r 2, 6,10,14
+ ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+ ; q8: 1 * sinpi : c1/temp1
+ ; q9: 3 * sinpi : d1/temp2
+ ; q10: 1 * cospi
+ ; q11: 3 * cospi
+ vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q9, q7, d0[2]
+ vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q11, q7, d0[0]
+
+ vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
+ vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
+
+ ; see note on shifting above
+ vshr.s16 q10, q10, #1
+ vshr.s16 q11, q11, #1
+
+ ; q10: 1 + 1 * cospi : d1/temp1
+ ; q11: 3 + 3 * cospi : c1/temp2
+ vqadd.s16 q10, q5, q10
+ vqadd.s16 q11, q7, q11
+
+ ; q8: c1 = temp1 - temp2
+ ; q9: d1 = temp1 + temp2
+ vqsub.s16 q8, q8, q11
+ vqadd.s16 q9, q10, q9
+
+ ; a1+d1
+ ; b1+c1
+ ; b1-c1
+ ; a1-d1
+ vqadd.s16 q4, q2, q9
+ vqadd.s16 q5, q3, q8
+ vqsub.s16 q6, q3, q8
+ vqsub.s16 q7, q2, q9
+
+ ; +4 >> 3 (rounding)
+ vrshr.s16 q4, q4, #3 ; lo
+ vrshr.s16 q5, q5, #3
+ vrshr.s16 q6, q6, #3 ; hi
+ vrshr.s16 q7, q7, #3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ ; adding pre
+ ; input is still packed. pre was read interleaved
+ vaddw.u8 q4, q4, d28
+ vaddw.u8 q5, q5, d29
+ vaddw.u8 q6, q6, d30
+ vaddw.u8 q7, q7, d31
+
+ vmov.i16 q14, #0
+ vmov q15, q14
+ vst1.16 {q14, q15}, [r0] ; write over high input
+ sub r0, r0, #32
+ vst1.16 {q14, q15}, [r0] ; write over low input
+
+ ;saturate and narrow
+ vqmovun.s16 d0, q4 ; lo
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6 ; hi
+ vqmovun.s16 d3, q7
+
+ ldr r1, [sp] ; stride
+ add r2, r3, #4 ; hi
+ vst1.32 {d0[0]}, [r3], r1 ; lo
+ vst1.32 {d0[1]}, [r2], r1 ; hi
+ vst1.32 {d1[0]}, [r3], r1
+ vst1.32 {d1[1]}, [r2], r1
+ vst1.32 {d2[0]}, [r3], r1
+ vst1.32 {d2[1]}, [r2], r1
+ vst1.32 {d3[0]}, [r3]
+ vst1.32 {d3[1]}, [r2]
+
+ bx lr
+
+ ENDP ; |idct_dequant_dc_full_2x_neon|
+
+; Constant Pool
+_CONSTANTS_ DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2 DCD 0x4546
+
+ END
diff --git a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
new file mode 100644
index 000000000..85fff11b3
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
@@ -0,0 +1,198 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_full_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
+; unsigned char *dst, int pitch, int stride);
+; r0 *q,
+; r1 *dq,
+; r2 *pre
+; r3 *dst
+; sp pitch
+; sp+4 stride
+|idct_dequant_full_2x_neon| PROC
+ vld1.16 {q0, q1}, [r1] ; dq (same l/r)
+ vld1.16 {q2, q3}, [r0] ; l q
+ ldr r1, [sp] ; pitch
+ add r0, r0, #32
+ vld1.16 {q4, q5}, [r0] ; r q
+ add r12, r2, #4
+ ; interleave the predictors
+ vld1.32 {d28[0]}, [r2], r1 ; l pre
+ vld1.32 {d28[1]}, [r12], r1 ; r pre
+ vld1.32 {d29[0]}, [r2], r1
+ vld1.32 {d29[1]}, [r12], r1
+ vld1.32 {d30[0]}, [r2], r1
+ vld1.32 {d30[1]}, [r12], r1
+ vld1.32 {d31[0]}, [r2]
+ vld1.32 {d31[1]}, [r12]
+
+ ldr r2, _CONSTANTS_
+
+ ; dequant: q[i] = q[i] * dq[i]
+ vmul.i16 q2, q2, q0
+ vmul.i16 q3, q3, q1
+ vmul.i16 q4, q4, q0
+ vmul.i16 q5, q5, q1
+
+ vld1.16 {d0}, [r2]
+
+ ; q2: l0r0 q3: l8r8
+ ; q4: l4r4 q5: l12r12
+ vswp d5, d8
+ vswp d7, d10
+
+ ; _CONSTANTS_ * 4,12 >> 16
+ ; q6: 4 * sinpi : c1/temp1
+ ; q7: 12 * sinpi : d1/temp2
+ ; q8: 4 * cospi
+ ; q9: 12 * cospi
+ vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q7, q5, d0[2]
+ vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q9, q5, d0[0]
+
+ vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
+ vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
+
+ ; vqdmulh only accepts signed values. this was a problem because
+ ; our constant had the high bit set, and was treated as a negative value.
+ ; vqdmulh also doubles the value before it shifts by 16. we need to
+ ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+ ; so we can shift the constant without losing precision. this avoids
+ ; shift again afterward, but also avoids the sign issue. win win!
+ ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+ ; pre-shift it
+ vshr.s16 q8, q8, #1
+ vshr.s16 q9, q9, #1
+
+ ; q4: 4 + 4 * cospi : d1/temp1
+ ; q5: 12 + 12 * cospi : c1/temp2
+ vqadd.s16 q4, q4, q8
+ vqadd.s16 q5, q5, q9
+
+ ; c1 = temp1 - temp2
+ ; d1 = temp1 + temp2
+ vqsub.s16 q2, q6, q5
+ vqadd.s16 q3, q4, q7
+
+ ; [0]: a1+d1
+ ; [1]: b1+c1
+ ; [2]: b1-c1
+ ; [3]: a1-d1
+ vqadd.s16 q4, q10, q3
+ vqadd.s16 q5, q11, q2
+ vqsub.s16 q6, q11, q2
+ vqsub.s16 q7, q10, q3
+
+ ; rotate
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+ ; idct loop 2
+ ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+ ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+ ; q6: l 2, 6,10,14 r 2, 6,10,14
+ ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+ ; q8: 1 * sinpi : c1/temp1
+ ; q9: 3 * sinpi : d1/temp2
+ ; q10: 1 * cospi
+ ; q11: 3 * cospi
+ vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q9, q7, d0[2]
+ vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q11, q7, d0[0]
+
+ vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
+ vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
+
+ ; see note on shifting above
+ vshr.s16 q10, q10, #1
+ vshr.s16 q11, q11, #1
+
+ ; q10: 1 + 1 * cospi : d1/temp1
+ ; q11: 3 + 3 * cospi : c1/temp2
+ vqadd.s16 q10, q5, q10
+ vqadd.s16 q11, q7, q11
+
+ ; q8: c1 = temp1 - temp2
+ ; q9: d1 = temp1 + temp2
+ vqsub.s16 q8, q8, q11
+ vqadd.s16 q9, q10, q9
+
+ ; a1+d1
+ ; b1+c1
+ ; b1-c1
+ ; a1-d1
+ vqadd.s16 q4, q2, q9
+ vqadd.s16 q5, q3, q8
+ vqsub.s16 q6, q3, q8
+ vqsub.s16 q7, q2, q9
+
+ ; +4 >> 3 (rounding)
+ vrshr.s16 q4, q4, #3 ; lo
+ vrshr.s16 q5, q5, #3
+ vrshr.s16 q6, q6, #3 ; hi
+ vrshr.s16 q7, q7, #3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ ; adding pre
+ ; input is still packed. pre was read interleaved
+ vaddw.u8 q4, q4, d28
+ vaddw.u8 q5, q5, d29
+ vaddw.u8 q6, q6, d30
+ vaddw.u8 q7, q7, d31
+
+ vmov.i16 q14, #0
+ vmov q15, q14
+ vst1.16 {q14, q15}, [r0] ; write over high input
+ sub r0, r0, #32
+ vst1.16 {q14, q15}, [r0] ; write over low input
+
+ ;saturate and narrow
+ vqmovun.s16 d0, q4 ; lo
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6 ; hi
+ vqmovun.s16 d3, q7
+
+ ldr r1, [sp, #4] ; stride
+ add r2, r3, #4 ; hi
+ vst1.32 {d0[0]}, [r3], r1 ; lo
+ vst1.32 {d0[1]}, [r2], r1 ; hi
+ vst1.32 {d1[0]}, [r3], r1
+ vst1.32 {d1[1]}, [r2], r1
+ vst1.32 {d2[0]}, [r3], r1
+ vst1.32 {d2[1]}, [r2], r1
+ vst1.32 {d3[0]}, [r3]
+ vst1.32 {d3[1]}, [r2]
+
+ bx lr
+
+ ENDP ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+_CONSTANTS_ DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2 DCD 0x4546
+
+ END
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index c72bc0330..c851aa7e5 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -45,7 +45,7 @@ typedef struct
const unsigned char *source, unsigned int source_sz)
#define prototype_dbool_fill(sym) void sym(BOOL_DECODER *br)
#define prototype_dbool_debool(sym) int sym(BOOL_DECODER *br, int probability)
-#define prototype_dbool_devalue(sym) int sym(BOOL_DECODER *br, int bits);
+#define prototype_dbool_devalue(sym) int sym(BOOL_DECODER *br, int bits)
#if ARCH_ARM
#include "arm/dboolhuff_arm.h"
@@ -84,16 +84,17 @@ typedef struct vp8_dboolhuff_rtcd_vtable {
vp8_dbool_devalue_fn_t devalue;
} vp8_dboolhuff_rtcd_vtable_t;
-// There are no processor-specific versions of these
-// functions right now. Disable RTCD to avoid using
-// function pointers which gives a speed boost
-//#ifdef ENABLE_RUNTIME_CPU_DETECT
-//#define DBOOLHUFF_INVOKE(ctx,fn) (ctx)->fn
-//#define IF_RTCD(x) (x)
-//#else
+/* There are no processor-specific versions of these
+ * functions right now. Disable RTCD to avoid using
+ * function pointers which gives a speed boost
+ */
+/*#ifdef ENABLE_RUNTIME_CPU_DETECT
+#define DBOOLHUFF_INVOKE(ctx,fn) (ctx)->fn
+#define IF_RTCD(x) (x)
+#else*/
#define DBOOLHUFF_INVOKE(ctx,fn) vp8_dbool_##fn
#define IF_RTCD(x) NULL
-//#endif
+/*#endif*/
DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index e9281f7ae..203d72dd2 100755..100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -51,10 +51,10 @@ static int vp8_read_uv_mode(vp8_reader *bc, const vp8_prob *p)
static void vp8_read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
{
- // Is segmentation enabled
+ /* Is segmentation enabled */
if (x->segmentation_enabled && x->update_mb_segmentation_map)
{
- // If so then read the segment id.
+ /* If so then read the segment id. */
if (vp8_read(r, x->mb_segment_tree_probs[0]))
mi->segment_id = (unsigned char)(2 + vp8_read(r, x->mb_segment_tree_probs[2]));
else
@@ -70,14 +70,15 @@ static void vp8_kfread_modes(VP8D_COMP *pbi, MODE_INFO *m, int mb_row, int mb_co
{
MB_PREDICTION_MODE y_mode;
- // Read the Macroblock segmentation map if it is being updated explicitly this frame (reset to 0 above by default)
- // By default on a key frame reset all MBs to segment 0
+ /* Read the Macroblock segmentation map if it is being updated explicitly this frame (reset to 0 above by default)
+ * By default on a key frame reset all MBs to segment 0
+ */
m->mbmi.segment_id = 0;
if (pbi->mb.update_mb_segmentation_map)
vp8_read_mb_features(bc, &m->mbmi, &pbi->mb);
- // Read the macroblock coeff skip flag if this feature is in use, else default to 0
+ /* Read the macroblock coeff skip flag if this feature is in use, else default to 0 */
if (pbi->common.mb_no_coeff_skip)
m->mbmi.mb_skip_coeff = vp8_read(bc, pbi->prob_skip_false);
else
@@ -306,8 +307,9 @@ void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
mbmi->need_to_clamp_mvs = 0;
- // Distance of Mb to the various image edges.
- // These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units
+ /* Distance of Mb to the various image edges.
+ * These are specified to 8th pel as they are always compared to MV values that are in 1/8th pel units
+ */
pbi->mb.mb_to_left_edge =
mb_to_left_edge = -((mb_col * 16) << 3);
mb_to_left_edge -= LEFT_TOP_MARGIN;
@@ -316,11 +318,11 @@ void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
- // If required read in new segmentation data for this MB
+ /* If required read in new segmentation data for this MB */
if (pbi->mb.update_mb_segmentation_map)
vp8_read_mb_features(bc, mbmi, &pbi->mb);
- // Read the macroblock coeff skip flag if this feature is in use, else default to 0
+ /* Read the macroblock coeff skip flag if this feature is in use, else default to 0 */
if (pbi->common.mb_no_coeff_skip)
mbmi->mb_skip_coeff = vp8_read(bc, pbi->prob_skip_false);
else
@@ -362,7 +364,7 @@ void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mv_contz = vp8_mv_cont(&(vp8_left_bmi(mi, k)->mv.as_mv), &(vp8_above_bmi(mi, k, mis)->mv.as_mv));
- switch (bmi.mode = (B_PREDICTION_MODE) sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) //pc->fc.sub_mv_ref_prob))
+ switch (bmi.mode = (B_PREDICTION_MODE) sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) /*pc->fc.sub_mv_ref_prob))*/
{
case NEW4X4:
read_mv(bc, mv, (const MV_CONTEXT *) mvc);
@@ -394,7 +396,7 @@ void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
break;
}
- mbmi->need_to_clamp_mvs = (mv->col < mb_to_left_edge) ? 1 : 0;
+ mbmi->need_to_clamp_mvs |= (mv->col < mb_to_left_edge) ? 1 : 0;
mbmi->need_to_clamp_mvs |= (mv->col > mb_to_right_edge) ? 1 : 0;
mbmi->need_to_clamp_mvs |= (mv->row < mb_to_top_edge) ? 1 : 0;
mbmi->need_to_clamp_mvs |= (mv->row > mb_to_bottom_edge) ? 1 : 0;
@@ -425,7 +427,7 @@ void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
case NEARMV:
*mv = nearby;
- // Clip "next_nearest" so that it does not extend to far out of image
+ /* Clip "next_nearest" so that it does not extend too far out of image */
mv->col = (mv->col < mb_to_left_edge) ? mb_to_left_edge : mv->col;
mv->col = (mv->col > mb_to_right_edge) ? mb_to_right_edge : mv->col;
mv->row = (mv->row < mb_to_top_edge) ? mb_to_top_edge : mv->row;
@@ -434,7 +436,7 @@ void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
case NEARESTMV:
*mv = nearest;
- // Clip "next_nearest" so that it does not extend to far out of image
+ /* Clip "next_nearest" so that it does not extend too far out of image */
mv->col = (mv->col < mb_to_left_edge) ? mb_to_left_edge : mv->col;
mv->col = (mv->col > mb_to_right_edge) ? mb_to_right_edge : mv->col;
mv->row = (mv->row < mb_to_top_edge) ? mb_to_top_edge : mv->row;
@@ -462,12 +464,12 @@ void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
propagate_mv: /* same MV throughout */
{
- //int i=0;
- //do
- //{
- // mi->bmi[i].mv.as_mv = *mv;
- //}
- //while( ++i < 16);
+ /*int i=0;
+ do
+ {
+ mi->bmi[i].mv.as_mv = *mv;
+ }
+ while( ++i < 16);*/
mi->bmi[0].mv.as_mv = *mv;
mi->bmi[1].mv.as_mv = *mv;
@@ -541,16 +543,16 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi)
while (++mb_col < pbi->common.mb_cols)
{
-// vp8_read_mb_modes_mv(pbi, xd->mode_info_context, &xd->mode_info_context->mbmi, mb_row, mb_col);
+ /*vp8_read_mb_modes_mv(pbi, xd->mode_info_context, &xd->mode_info_context->mbmi, mb_row, mb_col);*/
if(pbi->common.frame_type == KEY_FRAME)
vp8_kfread_modes(pbi, mi, mb_row, mb_col);
else
vp8_read_mb_modes_mv(pbi, mi, &mi->mbmi, mb_row, mb_col);
- mi++; // next macroblock
+ mi++; /* next macroblock */
}
- mi++; // skip left predictor each row
+ mi++; /* skip left predictor each row */
}
}
diff --git a/vp8/decoder/decoderthreading.h b/vp8/decoder/decoderthreading.h
index c8e3f02f9..25dee8fe8 100644
--- a/vp8/decoder/decoderthreading.h
+++ b/vp8/decoder/decoderthreading.h
@@ -15,12 +15,12 @@
#ifndef _DECODER_THREADING_H
#define _DECODER_THREADING_H
-
-extern void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
- MACROBLOCKD *xd);
-extern void vp8_mt_loop_filter_frame(VP8D_COMP *pbi);
-extern void vp8_stop_lfthread(VP8D_COMP *pbi);
-extern void vp8_start_lfthread(VP8D_COMP *pbi);
+#if CONFIG_MULTITHREAD
+extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
+extern int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
+#endif
+
#endif
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 4f5b7c7a2..4702faeed 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -21,7 +21,8 @@
#include "alloccommon.h"
#include "entropymode.h"
#include "quant_common.h"
-
+#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/yv12extend.h"
#include "setupintrarecon.h"
#include "decodemv.h"
@@ -39,56 +40,53 @@
void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
{
- int r, c;
int i;
int Q;
VP8_COMMON *const pc = & pbi->common;
for (Q = 0; Q < QINDEX_RANGE; Q++)
{
- pc->Y1dequant[Q][0][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q);
- pc->Y2dequant[Q][0][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
- pc->UVdequant[Q][0][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
+ pc->Y1dequant[Q][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q);
+ pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
+ pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
- // all the ac values = ;
+ /* all the ac values = ; */
for (i = 1; i < 16; i++)
{
int rc = vp8_default_zig_zag1d[i];
- r = (rc >> 2);
- c = (rc & 3);
- pc->Y1dequant[Q][r][c] = (short)vp8_ac_yquant(Q);
- pc->Y2dequant[Q][r][c] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
- pc->UVdequant[Q][r][c] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
+ pc->Y1dequant[Q][rc] = (short)vp8_ac_yquant(Q);
+ pc->Y2dequant[Q][rc] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
+ pc->UVdequant[Q][rc] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
}
}
}
-static void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
+void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
{
int i;
int QIndex;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
VP8_COMMON *const pc = & pbi->common;
- // Decide whether to use the default or alternate baseline Q value.
+ /* Decide whether to use the default or alternate baseline Q value. */
if (xd->segmentation_enabled)
{
- // Abs Value
+ /* Abs Value */
if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
- // Delta Value
+ /* Delta Value */
else
{
QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
- QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; // Clamp to valid range
+ QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; /* Clamp to valid range */
}
}
else
QIndex = pc->base_qindex;
- // Set up the block level dequant pointers
+ /* Set up the block level dequant pointers */
for (i = 0; i < 16; i++)
{
xd->block[i].dequant = pc->Y1dequant[QIndex];
@@ -109,8 +107,9 @@ static void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
#define RTCD_VTABLE(x) NULL
#endif
-//skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
-// to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
+/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
+ * to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
+ */
static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
{
if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
@@ -158,7 +157,7 @@ static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
mv->row = (2*mv->row > xd->mb_to_bottom_edge + (18 << 3)) ? (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
}
-static void clamp_mvs(MACROBLOCKD *xd)
+void clamp_mvs(MACROBLOCKD *xd)
{
if (xd->mode_info_context->mbmi.mode == SPLITMV)
{
@@ -209,7 +208,7 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
if (xd->segmentation_enabled)
mb_init_dequantizer(pbi, xd);
- // do prediction
+ /* do prediction */
if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
{
vp8_build_intra_predictors_mbuv(xd);
@@ -226,13 +225,13 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
vp8_build_inter_predictors_mb(xd);
}
- // dequantization and idct
+ /* dequantization and idct */
if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
{
BLOCKD *b = &xd->block[24];
DEQUANT_INVOKE(&pbi->dequant, block)(b);
- // do 2nd order transform on the dc block
+ /* do 2nd order transform on the dc block */
if (xd->eobs[24] > 1)
{
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
@@ -252,7 +251,7 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
}
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
- (xd->qcoeff, &xd->block[0].dequant[0][0],
+ (xd->qcoeff, xd->block[0].dequant,
xd->predictor, xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
}
@@ -267,13 +266,13 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
if (xd->eobs[i] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)
- (b->qcoeff, &b->dequant[0][0], b->predictor,
+ (b->qcoeff, b->dequant, b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
- (b->qcoeff[0] * b->dequant[0][0], b->predictor,
+ (b->qcoeff[0] * b->dequant[0], b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
@@ -283,17 +282,18 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
else
{
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
- (xd->qcoeff, &xd->block[0].dequant[0][0],
+ (xd->qcoeff, xd->block[0].dequant,
xd->predictor, xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
- (xd->qcoeff+16*16, &xd->block[16].dequant[0][0],
+ (xd->qcoeff+16*16, xd->block[16].dequant,
xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}
+
static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
{
int ret_val = 0;
@@ -337,7 +337,7 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
recon_yoffset = mb_row * recon_y_stride * 16;
recon_uvoffset = mb_row * recon_uv_stride * 8;
- // reset above block coeffs
+ /* reset above block coeffs */
xd->above_context = pc->above_context;
xd->up_available = (mb_row != 0);
@@ -357,8 +357,9 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
}
}
- // Distance of Mb to the various image edges.
- // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ /* Distance of Mb to the various image edges.
+ * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ */
xd->mb_to_left_edge = -((mb_col * 16) << 3);
xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
@@ -368,7 +369,7 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
xd->left_available = (mb_col != 0);
- // Select the appropriate reference frame for this MB
+ /* Select the appropriate reference frame for this MB */
if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
ref_fb_idx = pc->lst_fb_idx;
else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
@@ -398,18 +399,15 @@ void vp8_decode_mb_row(VP8D_COMP *pbi,
xd->above_context++;
- pbi->current_mb_col_main = mb_col;
}
- // adjust to the next row of mbs
+ /* adjust to the next row of mbs */
vp8_extend_mb_row(
&pc->yv12_fb[dst_fb_idx],
xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
);
++xd->mode_info_context; /* skip prediction column */
-
- pbi->last_mb_row_decoded = mb_row;
}
@@ -449,7 +447,7 @@ static void setup_token_decoder(VP8D_COMP *pbi,
for (i = 0; i < num_part; i++)
{
const unsigned char *partition_size_ptr = cx_data + i * 3;
- unsigned int partition_size;
+ ptrdiff_t partition_size;
/* Calculate the length of this partition. The last partition
* size is implicit.
@@ -463,7 +461,8 @@ static void setup_token_decoder(VP8D_COMP *pbi,
partition_size = user_data_end - partition;
}
- if (partition + partition_size > user_data_end)
+ if (partition + partition_size > user_data_end
+ || partition + partition_size < partition)
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt partition "
"%d length", i + 1);
@@ -500,7 +499,7 @@ static void init_frame(VP8D_COMP *pbi)
if (pc->frame_type == KEY_FRAME)
{
- // Various keyframe initializations
+ /* Various keyframe initializations */
vpx_memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
vp8_init_mbmode_probs(pc);
@@ -508,22 +507,23 @@ static void init_frame(VP8D_COMP *pbi)
vp8_default_coef_probs(pc);
vp8_kf_default_bmode_probs(pc->kf_bmode_prob);
- // reset the segment feature data to 0 with delta coding (Default state).
+ /* reset the segment feature data to 0 with delta coding (Default state). */
vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
- // reset the mode ref deltasa for loop filter
+ /* reset the mode ref deltas for loop filter */
vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
- // All buffers are implicitly updated on key frames.
+ /* All buffers are implicitly updated on key frames. */
pc->refresh_golden_frame = 1;
pc->refresh_alt_ref_frame = 1;
pc->copy_buffer_to_gf = 0;
pc->copy_buffer_to_arf = 0;
- // Note that Golden and Altref modes cannot be used on a key frame so
- // ref_frame_sign_bias[] is undefined and meaningless
+ /* Note that Golden and Altref modes cannot be used on a key frame so
+ * ref_frame_sign_bias[] is undefined and meaningless
+ */
pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
}
@@ -534,7 +534,7 @@ static void init_frame(VP8D_COMP *pbi)
else
pc->mcomp_filter_type = BILINEAR;
- // To enable choice of different interploation filters
+ /* To enable choice of different interpolation filters */
if (pc->mcomp_filter_type == SIXTAP)
{
xd->subpixel_predict = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap4x4);
@@ -565,12 +565,15 @@ int vp8_decode_frame(VP8D_COMP *pbi)
MACROBLOCKD *const xd = & pbi->mb;
const unsigned char *data = (const unsigned char *)pbi->Source;
const unsigned char *const data_end = data + pbi->source_sz;
- int first_partition_length_in_bytes;
+ ptrdiff_t first_partition_length_in_bytes;
int mb_row;
int i, j, k, l;
const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
+ if (data_end - data < 3)
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet");
pc->frame_type = (FRAME_TYPE)(data[0] & 1);
pc->version = (data[0] >> 1) & 7;
pc->show_frame = (data[0] >> 4) & 1;
@@ -578,7 +581,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
(data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
data += 3;
- if (data + first_partition_length_in_bytes > data_end)
+ if (data + first_partition_length_in_bytes > data_end
+ || data + first_partition_length_in_bytes < data)
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt partition 0 length");
vp8_setup_version(pc);
@@ -588,7 +592,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
const int Width = pc->Width;
const int Height = pc->Height;
- // vet via sync code
+ /* vet via sync code */
if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid frame sync code");
@@ -601,6 +605,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
if (Width != pc->Width || Height != pc->Height)
{
+ int prev_mb_rows = pc->mb_rows;
+
if (pc->Width <= 0)
{
pc->Width = Width;
@@ -618,6 +624,11 @@ int vp8_decode_frame(VP8D_COMP *pbi)
if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height))
vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffers");
+
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd)
+ vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
+#endif
}
}
@@ -637,12 +648,12 @@ int vp8_decode_frame(VP8D_COMP *pbi)
pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc);
}
- // Is segmentation enabled
+ /* Is segmentation enabled */
xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc);
if (xd->segmentation_enabled)
{
- // Signal whether or not the segmentation map is being explicitly updated this frame.
+ /* Signal whether or not the segmentation map is being explicitly updated this frame. */
xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(bc);
xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc);
@@ -652,12 +663,12 @@ int vp8_decode_frame(VP8D_COMP *pbi)
vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
- // For each segmentation feature (Quant and loop filter level)
+ /* For each segmentation feature (Quant and loop filter level) */
for (i = 0; i < MB_LVL_MAX; i++)
{
for (j = 0; j < MAX_MB_SEGMENTS; j++)
{
- // Frame level data
+ /* Frame level data */
if (vp8_read_bit(bc))
{
xd->segment_feature_data[i][j] = (signed char)vp8_read_literal(bc, mb_feature_data_bits[i]);
@@ -673,57 +684,57 @@ int vp8_decode_frame(VP8D_COMP *pbi)
if (xd->update_mb_segmentation_map)
{
- // Which macro block level features are enabled
+ /* Which macro block level features are enabled */
vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
- // Read the probs used to decode the segment id for each macro block.
+ /* Read the probs used to decode the segment id for each macro block. */
for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
{
- // If not explicitly set value is defaulted to 255 by memset above
+ /* If not explicitly set value is defaulted to 255 by memset above */
if (vp8_read_bit(bc))
xd->mb_segment_tree_probs[i] = (vp8_prob)vp8_read_literal(bc, 8);
}
}
}
- // Read the loop filter level and type
+ /* Read the loop filter level and type */
pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc);
pc->filter_level = vp8_read_literal(bc, 6);
pc->sharpness_level = vp8_read_literal(bc, 3);
- // Read in loop filter deltas applied at the MB level based on mode or ref frame.
+ /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
xd->mode_ref_lf_delta_update = 0;
xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(bc);
if (xd->mode_ref_lf_delta_enabled)
{
- // Do the deltas need to be updated
+ /* Do the deltas need to be updated */
xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(bc);
if (xd->mode_ref_lf_delta_update)
{
- // Send update
+ /* Send update */
for (i = 0; i < MAX_REF_LF_DELTAS; i++)
{
if (vp8_read_bit(bc))
{
- //sign = vp8_read_bit( bc );
+ /*sign = vp8_read_bit( bc );*/
xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
- if (vp8_read_bit(bc)) // Apply sign
+ if (vp8_read_bit(bc)) /* Apply sign */
xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
}
}
- // Send update
+ /* Send update */
for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
{
if (vp8_read_bit(bc))
{
- //sign = vp8_read_bit( bc );
+ /*sign = vp8_read_bit( bc );*/
xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
- if (vp8_read_bit(bc)) // Apply sign
+ if (vp8_read_bit(bc)) /* Apply sign */
xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
}
}
@@ -733,11 +744,11 @@ int vp8_decode_frame(VP8D_COMP *pbi)
setup_token_decoder(pbi, data + first_partition_length_in_bytes);
xd->current_bc = &pbi->bc2;
- // Read the default quantizers.
+ /* Read the default quantizers. */
{
int Q, q_update;
- Q = vp8_read_literal(bc, 7); // AC 1st order Q = default
+ Q = vp8_read_literal(bc, 7); /* AC 1st order Q = default */
pc->base_qindex = Q;
q_update = 0;
pc->y1dc_delta_q = get_delta_q(bc, pc->y1dc_delta_q, &q_update);
@@ -749,20 +760,21 @@ int vp8_decode_frame(VP8D_COMP *pbi)
if (q_update)
vp8cx_init_de_quantizer(pbi);
- // MB level dequantizer setup
+ /* MB level dequantizer setup */
mb_init_dequantizer(pbi, &pbi->mb);
}
- // Determine if the golden frame or ARF buffer should be updated and how.
- // For all non key frames the GF and ARF refresh flags and sign bias
- // flags must be set explicitly.
+ /* Determine if the golden frame or ARF buffer should be updated and how.
+ * For all non key frames the GF and ARF refresh flags and sign bias
+ * flags must be set explicitly.
+ */
if (pc->frame_type != KEY_FRAME)
{
- // Should the GF or ARF be updated from the current frame
+ /* Should the GF or ARF be updated from the current frame */
pc->refresh_golden_frame = vp8_read_bit(bc);
pc->refresh_alt_ref_frame = vp8_read_bit(bc);
- // Buffer to buffer copy flags.
+ /* Buffer to buffer copy flags. */
pc->copy_buffer_to_gf = 0;
if (!pc->refresh_golden_frame)
@@ -800,7 +812,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
{
- // read coef probability tree
+ /* read coef probability tree */
for (i = 0; i < BLOCK_TYPES; i++)
for (j = 0; j < COEF_BANDS; j++)
@@ -821,17 +833,18 @@ int vp8_decode_frame(VP8D_COMP *pbi)
vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
- // set up frame new frame for intra coded blocks
- vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
+ /* set up frame new frame for intra coded blocks */
+ if (!(pbi->b_multithreaded_rd) || pc->multi_token_partition == ONE_PARTITION || !(pc->filter_level))
+ vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
vp8_setup_block_dptrs(xd);
vp8_build_block_doffsets(xd);
- // clear out the coeff buffer
+ /* clear out the coeff buffer */
vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
- // Read the mb_no_coeff_skip flag
+ /* Read the mb_no_coeff_skip flag */
pc->mb_no_coeff_skip = (int)vp8_read_bit(bc);
@@ -841,20 +854,25 @@ int vp8_decode_frame(VP8D_COMP *pbi)
vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));
-
- if (pbi->b_multithreaded_lf && pc->filter_level != 0)
- vp8_start_lfthread(pbi);
-
if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
{
- vp8_mtdecode_mb_rows(pbi, xd);
+ vp8mt_decode_mb_rows(pbi, xd);
+ if(pbi->common.filter_level)
+ {
+ /*vp8_mt_loop_filter_frame(pbi);*/ /*cm, &pbi->mb, cm->filter_level);*/
+
+ pc->last_frame_type = pc->frame_type;
+ pc->last_filter_type = pc->filter_type;
+ pc->last_sharpness_level = pc->sharpness_level;
+ }
+ vp8_yv12_extend_frame_borders_ptr(&pc->yv12_fb[pc->new_fb_idx]); /*cm->frame_to_show);*/
}
else
{
int ibc = 0;
int num_part = 1 << pc->multi_token_partition;
- // Decode the individual macro block
+ /* Decode the individual macro block */
for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
{
@@ -869,16 +887,14 @@ int vp8_decode_frame(VP8D_COMP *pbi)
vp8_decode_mb_row(pbi, pc, mb_row, xd);
}
-
- pbi->last_mb_row_decoded = mb_row;
}
stop_token_decoder(pbi);
- // vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes \n",bc->pos+pbi->bc2.pos);
+ /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes \n",bc->pos+pbi->bc2.pos); */
- // If this was a kf or Gf note the Q used
+ /* If this was a kf or Gf note the Q used */
if ((pc->frame_type == KEY_FRAME) ||
pc->refresh_golden_frame || pc->refresh_alt_ref_frame)
{
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index 8cfa2a32e..84a9fd943 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -24,7 +24,7 @@ void vp8_dequantize_b_c(BLOCKD *d)
int i;
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
- short *DQC = &d->dequant[0][0];
+ short *DQC = d->dequant;
for (i = 0; i < 16; i++)
{
@@ -45,7 +45,7 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
input[i] = dq[i] * input[i];
}
- // the idct halves ( >> 1) the pitch
+ /* the idct halves ( >> 1) the pitch */
vp8_short_idct4x4llm_c(input, output, 4 << 1);
vpx_memset(input, 0, 32);
@@ -87,7 +87,7 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
input[i] = dq[i] * input[i];
}
- // the idct halves ( >> 1) the pitch
+ /* the idct halves ( >> 1) the pitch */
vp8_short_idct4x4llm_c(input, output, 4 << 1);
vpx_memset(input, 0, 32);
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index 65c7d5370..7d013d240 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -44,18 +44,18 @@ typedef struct
DECLARE_ALIGNED(16, static const TOKENEXTRABITS, vp8d_token_extra_bits2[MAX_ENTROPY_TOKENS]) =
{
- { 0, -1, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //ZERO_TOKEN
- { 1, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //ONE_TOKEN
- { 2, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //TWO_TOKEN
- { 3, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //THREE_TOKEN
- { 4, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //FOUR_TOKEN
- { 5, 0, { 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY1
- { 7, 1, { 145, 165, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY2
- { 11, 2, { 140, 148, 173, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY3
- { 19, 3, { 135, 140, 155, 176, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY4
- { 35, 4, { 130, 134, 141, 157, 180, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY5
- { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0 } }, //DCT_VAL_CATEGORY6
- { 0, -1, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, // EOB TOKEN
+ { 0, -1, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, /* ZERO_TOKEN */
+ { 1, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, /* ONE_TOKEN */
+ { 2, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, /* TWO_TOKEN */
+ { 3, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, /* THREE_TOKEN */
+ { 4, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, /* FOUR_TOKEN */
+ { 5, 0, { 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, /* DCT_VAL_CATEGORY1 */
+ { 7, 1, { 145, 165, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, /* DCT_VAL_CATEGORY2 */
+ { 11, 2, { 140, 148, 173, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, /* DCT_VAL_CATEGORY3 */
+ { 19, 3, { 135, 140, 155, 176, 0, 0, 0, 0, 0, 0, 0, 0 } }, /* DCT_VAL_CATEGORY4 */
+ { 35, 4, { 130, 134, 141, 157, 180, 0, 0, 0, 0, 0, 0, 0 } }, /* DCT_VAL_CATEGORY5 */
+ { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0 } }, /* DCT_VAL_CATEGORY6 */
+ { 0, -1, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, /* EOB TOKEN */
};
@@ -75,13 +75,14 @@ void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
}
#if CONFIG_ARM_ASM_DETOK
-// mashup of vp8_block2left and vp8_block2above so we only need one pointer
-// for the assembly version.
+/* mashup of vp8_block2left and vp8_block2above so we only need one pointer
+ * for the assembly version.
+ */
DECLARE_ALIGNED(16, const UINT8, vp8_block2leftabove[25*2]) =
{
- //vp8_block2left
+ /* vp8_block2left */
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- //vp8_block2above
+ /* vp8_block2above */
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
};
@@ -297,7 +298,7 @@ BLOCK_LOOP:
c = (INT16)(!type);
-// Dest = ((A)!=0) + ((B)!=0);
+ /*Dest = ((A)!=0) + ((B)!=0);*/
VP8_COMBINEENTROPYCONTEXTS(v, *a, *l);
Prob = coef_probs;
Prob += v * ENTROPY_NODES;
@@ -387,7 +388,7 @@ ONE_CONTEXT_NODE_0_:
qcoeff_ptr [ scan[15] ] = (INT16) v;
BLOCK_FINISHED:
- *a = *l = ((eobs[i] = c) != !type); // any nonzero data?
+ *a = *l = ((eobs[i] = c) != !type); /* any nonzero data? */
eobtotal += c;
qcoeff_ptr += 16;
@@ -422,4 +423,4 @@ BLOCK_FINISHED:
return eobtotal;
}
-#endif //!CONFIG_ASM_DETOK
+#endif /*!CONFIG_ASM_DETOK*/
diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c
index 60f2af5b8..2e284729b 100644
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -14,10 +14,11 @@
#include "onyxd_int.h"
extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
+extern void vp8_arch_arm_decode_init(VP8D_COMP *pbi);
void vp8_dmachine_specific_config(VP8D_COMP *pbi)
{
- // Pure C:
+ /* Pure C: */
#if CONFIG_RUNTIME_CPU_DETECT
pbi->mb.rtcd = &pbi->common.rtcd;
pbi->dequant.block = vp8_dequantize_b_c;
@@ -28,7 +29,7 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
-#if 0 //For use with RTCD, when implemented
+#if 0 /*For use with RTCD, when implemented*/
pbi->dboolhuff.debool = vp8dx_decode_bool_c;
pbi->dboolhuff.devalue = vp8dx_decode_value_c;
#endif
@@ -37,4 +38,8 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
#if ARCH_X86 || ARCH_X86_64
vp8_arch_x86_decode_init(pbi);
#endif
+
+#if ARCH_ARM
+ vp8_arch_arm_decode_init(pbi);
+#endif
}
diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c
index c6a42578a..c98bd5bb8 100644
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c
@@ -12,6 +12,14 @@
#include "idct.h"
#include "dequantize.h"
+void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride,
+ int Dc);
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+ unsigned char *dst_ptr, int pitch, int stride);
+
void vp8_dequant_dc_idct_add_y_block_c
(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride, char *eobs, short *dc)
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 1651784cf..6eda45e4a 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -30,6 +30,9 @@
#include "systemdependent.h"
#include "vpx_ports/vpx_timer.h"
#include "detokenize.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
extern void vp8_init_loop_filter(VP8_COMMON *cm);
extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
@@ -110,12 +113,13 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
pbi->common.current_video_frame = 0;
pbi->ready_for_new_data = 1;
- pbi->CPUFreq = 0; //vp8_get_processor_freq();
+ pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/
pbi->max_threads = oxcf->max_threads;
vp8_decoder_create_threads(pbi);
- //vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
- // unnecessary calling of vp8cx_init_de_quantizer() for every frame.
+ /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
+ * unnecessary calling of vp8cx_init_de_quantizer() for every frame.
+ */
vp8cx_init_de_quantizer(pbi);
{
@@ -142,6 +146,10 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
if (!pbi)
return;
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd)
+ vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
+#endif
vp8_decoder_remove_threads(pbi);
vp8_remove_common(&pbi->common);
vpx_free(pbi);
@@ -216,11 +224,10 @@ int vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_C
return 0;
}
-//For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
+/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/
#if HAVE_ARMV7
extern void vp8_push_neon(INT64 *store);
extern void vp8_pop_neon(INT64 *store);
-static INT64 dx_store_reg[8];
#endif
static int get_free_fb (VP8_COMMON *cm)
@@ -244,7 +251,7 @@ static void ref_cnt_fb (int *buf, int *idx, int new_idx)
buf[new_idx]++;
}
-// If any buffer copy / swapping is signalled it should be done here.
+/* If any buffer copy / swapping is signalled it should be done here. */
static int swap_frame_buffers (VP8_COMMON *cm)
{
int fb_to_update_with, err = 0;
@@ -254,10 +261,11 @@ static int swap_frame_buffers (VP8_COMMON *cm)
else
fb_to_update_with = cm->new_fb_idx;
- // The alternate reference frame or golden frame can be updated
- // using the new, last, or golden/alt ref frame. If it
- // is updated using the newly decoded frame it is a refresh.
- // An update using the last or golden/alt ref frame is a copy.
+ /* The alternate reference frame or golden frame can be updated
+ * using the new, last, or golden/alt ref frame. If it
+ * is updated using the newly decoded frame it is a refresh.
+ * An update using the last or golden/alt ref frame is a copy.
+ */
if (cm->copy_buffer_to_arf)
{
int new_fb = 0;
@@ -308,13 +316,16 @@ static int swap_frame_buffers (VP8_COMMON *cm)
int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, INT64 time_stamp)
{
+#if HAVE_ARMV7
+ INT64 dx_store_reg[8];
+#endif
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
VP8_COMMON *cm = &pbi->common;
int retcode = 0;
struct vpx_usec_timer timer;
-// if(pbi->ready_for_new_data == 0)
-// return -1;
+ /*if(pbi->ready_for_new_data == 0)
+ return -1;*/
if (ptr == 0)
{
@@ -323,102 +334,114 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
pbi->common.error.error_code = VPX_CODEC_OK;
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_push_neon(dx_store_reg);
+ }
+#endif
+
+ cm->new_fb_idx = get_free_fb (cm);
+
if (setjmp(pbi->common.error.jmp))
{
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(dx_store_reg);
+ }
+#endif
pbi->common.error.setjmp = 0;
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
return -1;
}
pbi->common.error.setjmp = 1;
-#if HAVE_ARMV7
- vp8_push_neon(dx_store_reg);
-#endif
-
vpx_usec_timer_start(&timer);
- //cm->current_video_frame++;
+ /*cm->current_video_frame++;*/
pbi->Source = source;
pbi->source_sz = size;
- cm->new_fb_idx = get_free_fb (cm);
-
retcode = vp8_decode_frame(pbi);
if (retcode < 0)
{
#if HAVE_ARMV7
- vp8_pop_neon(dx_store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(dx_store_reg);
+ }
#endif
pbi->common.error.error_code = VPX_CODEC_ERROR;
pbi->common.error.setjmp = 0;
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
return retcode;
}
- if (pbi->b_multithreaded_lf && pbi->common.filter_level != 0)
- vp8_stop_lfthread(pbi);
-
- if (swap_frame_buffers (cm))
+ if (pbi->b_multithreaded_rd && cm->multi_token_partition != ONE_PARTITION)
{
- pbi->common.error.error_code = VPX_CODEC_ERROR;
- pbi->common.error.setjmp = 0;
- return -1;
- }
-
-/*
- if (!pbi->b_multithreaded_lf)
+ if (swap_frame_buffers (cm))
+ {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(dx_store_reg);
+ }
+#endif
+ pbi->common.error.error_code = VPX_CODEC_ERROR;
+ pbi->common.error.setjmp = 0;
+ return -1;
+ }
+ } else
{
- struct vpx_usec_timer lpftimer;
- vpx_usec_timer_start(&lpftimer);
- // Apply the loop filter if appropriate.
-
- if (cm->filter_level > 0)
- vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
-
- vpx_usec_timer_mark(&lpftimer);
- pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
- }else{
- struct vpx_usec_timer lpftimer;
- vpx_usec_timer_start(&lpftimer);
- // Apply the loop filter if appropriate.
-
- if (cm->filter_level > 0)
- vp8_mt_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
-
- vpx_usec_timer_mark(&lpftimer);
- pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
- }
- if (cm->filter_level > 0) {
- cm->last_frame_type = cm->frame_type;
- cm->last_filter_type = cm->filter_type;
- cm->last_sharpness_level = cm->sharpness_level;
- }
-*/
+ if (swap_frame_buffers (cm))
+ {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(dx_store_reg);
+ }
+#endif
+ pbi->common.error.error_code = VPX_CODEC_ERROR;
+ pbi->common.error.setjmp = 0;
+ return -1;
+ }
- if(pbi->common.filter_level)
- {
- struct vpx_usec_timer lpftimer;
- vpx_usec_timer_start(&lpftimer);
- // Apply the loop filter if appropriate.
+ if(pbi->common.filter_level)
+ {
+ struct vpx_usec_timer lpftimer;
+ vpx_usec_timer_start(&lpftimer);
+ /* Apply the loop filter if appropriate. */
- if (pbi->b_multithreaded_lf && cm->multi_token_partition != ONE_PARTITION)
- vp8_mt_loop_filter_frame(pbi); //cm, &pbi->mb, cm->filter_level);
- else
vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
- vpx_usec_timer_mark(&lpftimer);
- pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
+ vpx_usec_timer_mark(&lpftimer);
+ pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
- cm->last_frame_type = cm->frame_type;
- cm->last_filter_type = cm->filter_type;
- cm->last_sharpness_level = cm->sharpness_level;
+ cm->last_frame_type = cm->frame_type;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+ vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
}
- vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-
#if 0
- // DEBUG code
- //vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
+ /* DEBUG code */
+ /*vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);*/
if (cm->current_video_frame <= 5)
write_dx_frame_to_file(cm->frame_to_show, cm->current_video_frame);
#endif
@@ -430,7 +453,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
pbi->time_decoding += pbi->decode_microseconds;
-// vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);
+ /*vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);*/
if (cm->show_frame)
cm->current_video_frame++;
@@ -473,7 +496,12 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
#endif
#if HAVE_ARMV7
- vp8_pop_neon(dx_store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(dx_store_reg);
+ }
#endif
pbi->common.error.setjmp = 0;
return retcode;
@@ -486,7 +514,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp,
if (pbi->ready_for_new_data == 1)
return ret;
- // ie no raw frame to show!!!
+ /* ie no raw frame to show!!! */
if (pbi->common.show_frame == 0)
return ret;
@@ -512,7 +540,7 @@ int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp,
ret = -1;
}
-#endif //!CONFIG_POSTPROC
+#endif /*!CONFIG_POSTPROC*/
vp8_clear_system_state();
return ret;
}
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index ad21ae3f5..7593edf27 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -88,30 +88,33 @@ typedef struct VP8Decompressor
unsigned int time_loop_filtering;
volatile int b_multithreaded_rd;
- volatile int b_multithreaded_lf;
int max_threads;
- int last_mb_row_decoded;
int current_mb_col_main;
int decoding_thread_count;
int allocated_decoding_thread_count;
- int *current_mb_col; //Each row remembers its already decoded column.
- int mt_baseline_filter_level[MAX_MB_SEGMENTS];
- // variable for threading
- DECLARE_ALIGNED(16, MACROBLOCKD, lpfmb);
+ /* variable for threading */
#if CONFIG_MULTITHREAD
- //pthread_t h_thread_lpf; // thread for postprocessing
- sem_t h_event_end_lpf; // Event for post_proc completed
- sem_t *h_event_start_lpf;
-#endif
+ int mt_baseline_filter_level[MAX_MB_SEGMENTS];
+ int sync_range;
+ int *mt_current_mb_col; /* Each row remembers its already decoded column. */
+
+ unsigned char **mt_yabove_row; /* mb_rows x width */
+ unsigned char **mt_uabove_row;
+ unsigned char **mt_vabove_row;
+ unsigned char **mt_yleft_col; /* mb_rows x 16 */
+ unsigned char **mt_uleft_col; /* mb_rows x 8 */
+ unsigned char **mt_vleft_col; /* mb_rows x 8 */
+
MB_ROW_DEC *mb_row_di;
- DECODETHREAD_DATA *de_thread_data;
-#if CONFIG_MULTITHREAD
+ DECODETHREAD_DATA *de_thread_data;
+
pthread_t *h_decoding_thread;
sem_t *h_event_start_decoding;
- sem_t h_event_end_decoding;
- // end of threading data
+ sem_t h_event_end_decoding;
+ /* end of threading data */
#endif
+
vp8_reader *mbc;
INT64 last_time_stamp;
int ready_for_new_data;
diff --git a/vp8/decoder/reconintra_mt.c b/vp8/decoder/reconintra_mt.c
new file mode 100644
index 000000000..ad4324b27
--- /dev/null
+++ b/vp8/decoder/reconintra_mt.c
@@ -0,0 +1,982 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxd_int.h"
+
+/* For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
+ * vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
+ */
+
+void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
+{
+#if CONFIG_MULTITHREAD
+ unsigned char *yabove_row; /* = x->dst.y_buffer - x->dst.y_stride; */
+ unsigned char *yleft_col;
+ unsigned char yleft_buf[16];
+ unsigned char ytop_left; /* = yabove_row[-1]; */
+ unsigned char *ypred_ptr = x->predictor;
+ int r, c, i;
+
+ if (pbi->common.filter_level)
+ {
+ yabove_row = pbi->mt_yabove_row[mb_row] + mb_col*16 +32;
+ yleft_col = pbi->mt_yleft_col[mb_row];
+ } else
+ {
+ yabove_row = x->dst.y_buffer - x->dst.y_stride;
+
+ for (i = 0; i < 16; i++)
+ yleft_buf[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
+ yleft_col = yleft_buf;
+ }
+
+ ytop_left = yabove_row[-1];
+
+ /* for Y */
+ switch (x->mode_info_context->mbmi.mode)
+ {
+ case DC_PRED:
+ {
+ int expected_dc;
+ int i;
+ int shift;
+ int average = 0;
+
+
+ if (x->up_available || x->left_available)
+ {
+ if (x->up_available)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ average += yabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+
+ for (i = 0; i < 16; i++)
+ {
+ average += yleft_col[i];
+ }
+
+ }
+
+
+
+ shift = 3 + x->up_available + x->left_available;
+ expected_dc = (average + (1 << (shift - 1))) >> shift;
+ }
+ else
+ {
+ expected_dc = 128;
+ }
+
+ vpx_memset(ypred_ptr, expected_dc, 256);
+ }
+ break;
+ case V_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
+ ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
+ ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
+ ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
+ ypred_ptr += 16;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ vpx_memset(ypred_ptr, yleft_col[r], 16);
+ ypred_ptr += 16;
+ }
+
+ }
+ break;
+ case TM_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ int pred = yleft_col[r] + yabove_row[ c] - ytop_left;
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ ypred_ptr[c] = pred;
+ }
+
+ ypred_ptr += 16;
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+#else
+ (void) pbi;
+ (void) x;
+ (void) mb_row;
+ (void) mb_col;
+#endif
+}
+
+void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
+{
+#if CONFIG_MULTITHREAD
+ unsigned char *yabove_row; /* = x->dst.y_buffer - x->dst.y_stride; */
+ unsigned char *yleft_col;
+ unsigned char yleft_buf[16];
+ unsigned char ytop_left; /* = yabove_row[-1]; */
+ unsigned char *ypred_ptr = x->predictor;
+ int r, c, i;
+
+ int y_stride = x->dst.y_stride;
+ ypred_ptr = x->dst.y_buffer; /*x->predictor;*/
+
+ if (pbi->common.filter_level)
+ {
+ yabove_row = pbi->mt_yabove_row[mb_row] + mb_col*16 +32;
+ yleft_col = pbi->mt_yleft_col[mb_row];
+ } else
+ {
+ yabove_row = x->dst.y_buffer - x->dst.y_stride;
+
+ for (i = 0; i < 16; i++)
+ yleft_buf[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
+ yleft_col = yleft_buf;
+ }
+
+ ytop_left = yabove_row[-1];
+
+ /* for Y */
+ switch (x->mode_info_context->mbmi.mode)
+ {
+ case DC_PRED:
+ {
+ int expected_dc;
+ int i;
+ int shift;
+ int average = 0;
+
+
+ if (x->up_available || x->left_available)
+ {
+ if (x->up_available)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ average += yabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+
+ for (i = 0; i < 16; i++)
+ {
+ average += yleft_col[i];
+ }
+
+ }
+
+
+
+ shift = 3 + x->up_available + x->left_available;
+ expected_dc = (average + (1 << (shift - 1))) >> shift;
+ }
+ else
+ {
+ expected_dc = 128;
+ }
+
+ /*vpx_memset(ypred_ptr, expected_dc, 256);*/
+ for (r = 0; r < 16; r++)
+ {
+ vpx_memset(ypred_ptr, expected_dc, 16);
+ ypred_ptr += y_stride; /*16;*/
+ }
+ }
+ break;
+ case V_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
+ ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
+ ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
+ ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
+ ypred_ptr += y_stride; /*16;*/
+ }
+ }
+ break;
+ case H_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ vpx_memset(ypred_ptr, yleft_col[r], 16);
+ ypred_ptr += y_stride; /*16;*/
+ }
+
+ }
+ break;
+ case TM_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ int pred = yleft_col[r] + yabove_row[ c] - ytop_left;
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ ypred_ptr[c] = pred;
+ }
+
+ ypred_ptr += y_stride; /*16;*/
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+#else
+ (void) pbi;
+ (void) x;
+ (void) mb_row;
+ (void) mb_col;
+#endif
+}
+
+void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
+{
+#if CONFIG_MULTITHREAD
+ unsigned char *uabove_row; /* = x->dst.u_buffer - x->dst.uv_stride; */
+ unsigned char *uleft_col; /*[16];*/
+ unsigned char uleft_buf[8];
+ unsigned char utop_left; /* = uabove_row[-1]; */
+ unsigned char *vabove_row; /* = x->dst.v_buffer - x->dst.uv_stride; */
+ unsigned char *vleft_col; /*[20];*/
+ unsigned char vleft_buf[8];
+ unsigned char vtop_left; /* = vabove_row[-1]; */
+ unsigned char *upred_ptr = &x->predictor[256];
+ unsigned char *vpred_ptr = &x->predictor[320];
+ int i, j;
+
+ if (pbi->common.filter_level)
+ {
+ uabove_row = pbi->mt_uabove_row[mb_row] + mb_col*8 +16;
+ vabove_row = pbi->mt_vabove_row[mb_row] + mb_col*8 +16;
+ uleft_col = pbi->mt_uleft_col[mb_row];
+ vleft_col = pbi->mt_vleft_col[mb_row];
+ } else
+ {
+ uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+ vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+
+ for (i = 0; i < 8; i++)
+ {
+ uleft_buf[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+ vleft_buf[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+ }
+ uleft_col = uleft_buf;
+ vleft_col = vleft_buf;
+ }
+ utop_left = uabove_row[-1];
+ vtop_left = vabove_row[-1];
+
+ switch (x->mode_info_context->mbmi.uv_mode)
+ {
+ case DC_PRED:
+ {
+ int expected_udc;
+ int expected_vdc;
+ int i;
+ int shift;
+ int Uaverage = 0;
+ int Vaverage = 0;
+
+ if (x->up_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uabove_row[i];
+ Vaverage += vabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uleft_col[i];
+ Vaverage += vleft_col[i];
+ }
+ }
+
+ if (!x->up_available && !x->left_available)
+ {
+ expected_udc = 128;
+ expected_vdc = 128;
+ }
+ else
+ {
+ shift = 2 + x->up_available + x->left_available;
+ expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+ expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+ }
+
+
+ vpx_memset(upred_ptr, expected_udc, 64);
+ vpx_memset(vpred_ptr, expected_vdc, 64);
+
+
+ }
+ break;
+ case V_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memcpy(upred_ptr, uabove_row, 8);
+ vpx_memcpy(vpred_ptr, vabove_row, 8);
+ upred_ptr += 8;
+ vpred_ptr += 8;
+ }
+
+ }
+ break;
+ case H_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memset(upred_ptr, uleft_col[i], 8);
+ vpx_memset(vpred_ptr, vleft_col[i], 8);
+ upred_ptr += 8;
+ vpred_ptr += 8;
+ }
+ }
+
+ break;
+ case TM_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++)
+ {
+ int predu = uleft_col[i] + uabove_row[j] - utop_left;
+ int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+
+ if (predu < 0)
+ predu = 0;
+
+ if (predu > 255)
+ predu = 255;
+
+ if (predv < 0)
+ predv = 0;
+
+ if (predv > 255)
+ predv = 255;
+
+ upred_ptr[j] = predu;
+ vpred_ptr[j] = predv;
+ }
+
+ upred_ptr += 8;
+ vpred_ptr += 8;
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+#else
+ (void) pbi;
+ (void) x;
+ (void) mb_row;
+ (void) mb_col;
+#endif
+}
+
+void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
+{
+#if CONFIG_MULTITHREAD
+ unsigned char *uabove_row; /* = x->dst.u_buffer - x->dst.uv_stride; */
+ unsigned char *uleft_col; /*[16];*/
+ unsigned char uleft_buf[8];
+ unsigned char utop_left; /* = uabove_row[-1]; */
+ unsigned char *vabove_row; /* = x->dst.v_buffer - x->dst.uv_stride; */
+ unsigned char *vleft_col; /*[20];*/
+ unsigned char vleft_buf[8];
+ unsigned char vtop_left; /* = vabove_row[-1]; */
+ unsigned char *upred_ptr = x->dst.u_buffer; /*&x->predictor[256];*/
+ unsigned char *vpred_ptr = x->dst.v_buffer; /*&x->predictor[320];*/
+ int uv_stride = x->dst.uv_stride;
+ int i, j;
+
+ if (pbi->common.filter_level)
+ {
+ uabove_row = pbi->mt_uabove_row[mb_row] + mb_col*8 +16;
+ vabove_row = pbi->mt_vabove_row[mb_row] + mb_col*8 +16;
+ uleft_col = pbi->mt_uleft_col[mb_row];
+ vleft_col = pbi->mt_vleft_col[mb_row];
+ } else
+ {
+ uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+ vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+
+ for (i = 0; i < 8; i++)
+ {
+ uleft_buf[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+ vleft_buf[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+ }
+ uleft_col = uleft_buf;
+ vleft_col = vleft_buf;
+ }
+ utop_left = uabove_row[-1];
+ vtop_left = vabove_row[-1];
+
+ switch (x->mode_info_context->mbmi.uv_mode)
+ {
+ case DC_PRED:
+ {
+ int expected_udc;
+ int expected_vdc;
+ int i;
+ int shift;
+ int Uaverage = 0;
+ int Vaverage = 0;
+
+ if (x->up_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uabove_row[i];
+ Vaverage += vabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uleft_col[i];
+ Vaverage += vleft_col[i];
+ }
+ }
+
+ if (!x->up_available && !x->left_available)
+ {
+ expected_udc = 128;
+ expected_vdc = 128;
+ }
+ else
+ {
+ shift = 2 + x->up_available + x->left_available;
+ expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+ expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+ }
+
+
+ /*vpx_memset(upred_ptr,expected_udc,64);
+ vpx_memset(vpred_ptr,expected_vdc,64);*/
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memset(upred_ptr, expected_udc, 8);
+ vpx_memset(vpred_ptr, expected_vdc, 8);
+ upred_ptr += uv_stride; /*8;*/
+ vpred_ptr += uv_stride; /*8;*/
+ }
+ }
+ break;
+ case V_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memcpy(upred_ptr, uabove_row, 8);
+ vpx_memcpy(vpred_ptr, vabove_row, 8);
+ upred_ptr += uv_stride; /*8;*/
+ vpred_ptr += uv_stride; /*8;*/
+ }
+
+ }
+ break;
+ case H_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memset(upred_ptr, uleft_col[i], 8);
+ vpx_memset(vpred_ptr, vleft_col[i], 8);
+ upred_ptr += uv_stride; /*8;*/
+ vpred_ptr += uv_stride; /*8;*/
+ }
+ }
+
+ break;
+ case TM_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++)
+ {
+ int predu = uleft_col[i] + uabove_row[j] - utop_left;
+ int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+
+ if (predu < 0)
+ predu = 0;
+
+ if (predu > 255)
+ predu = 255;
+
+ if (predv < 0)
+ predv = 0;
+
+ if (predv > 255)
+ predv = 255;
+
+ upred_ptr[j] = predu;
+ vpred_ptr[j] = predv;
+ }
+
+ upred_ptr += uv_stride; /*8;*/
+ vpred_ptr += uv_stride; /*8;*/
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+#else
+ (void) pbi;
+ (void) x;
+ (void) mb_row;
+ (void) mb_col;
+#endif
+}
+
+
+void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
+ MACROBLOCKD *xd,
+ int b_mode,
+ unsigned char *predictor,
+ int mb_row,
+ int mb_col,
+ int num)
+{
+#if CONFIG_MULTITHREAD
+ int i, r, c;
+
+ unsigned char *Above; /* = *(x->base_dst) + x->dst - x->dst_stride; */
+ unsigned char Left[4];
+ unsigned char top_left; /* = Above[-1]; */
+
+ BLOCKD *x = &xd->block[num];
+
+ /*Caution: For some b_mode, it needs 8 pixels (4 above + 4 above-right).*/
+ if (num < 4 && pbi->common.filter_level)
+ Above = pbi->mt_yabove_row[mb_row] + mb_col*16 + num*4 + 32;
+ else
+ Above = *(x->base_dst) + x->dst - x->dst_stride;
+
+ if (num%4==0 && pbi->common.filter_level)
+ {
+ for (i=0; i<4; i++)
+ Left[i] = pbi->mt_yleft_col[mb_row][num + i];
+ }else
+ {
+ Left[0] = (*(x->base_dst))[x->dst - 1];
+ Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
+ Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
+ Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
+ }
+
+ if ((num==4 || num==8 || num==12) && pbi->common.filter_level)
+ top_left = pbi->mt_yleft_col[mb_row][num-1];
+ else
+ top_left = Above[-1];
+
+ switch (b_mode)
+ {
+ case B_DC_PRED:
+ {
+ int expected_dc = 0;
+
+ for (i = 0; i < 4; i++)
+ {
+ expected_dc += Above[i];
+ expected_dc += Left[i];
+ }
+
+ expected_dc = (expected_dc + 4) >> 3;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ predictor[c] = expected_dc;
+ }
+
+ predictor += 16;
+ }
+ }
+ break;
+ case B_TM_PRED:
+ {
+ /* prediction similar to true_motion prediction */
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int pred = Above[c] - top_left + Left[r];
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ predictor[c] = pred;
+ }
+
+ predictor += 16;
+ }
+ }
+ break;
+
+ case B_VE_PRED:
+ {
+
+ unsigned int ap[4];
+ ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2;
+ ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2;
+ ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2;
+ ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+
+ predictor[c] = ap[c];
+ }
+
+ predictor += 16;
+ }
+
+ }
+ break;
+
+
+ case B_HE_PRED:
+ {
+
+ unsigned int lp[4];
+ lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2;
+ lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2;
+ lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2;
+ lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ predictor[c] = lp[r];
+ }
+
+ predictor += 16;
+ }
+ }
+ break;
+ case B_LD_PRED:
+ {
+ unsigned char *ptr = Above;
+ predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
+ predictor[0 * 16 + 1] =
+ predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
+ predictor[0 * 16 + 2] =
+ predictor[1 * 16 + 1] =
+ predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
+ predictor[0 * 16 + 3] =
+ predictor[1 * 16 + 2] =
+ predictor[2 * 16 + 1] =
+ predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
+ predictor[1 * 16 + 3] =
+ predictor[2 * 16 + 2] =
+ predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
+ predictor[2 * 16 + 3] =
+ predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
+ predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
+
+ }
+ break;
+ case B_RD_PRED:
+ {
+
+ unsigned char pp[9];
+
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+ predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[3 * 16 + 1] =
+ predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[3 * 16 + 2] =
+ predictor[2 * 16 + 1] =
+ predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[3 * 16 + 3] =
+ predictor[2 * 16 + 2] =
+ predictor[1 * 16 + 1] =
+ predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * 16 + 3] =
+ predictor[1 * 16 + 2] =
+ predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[1 * 16 + 3] =
+ predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+
+ }
+ break;
+ case B_VR_PRED:
+ {
+
+ unsigned char pp[9];
+
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+
+ predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[3 * 16 + 1] =
+ predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * 16 + 1] =
+ predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
+ predictor[3 * 16 + 2] =
+ predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[2 * 16 + 2] =
+ predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
+ predictor[3 * 16 + 3] =
+ predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[2 * 16 + 3] =
+ predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
+ predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+ predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
+
+ }
+ break;
+ case B_VL_PRED:
+ {
+
+ unsigned char *pp = Above;
+
+ predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[2 * 16 + 0] =
+ predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[1 * 16 + 1] =
+ predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * 16 + 1] =
+ predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[3 * 16 + 1] =
+ predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[0 * 16 + 3] =
+ predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
+ predictor[1 * 16 + 3] =
+ predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ }
+ break;
+
+ case B_HD_PRED:
+ {
+ unsigned char pp[9];
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+
+ predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[2 * 16 + 0] =
+ predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[2 * 16 + 1] =
+ predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * 16 + 2] =
+ predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[2 * 16 + 3] =
+ predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[1 * 16 + 2] =
+ predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
+ predictor[1 * 16 + 3] =
+ predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ }
+ break;
+
+
+ case B_HU_PRED:
+ {
+ unsigned char *pp = Left;
+ predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[0 * 16 + 2] =
+ predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[0 * 16 + 3] =
+ predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[1 * 16 + 2] =
+ predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[1 * 16 + 3] =
+ predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * 16 + 2] =
+ predictor[2 * 16 + 3] =
+ predictor[3 * 16 + 0] =
+ predictor[3 * 16 + 1] =
+ predictor[3 * 16 + 2] =
+ predictor[3 * 16 + 3] = pp[3];
+ }
+ break;
+
+
+ }
+#else
+ (void) pbi;
+ (void) xd;
+ (void) b_mode;
+ (void) predictor;
+ (void) mb_row;
+ (void) mb_col;
+ (void) num;
+#endif
+}
+
+/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
+ * to the right prediction have filled in pixels to use.
+ */
+void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
+{
+#if CONFIG_MULTITHREAD
+ unsigned char *above_right; /* = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; */
+ unsigned int *src_ptr;
+ unsigned int *dst_ptr0;
+ unsigned int *dst_ptr1;
+ unsigned int *dst_ptr2;
+
+ if (pbi->common.filter_level)
+ above_right = pbi->mt_yabove_row[mb_row] + mb_col*16 + 32 +16;
+ else
+ above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16;
+
+ src_ptr = (unsigned int *)above_right;
+ /*dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride);
+ dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride);
+ dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride);*/
+ dst_ptr0 = (unsigned int *)(*(x->block[0].base_dst) + x->block[0].dst + 16 + 3 * x->block[0].dst_stride);
+ dst_ptr1 = (unsigned int *)(*(x->block[0].base_dst) + x->block[0].dst + 16 + 7 * x->block[0].dst_stride);
+ dst_ptr2 = (unsigned int *)(*(x->block[0].base_dst) + x->block[0].dst + 16 + 11 * x->block[0].dst_stride);
+ *dst_ptr0 = *src_ptr;
+ *dst_ptr1 = *src_ptr;
+ *dst_ptr2 = *src_ptr;
+#else
+ (void) pbi;
+ (void) x;
+ (void) mb_row;
+ (void) mb_col;
+#endif
+}
diff --git a/vp8/decoder/reconintra_mt.h b/vp8/decoder/reconintra_mt.h
new file mode 100644
index 000000000..d401295b2
--- /dev/null
+++ b/vp8/decoder/reconintra_mt.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA_MT_H
+#define __INC_RECONINTRA_MT_H
+
+/* reconintra functions used in multi-threaded decoder */
+#if CONFIG_MULTITHREAD
+extern void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
+extern void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
+extern void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
+extern void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
+
+extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int mb_row, int mb_col, int num);
+extern void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
+#endif
+
+#endif
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index a77552c3c..fc2fad516 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -22,18 +22,19 @@
#include "loopfilter.h"
#include "extend.h"
#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#include "reconinter.h"
+#include "reconintra_mt.h"
-#define MAX_ROWS 256
-
-extern void vp8_decode_mb_row(VP8D_COMP *pbi,
- VP8_COMMON *pc,
- int mb_row,
- MACROBLOCKD *xd);
-
+extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+extern void clamp_mvs(MACROBLOCKD *xd);
extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
-extern void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd);
-void vp8_thread_loop_filter(VP8D_COMP *pbi, MB_ROW_DEC *mbrd, int ithread);
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
+#else
+#define RTCD_VTABLE(x) NULL
+#endif
void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
{
@@ -68,6 +69,15 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+ /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
+ vpx_memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
+ /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
+ vpx_memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
+ /*unsigned char mode_ref_lf_delta_enabled;
+ unsigned char mode_ref_lf_delta_update;*/
+ mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
+ mbd->mode_ref_lf_delta_update = xd->mode_ref_lf_delta_update;
+
mbd->current_bc = &pbi->bc2;
for (j = 0; j < 25; j++)
@@ -77,7 +87,7 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
}
for (i=0; i< pc->mb_rows; i++)
- pbi->current_mb_col[i]=-1;
+ pbi->mt_current_mb_col[i]=-1;
#else
(void) pbi;
(void) xd;
@@ -86,69 +96,141 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
#endif
}
-void vp8_setup_loop_filter_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
+
+void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
{
#if CONFIG_MULTITHREAD
- VP8_COMMON *const pc = & pbi->common;
- int i, j;
+ int eobtotal = 0;
+ int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
+ VP8_COMMON *pc = &pbi->common;
- for (i = 0; i < count; i++)
+ if (xd->mode_info_context->mbmi.mb_skip_coeff)
{
- MACROBLOCKD *mbd = &mbrd[i].mbd;
-//#if CONFIG_RUNTIME_CPU_DETECT
-// mbd->rtcd = xd->rtcd;
-//#endif
+ vp8_reset_mb_tokens_context(xd);
+ }
+ else
+ {
+ eobtotal = vp8_decode_mb_tokens(pbi, xd);
+ }
- //mbd->subpixel_predict = xd->subpixel_predict;
- //mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
- //mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
- //mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+ /* Perform temporary clamping of the MV to be used for prediction */
+ if (do_clamp)
+ {
+ clamp_mvs(xd);
+ }
- mbd->mode_info_context = pc->mi + pc->mode_info_stride * (i + 1);
- mbd->mode_info_stride = pc->mode_info_stride;
+ xd->mode_info_context->mbmi.dc_diff = 1;
- //mbd->frame_type = pc->frame_type;
- //mbd->frames_since_golden = pc->frames_since_golden;
- //mbd->frames_till_alt_ref_frame = pc->frames_till_alt_ref_frame;
+ if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
+ {
+ xd->mode_info_context->mbmi.dc_diff = 0;
+
+ /*mt_skip_recon_mb(pbi, xd, mb_row, mb_col);*/
+ if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col);
+ vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col);
+ }
+ else
+ {
+ vp8_build_inter_predictors_mb_s(xd);
+ }
+ return;
+ }
- //mbd->pre = pc->yv12_fb[pc->lst_fb_idx];
- //mbd->dst = pc->yv12_fb[pc->new_fb_idx];
+ if (xd->segmentation_enabled)
+ mb_init_dequantizer(pbi, xd);
- //vp8_setup_block_dptrs(mbd);
- //vp8_build_block_doffsets(mbd);
- mbd->segmentation_enabled = xd->segmentation_enabled; //
- mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta; //
- vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); //
+ /* do prediction */
+ if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8mt_build_intra_predictors_mbuv(pbi, xd, mb_row, mb_col);
- //signed char ref_lf_deltas[MAX_REF_LF_DELTAS];
- vpx_memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
- //signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];
- vpx_memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
- //unsigned char mode_ref_lf_delta_enabled;
- //unsigned char mode_ref_lf_delta_update;
- mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
- mbd->mode_ref_lf_delta_update = xd->mode_ref_lf_delta_update;
+ if (xd->mode_info_context->mbmi.mode != B_PRED)
+ {
+ vp8mt_build_intra_predictors_mby(pbi, xd, mb_row, mb_col);
+ } else {
+ vp8mt_intra_prediction_down_copy(pbi, xd, mb_row, mb_col);
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_mb(xd);
+ }
- //mbd->mbmi.mode = DC_PRED;
- //mbd->mbmi.uv_mode = DC_PRED;
- //mbd->current_bc = &pbi->bc2;
+ /* dequantization and idct */
+ if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ BLOCKD *b = &xd->block[24];
+ DEQUANT_INVOKE(&pbi->dequant, block)(b);
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+ ((int *)b->qcoeff)[0] = 0;
+ }
- //for (j = 0; j < 25; j++)
- //{
- // mbd->block[j].dequant = xd->block[j].dequant;
- //}
+ DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
+ (xd->qcoeff, xd->block[0].dequant,
+ xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs, xd->block[24].diff);
}
+ else if ((xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+ vp8mt_predict_intra4x4(pbi, xd, b->bmi.mode, b->predictor, mb_row, mb_col, i);
- for (i=0; i< pc->mb_rows; i++)
- pbi->current_mb_col[i]=-1;
+ if (xd->eobs[i] > 1)
+ {
+ DEQUANT_INVOKE(&pbi->dequant, idct_add)
+ (b->qcoeff, b->dequant, b->predictor,
+ *(b->base_dst) + b->dst, 16, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+ (b->qcoeff[0] * b->dequant[0], b->predictor,
+ *(b->base_dst) + b->dst, 16, b->dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+ }
+ else
+ {
+ DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
+ (xd->qcoeff, xd->block[0].dequant,
+ xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+ }
+
+ DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
+ (xd->qcoeff+16*16, xd->block[16].dequant,
+ xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
#else
(void) pbi;
(void) xd;
- (void) mbrd;
- (void) count;
+ (void) mb_row;
+ (void) mb_col;
#endif
}
+
THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
{
#if CONFIG_MULTITHREAD
@@ -159,12 +241,10 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
while (1)
{
- int current_filter_level = 0;
-
if (pbi->b_multithreaded_rd == 0)
break;
- //if(WaitForSingleObject(pbi->h_event_start_decoding[ithread], INFINITE) == WAIT_OBJECT_0)
+ /*if(WaitForSingleObject(pbi->h_event_start_decoding[ithread], INFINITE) == WAIT_OBJECT_0)*/
if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0)
{
if (pbi->b_multithreaded_rd == 0)
@@ -177,6 +257,7 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
int mb_row;
int num_part = 1 << pbi->common.multi_token_partition;
volatile int *last_row_current_mb_col;
+ int nsync = pbi->sync_range;
for (mb_row = ithread+1; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
{
@@ -188,14 +269,19 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
+ int filter_level;
+ loop_filter_info *lfi = pc->lf_info;
+ int alt_flt_enabled = xd->segmentation_enabled;
+ int Segment;
+
pbi->mb_row_di[ithread].mb_row = mb_row;
pbi->mb_row_di[ithread].mbd.current_bc = &pbi->mbc[mb_row%num_part];
- last_row_current_mb_col = &pbi->current_mb_col[mb_row -1];
+ last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
recon_yoffset = mb_row * recon_y_stride * 16;
recon_uvoffset = mb_row * recon_uv_stride * 8;
- // reset above block coeffs
+ /* reset above block coeffs */
xd->above_context = pc->above_context;
xd->left_context = &mb_row_left_context;
@@ -207,9 +293,9 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
{
- if ((mb_col & 7) == 0)
+ if ((mb_col & (nsync-1)) == 0)
{
- while (mb_col > (*last_row_current_mb_col - 8) && *last_row_current_mb_col != pc->mb_cols - 1)
+ while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != pc->mb_cols - 1)
{
x86_pause_hint();
thread_sleep(0);
@@ -225,8 +311,21 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
}
}
- // Distance of Mb to the various image edges.
- // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ if(pbi->common.filter_level)
+ {
+ /*update loopfilter info*/
+ Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+ filter_level = pbi->mt_baseline_filter_level[Segment];
+ /* Distance of Mb to the various image edges.
+ * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ * Apply any context driven MB level adjustment
+ */
+ vp8_adjust_mb_lf_value(xd, &filter_level);
+ }
+
+ /* Distance of Mb to the various image edges.
+ * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ */
xd->mb_to_left_edge = -((mb_col * 16) << 3);
xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
@@ -236,7 +335,7 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
xd->left_available = (mb_col != 0);
- // Select the appropriate reference frame for this MB
+ /* Select the appropriate reference frame for this MB */
if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
ref_fb_idx = pc->lst_fb_idx;
else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
@@ -249,8 +348,52 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
vp8_build_uvmvs(xd, pc->full_pixel);
+ vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
- vp8_decode_macroblock(pbi, xd);
+ if (pbi->common.filter_level)
+ {
+ if( mb_row != pc->mb_rows-1 )
+ {
+ /* Save decoded MB last row data for next-row decoding */
+ vpx_memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+ vpx_memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+ vpx_memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+ }
+
+ /* save left_col for next MB decoding */
+ if(mb_col != pc->mb_cols-1)
+ {
+ MODE_INFO *next = xd->mode_info_context +1;
+
+ if (xd->frame_type == KEY_FRAME || next->mbmi.ref_frame == INTRA_FRAME)
+ {
+ for (i = 0; i < 16; i++)
+ pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
+ for (i = 0; i < 8; i++)
+ {
+ pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
+ pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
+ }
+ }
+ }
+
+ /* loopfilter on this macroblock. */
+ if (filter_level)
+ {
+ if (mb_col > 0)
+ pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+
+ if (xd->mode_info_context->mbmi.dc_diff > 0)
+ pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+
+ if (xd->mode_info_context->mbmi.dc_diff > 0)
+ pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+ }
+ }
recon_yoffset += 16;
recon_uvoffset += 8;
@@ -259,41 +402,41 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
xd->above_context++;
- //pbi->mb_row_di[ithread].current_mb_col = mb_col;
- pbi->current_mb_col[mb_row] = mb_col;
+ /*pbi->mb_row_di[ithread].current_mb_col = mb_col;*/
+ pbi->mt_current_mb_col[mb_row] = mb_col;
}
- // adjust to the next row of mbs
- vp8_extend_mb_row(
- &pc->yv12_fb[dst_fb_idx],
- xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
- );
+ /* adjust to the next row of mbs */
+ if (pbi->common.filter_level)
+ {
+ if(mb_row != pc->mb_rows-1)
+ {
+ int lasty = pc->yv12_fb[ref_fb_idx].y_width + VP8BORDERINPIXELS;
+ int lastuv = (pc->yv12_fb[ref_fb_idx].y_width>>1) + (VP8BORDERINPIXELS>>1);
+
+ for (i = 0; i < 4; i++)
+ {
+ pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
+ pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
+ pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
+ }
+ }
+ } else
+ vp8_extend_mb_row(&pc->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
++xd->mode_info_context; /* skip prediction column */
- // since we have multithread
+ /* since we have multithread */
xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
-
- pbi->last_mb_row_decoded = mb_row;
-
}
}
}
-
- // If |pbi->common.filter_level| is 0 the value can change in-between
- // the sem_post and the check to call vp8_thread_loop_filter.
- current_filter_level = pbi->common.filter_level;
-
- // add this to each frame
+ /* add this to each frame */
if ((mbrd->mb_row == pbi->common.mb_rows-1) || ((mbrd->mb_row == pbi->common.mb_rows-2) && (pbi->common.mb_rows % (pbi->decoding_thread_count+1))==1))
{
- //SetEvent(pbi->h_event_end_decoding);
+ /*SetEvent(pbi->h_event_end_decoding);*/
sem_post(&pbi->h_event_end_decoding);
}
-
- if ((pbi->b_multithreaded_lf) && (current_filter_level))
- vp8_thread_loop_filter(pbi, mbrd, ithread);
-
}
#else
(void) p_data;
@@ -303,180 +446,227 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
}
-void vp8_thread_loop_filter(VP8D_COMP *pbi, MB_ROW_DEC *mbrd, int ithread)
+void vp8_decoder_create_threads(VP8D_COMP *pbi)
{
#if CONFIG_MULTITHREAD
+ int core_count = 0;
+ int ithread;
+ int i;
- if (sem_wait(&pbi->h_event_start_lpf[ithread]) == 0)
- {
- // if (pbi->b_multithreaded_lf == 0) // we're shutting down ????
- // break;
- // else
- {
- VP8_COMMON *cm = &pbi->common;
- MACROBLOCKD *mbd = &mbrd->mbd;
- int default_filt_lvl = pbi->common.filter_level;
-
- YV12_BUFFER_CONFIG *post = cm->frame_to_show;
- loop_filter_info *lfi = cm->lf_info;
- //int frame_type = cm->frame_type;
-
- int mb_row;
- int mb_col;
-
- int filter_level;
- int alt_flt_enabled = mbd->segmentation_enabled;
-
- int i;
- unsigned char *y_ptr, *u_ptr, *v_ptr;
+ pbi->b_multithreaded_rd = 0;
+ pbi->allocated_decoding_thread_count = 0;
+ core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads;
- volatile int *last_row_current_mb_col;
+ if (core_count > 1)
+ {
+ pbi->b_multithreaded_rd = 1;
+ pbi->decoding_thread_count = core_count -1;
- // Set up the buffer pointers
- y_ptr = post->y_buffer + post->y_stride * 16 * (ithread +1);
- u_ptr = post->u_buffer + post->uv_stride * 8 * (ithread +1);
- v_ptr = post->v_buffer + post->uv_stride * 8 * (ithread +1);
+ CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
+ CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
+ CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
+ vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
+ CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));
- // vp8_filter each macro block
- for (mb_row = ithread+1; mb_row < cm->mb_rows; mb_row+= (pbi->decoding_thread_count + 1))
- {
- last_row_current_mb_col = &pbi->current_mb_col[mb_row -1];
+ for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
+ {
+ sem_init(&pbi->h_event_start_decoding[ithread], 0, 0);
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
- {
- int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+ pbi->de_thread_data[ithread].ithread = ithread;
+ pbi->de_thread_data[ithread].ptr1 = (void *)pbi;
+ pbi->de_thread_data[ithread].ptr2 = (void *) &pbi->mb_row_di[ithread];
- if ((mb_col & 7) == 0)
- {
- while (mb_col > (*last_row_current_mb_col-8) && *last_row_current_mb_col != cm->mb_cols - 1)
- {
- x86_pause_hint();
- thread_sleep(0);
- }
- }
+ pthread_create(&pbi->h_decoding_thread[ithread], 0, vp8_thread_decoding_proc, (&pbi->de_thread_data[ithread]));
+ }
- filter_level = pbi->mt_baseline_filter_level[Segment];
+ sem_init(&pbi->h_event_end_decoding, 0, 0);
- // Apply any context driven MB level adjustment
- vp8_adjust_mb_lf_value(mbd, &filter_level);
+ pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
+ }
- if (filter_level)
- {
- if (mb_col > 0)
- cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+#else
+ (void) pbi;
+#endif
+}
- if (mbd->mode_info_context->mbmi.dc_diff > 0)
- cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
- // don't apply across umv border
- if (mb_row > 0)
- cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
+{
+#if CONFIG_MULTITHREAD
+ VP8_COMMON *const pc = & pbi->common;
+ int i;
- if (mbd->mode_info_context->mbmi.dc_diff > 0)
- cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
- }
+ if (pbi->b_multithreaded_rd)
+ {
+ if (pbi->mt_current_mb_col)
+ {
+ vpx_free(pbi->mt_current_mb_col);
+ pbi->mt_current_mb_col = NULL ;
+ }
- y_ptr += 16;
- u_ptr += 8;
- v_ptr += 8;
+ /* Free above_row buffers. */
+ if (pbi->mt_yabove_row)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ if (pbi->mt_yabove_row[i])
+ {
+ vpx_free(pbi->mt_yabove_row[i]);
+ pbi->mt_yabove_row[i] = NULL ;
+ }
+ }
+ vpx_free(pbi->mt_yabove_row);
+ pbi->mt_yabove_row = NULL ;
+ }
- mbd->mode_info_context++; // step to next MB
- pbi->current_mb_col[mb_row] = mb_col;
- }
+ if (pbi->mt_uabove_row)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ if (pbi->mt_uabove_row[i])
+ {
+ vpx_free(pbi->mt_uabove_row[i]);
+ pbi->mt_uabove_row[i] = NULL ;
+ }
+ }
+ vpx_free(pbi->mt_uabove_row);
+ pbi->mt_uabove_row = NULL ;
+ }
- mbd->mode_info_context++; // Skip border mb
+ if (pbi->mt_vabove_row)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ if (pbi->mt_vabove_row[i])
+ {
+ vpx_free(pbi->mt_vabove_row[i]);
+ pbi->mt_vabove_row[i] = NULL ;
+ }
+ }
+ vpx_free(pbi->mt_vabove_row);
+ pbi->mt_vabove_row = NULL ;
+ }
- y_ptr += post->y_stride * 16 * (pbi->decoding_thread_count + 1) - post->y_width;
- u_ptr += post->uv_stride * 8 * (pbi->decoding_thread_count + 1) - post->uv_width;
- v_ptr += post->uv_stride * 8 * (pbi->decoding_thread_count + 1) - post->uv_width;
+ /* Free left_col buffers. */
+ if (pbi->mt_yleft_col)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ if (pbi->mt_yleft_col[i])
+ {
+ vpx_free(pbi->mt_yleft_col[i]);
+ pbi->mt_yleft_col[i] = NULL ;
+ }
+ }
+ vpx_free(pbi->mt_yleft_col);
+ pbi->mt_yleft_col = NULL ;
+ }
- mbd->mode_info_context += pbi->decoding_thread_count * mbd->mode_info_stride; // Skip border mb
+ if (pbi->mt_uleft_col)
+ {
+ for (i=0; i< mb_rows; i++)
+ {
+ if (pbi->mt_uleft_col[i])
+ {
+ vpx_free(pbi->mt_uleft_col[i]);
+ pbi->mt_uleft_col[i] = NULL ;
}
}
+ vpx_free(pbi->mt_uleft_col);
+ pbi->mt_uleft_col = NULL ;
}
- // add this to each frame
- if ((mbrd->mb_row == pbi->common.mb_rows-1) || ((mbrd->mb_row == pbi->common.mb_rows-2) && (pbi->common.mb_rows % (pbi->decoding_thread_count+1))==1))
+ if (pbi->mt_vleft_col)
{
- sem_post(&pbi->h_event_end_lpf);
+ for (i=0; i< mb_rows; i++)
+ {
+ if (pbi->mt_vleft_col[i])
+ {
+ vpx_free(pbi->mt_vleft_col[i]);
+ pbi->mt_vleft_col[i] = NULL ;
+ }
+ }
+ vpx_free(pbi->mt_vleft_col);
+ pbi->mt_vleft_col = NULL ;
}
+ }
#else
(void) pbi;
#endif
}
-void vp8_decoder_create_threads(VP8D_COMP *pbi)
+
+int vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
{
#if CONFIG_MULTITHREAD
- int core_count = 0;
- int ithread;
-
- pbi->b_multithreaded_rd = 0;
- pbi->b_multithreaded_lf = 0;
- pbi->allocated_decoding_thread_count = 0;
- core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads;
+ VP8_COMMON *const pc = & pbi->common;
+ int i;
+ int uv_width;
- if (core_count > 1)
+ if (pbi->b_multithreaded_rd)
{
- pbi->b_multithreaded_rd = 1;
- pbi->b_multithreaded_lf = 1; // this can be merged with pbi->b_multithreaded_rd ?
- pbi->decoding_thread_count = core_count -1;
+ vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
- CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
- CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
- CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
- vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
- CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));
+ /* our internal buffers are always multiples of 16 */
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
- CHECK_MEM_ERROR(pbi->current_mb_col, vpx_malloc(sizeof(int) * MAX_ROWS)); // pc->mb_rows));
- CHECK_MEM_ERROR(pbi->h_event_start_lpf, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
+ if (width < 640) pbi->sync_range = 1;
+ else if (width <= 1280) pbi->sync_range = 8;
+ else if (width <= 2560) pbi->sync_range =16;
+ else pbi->sync_range = 32;
- for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
- {
- sem_init(&pbi->h_event_start_decoding[ithread], 0, 0);
- sem_init(&pbi->h_event_start_lpf[ithread], 0, 0);
+ uv_width = width >>1;
- pbi->de_thread_data[ithread].ithread = ithread;
- pbi->de_thread_data[ithread].ptr1 = (void *)pbi;
- pbi->de_thread_data[ithread].ptr2 = (void *) &pbi->mb_row_di[ithread];
+ /* Allocate an int for each mb row. */
+ CHECK_MEM_ERROR(pbi->mt_current_mb_col, vpx_malloc(sizeof(int) * pc->mb_rows));
- pthread_create(&pbi->h_decoding_thread[ithread], 0, vp8_thread_decoding_proc, (&pbi->de_thread_data[ithread]));
- }
+ /* Allocate memory for above_row buffers. */
+ CHECK_MEM_ERROR(pbi->mt_yabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+ for (i=0; i< pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_calloc(sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1)), 1));
- sem_init(&pbi->h_event_end_decoding, 0, 0);
- sem_init(&pbi->h_event_end_lpf, 0, 0);
+ CHECK_MEM_ERROR(pbi->mt_uabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+ for (i=0; i< pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_calloc(sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS), 1));
- pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
- }
+ CHECK_MEM_ERROR(pbi->mt_vabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+ for (i=0; i< pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_calloc(sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS), 1));
+
+ /* Allocate memory for left_col buffers. */
+ CHECK_MEM_ERROR(pbi->mt_yleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+ for (i=0; i< pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1));
+ CHECK_MEM_ERROR(pbi->mt_uleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+ for (i=0; i< pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
+
+ CHECK_MEM_ERROR(pbi->mt_vleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
+ for (i=0; i< pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
+ }
+ return 0;
#else
(void) pbi;
+ (void) width;
#endif
}
+
void vp8_decoder_remove_threads(VP8D_COMP *pbi)
{
#if CONFIG_MULTITHREAD
- if (pbi->b_multithreaded_lf)
- {
- int i;
- pbi->b_multithreaded_lf = 0;
-
- for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
- sem_destroy(&pbi->h_event_start_lpf[i]);
-
- sem_destroy(&pbi->h_event_end_lpf);
- }
-
- //shutdown MB Decoding thread;
+ /* shutdown MB Decoding thread; */
if (pbi->b_multithreaded_rd)
{
int i;
pbi->b_multithreaded_rd = 0;
- // allow all threads to exit
+ /* allow all threads to exit */
for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
{
sem_post(&pbi->h_event_start_decoding[i]);
@@ -502,12 +692,6 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
pbi->h_event_start_decoding = NULL;
}
- if (pbi->h_event_start_lpf)
- {
- vpx_free(pbi->h_event_start_lpf);
- pbi->h_event_start_lpf = NULL;
- }
-
if (pbi->mb_row_di)
{
vpx_free(pbi->mb_row_di);
@@ -519,12 +703,6 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
vpx_free(pbi->de_thread_data);
pbi->de_thread_data = NULL;
}
-
- if (pbi->current_mb_col)
- {
- vpx_free(pbi->current_mb_col);
- pbi->current_mb_col = NULL ;
- }
}
#else
(void) pbi;
@@ -532,42 +710,59 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
}
-void vp8_start_lfthread(VP8D_COMP *pbi)
+void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
{
#if CONFIG_MULTITHREAD
- /*
- memcpy(&pbi->lpfmb, &pbi->mb, sizeof(pbi->mb));
- pbi->last_mb_row_decoded = 0;
- sem_post(&pbi->h_event_start_lpf);
- */
- (void) pbi;
-#else
- (void) pbi;
-#endif
-}
+ VP8_COMMON *cm = &pbi->common;
+ MACROBLOCKD *mbd = &pbi->mb;
+ /*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/ /*frame_to_show;*/
+ loop_filter_info *lfi = cm->lf_info;
+ FRAME_TYPE frame_type = cm->frame_type;
-void vp8_stop_lfthread(VP8D_COMP *pbi)
-{
-#if CONFIG_MULTITHREAD
- /*
- struct vpx_usec_timer timer;
+ /*int mb_row;
+ int mb_col;
+ int baseline_filter_level[MAX_MB_SEGMENTS];*/
+ int filter_level;
+ int alt_flt_enabled = mbd->segmentation_enabled;
- vpx_usec_timer_start(&timer);
+ int i;
+ /*unsigned char *y_ptr, *u_ptr, *v_ptr;*/
- sem_wait(&pbi->h_event_end_lpf);
+ /* Note the baseline filter values for each segment */
+ if (alt_flt_enabled)
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ {
+ /* Abs value */
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ pbi->mt_baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ /* Delta Value */
+ else
+ {
+ pbi->mt_baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ pbi->mt_baseline_filter_level[i] = (pbi->mt_baseline_filter_level[i] >= 0) ? ((pbi->mt_baseline_filter_level[i] <= MAX_LOOP_FILTER) ? pbi->mt_baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
+ }
+ }
+ }
+ else
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ pbi->mt_baseline_filter_level[i] = default_filt_lvl;
+ }
- vpx_usec_timer_mark(&timer);
- pbi->time_loop_filtering += vpx_usec_timer_elapsed(&timer);
- */
- (void) pbi;
+ /* Initialize the loop filter for this frame. */
+ if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
+ vp8_init_loop_filter(cm);
+ else if (frame_type != cm->last_frame_type)
+ vp8_frame_init_loop_filter(lfi, frame_type);
#else
(void) pbi;
+ (void) default_filt_lvl;
#endif
}
-void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
- MACROBLOCKD *xd)
+void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
{
#if CONFIG_MULTITHREAD
int mb_row;
@@ -575,8 +770,38 @@ void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
int ibc = 0;
int num_part = 1 << pbi->common.multi_token_partition;
- int i;
+ int i, j;
volatile int *last_row_current_mb_col = NULL;
+ int nsync = pbi->sync_range;
+
+ int filter_level;
+ loop_filter_info *lfi = pc->lf_info;
+ int alt_flt_enabled = xd->segmentation_enabled;
+ int Segment;
+
+ if(pbi->common.filter_level)
+ {
+ /* Set above_row buffer to 127 for decoding first MB row */
+ vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, pc->yv12_fb[pc->lst_fb_idx].y_width + 5);
+ vpx_memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (pc->yv12_fb[pc->lst_fb_idx].y_width>>1) +5);
+ vpx_memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (pc->yv12_fb[pc->lst_fb_idx].y_width>>1) +5);
+
+ for (i=1; i<pc->mb_rows; i++)
+ {
+ vpx_memset(pbi->mt_yabove_row[i] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
+ vpx_memset(pbi->mt_uabove_row[i] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ vpx_memset(pbi->mt_vabove_row[i] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ }
+
+ /* Set left_col to 129 initially */
+ for (i=0; i<pc->mb_rows; i++)
+ {
+ vpx_memset(pbi->mt_yleft_col[i], (unsigned char)129, 16);
+ vpx_memset(pbi->mt_uleft_col[i], (unsigned char)129, 8);
+ vpx_memset(pbi->mt_vleft_col[i], (unsigned char)129, 8);
+ }
+ vp8mt_lpf_init(pbi, pc->filter_level);
+ }
vp8_setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
@@ -589,7 +814,7 @@ void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
xd->current_bc = &pbi->mbc[mb_row%num_part];
- //vp8_decode_mb_row(pbi, pc, mb_row, xd);
+ /* vp8_decode_mb_row(pbi, pc, mb_row, xd); */
{
int i;
int recon_yoffset, recon_uvoffset;
@@ -599,14 +824,14 @@ void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
- // volatile int *last_row_current_mb_col = NULL;
+ /* volatile int *last_row_current_mb_col = NULL; */
if (mb_row > 0)
- last_row_current_mb_col = &pbi->current_mb_col[mb_row -1];
+ last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
recon_yoffset = mb_row * recon_y_stride * 16;
recon_uvoffset = mb_row * recon_uv_stride * 8;
- // reset above block coeffs
+ /* reset above block coeffs */
xd->above_context = pc->above_context;
xd->up_available = (mb_row != 0);
@@ -616,8 +841,8 @@ void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
{
- if ( mb_row > 0 && (mb_col & 7) == 0){
- while (mb_col > (*last_row_current_mb_col - 8) && *last_row_current_mb_col != pc->mb_cols - 1)
+ if ( mb_row > 0 && (mb_col & (nsync-1)) == 0){
+ while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != pc->mb_cols - 1)
{
x86_pause_hint();
thread_sleep(0);
@@ -633,8 +858,21 @@ void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
}
}
- // Distance of Mb to the various image edges.
- // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ if(pbi->common.filter_level)
+ {
+ /* update loopfilter info */
+ Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+ filter_level = pbi->mt_baseline_filter_level[Segment];
+ /* Distance of Mb to the various image edges.
+ * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ * Apply any context driven MB level adjustment
+ */
+ vp8_adjust_mb_lf_value(xd, &filter_level);
+ }
+
+ /* Distance of Mb to the various image edges.
+ * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ */
xd->mb_to_left_edge = -((mb_col * 16) << 3);
xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
@@ -644,7 +882,7 @@ void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
xd->left_available = (mb_col != 0);
- // Select the appropriate reference frame for this MB
+ /* Select the appropriate reference frame for this MB */
if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
ref_fb_idx = pc->lst_fb_idx;
else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
@@ -657,164 +895,89 @@ void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
vp8_build_uvmvs(xd, pc->full_pixel);
+ vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
- vp8_decode_macroblock(pbi, xd);
-
- recon_yoffset += 16;
- recon_uvoffset += 8;
-
- ++xd->mode_info_context; /* next mb */
-
- xd->above_context++;
-
- //pbi->current_mb_col_main = mb_col;
- pbi->current_mb_col[mb_row] = mb_col;
- }
-
- // adjust to the next row of mbs
- vp8_extend_mb_row(
- &pc->yv12_fb[dst_fb_idx],
- xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
- );
-
- ++xd->mode_info_context; /* skip prediction column */
-
- pbi->last_mb_row_decoded = mb_row;
- }
- xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
- }
-
- sem_wait(&pbi->h_event_end_decoding); // add back for each frame
-#else
- (void) pbi;
- (void) xd;
-#endif
-}
-
-
-void vp8_mt_loop_filter_frame( VP8D_COMP *pbi)
-{
-#if CONFIG_MULTITHREAD
- VP8_COMMON *cm = &pbi->common;
- MACROBLOCKD *mbd = &pbi->mb;
- int default_filt_lvl = pbi->common.filter_level;
-
- YV12_BUFFER_CONFIG *post = cm->frame_to_show;
- loop_filter_info *lfi = cm->lf_info;
- int frame_type = cm->frame_type;
-
- int mb_row;
- int mb_col;
-
- int filter_level;
- int alt_flt_enabled = mbd->segmentation_enabled;
-
- int i;
- unsigned char *y_ptr, *u_ptr, *v_ptr;
+ if (pbi->common.filter_level)
+ {
+ /* Save decoded MB last row data for next-row decoding */
+ if(mb_row != pc->mb_rows-1)
+ {
+ vpx_memcpy((pbi->mt_yabove_row[mb_row +1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+ vpx_memcpy((pbi->mt_uabove_row[mb_row +1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+ vpx_memcpy((pbi->mt_vabove_row[mb_row +1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+ }
- volatile int *last_row_current_mb_col=NULL;
+ /* save left_col for next MB decoding */
+ if(mb_col != pc->mb_cols-1)
+ {
+ MODE_INFO *next = xd->mode_info_context +1;
- vp8_setup_loop_filter_thread_data(pbi, mbd, pbi->mb_row_di, pbi->decoding_thread_count);
+ if (xd->frame_type == KEY_FRAME || next->mbmi.ref_frame == INTRA_FRAME)
+ {
+ for (i = 0; i < 16; i++)
+ pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
+ for (i = 0; i < 8; i++)
+ {
+ pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
+ pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
+ }
+ }
+ }
- mbd->mode_info_context = cm->mi; // Point at base of Mb MODE_INFO list
+ /* loopfilter on this macroblock. */
+ if (filter_level)
+ {
+ if (mb_col > 0)
+ pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
- // Note the baseline filter values for each segment
- if (alt_flt_enabled)
- {
- for (i = 0; i < MAX_MB_SEGMENTS; i++)
- {
- // Abs value
- if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
- pbi->mt_baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
- // Delta Value
- else
- {
- pbi->mt_baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
- pbi->mt_baseline_filter_level[i] = (pbi->mt_baseline_filter_level[i] >= 0) ? ((pbi->mt_baseline_filter_level[i] <= MAX_LOOP_FILTER) ? pbi->mt_baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range
- }
- }
- }
- else
- {
- for (i = 0; i < MAX_MB_SEGMENTS; i++)
- pbi->mt_baseline_filter_level[i] = default_filt_lvl;
- }
+ if (xd->mode_info_context->mbmi.dc_diff > 0)
+ pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
- // Initialize the loop filter for this frame.
- if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
- vp8_init_loop_filter(cm);
- else if (frame_type != cm->last_frame_type)
- vp8_frame_init_loop_filter(lfi, frame_type);
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
- for (i = 0; i < pbi->decoding_thread_count; i++)
- sem_post(&pbi->h_event_start_lpf[i]);
- // sem_post(&pbi->h_event_start_lpf);
+ if (xd->mode_info_context->mbmi.dc_diff > 0)
+ pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf);
+ }
+ }
- // Set up the buffer pointers
- y_ptr = post->y_buffer;
- u_ptr = post->u_buffer;
- v_ptr = post->v_buffer;
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
- // vp8_filter each macro block
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row+= (pbi->decoding_thread_count + 1))
- {
- if (mb_row > 0)
- last_row_current_mb_col = &pbi->current_mb_col[mb_row -1];
+ ++xd->mode_info_context; /* next mb */
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
- {
- int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+ xd->above_context++;
- if ( mb_row > 0 && (mb_col & 7) == 0){
- // if ( mb_row > 0 ){
- while (mb_col > (*last_row_current_mb_col-8) && *last_row_current_mb_col != cm->mb_cols - 1)
- {
- x86_pause_hint();
- thread_sleep(0);
- }
+ pbi->mt_current_mb_col[mb_row] = mb_col;
}
- filter_level = pbi->mt_baseline_filter_level[Segment];
-
- // Distance of Mb to the various image edges.
- // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
- // Apply any context driven MB level adjustment
- vp8_adjust_mb_lf_value(mbd, &filter_level);
-
- if (filter_level)
+ /* adjust to the next row of mbs */
+ if (pbi->common.filter_level)
{
- if (mb_col > 0)
- cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
-
- if (mbd->mode_info_context->mbmi.dc_diff > 0)
- cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
-
- // don't apply across umv border
- if (mb_row > 0)
- cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
-
- if (mbd->mode_info_context->mbmi.dc_diff > 0)
- cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
- }
+ if(mb_row != pc->mb_rows-1)
+ {
+ int lasty = pc->yv12_fb[ref_fb_idx].y_width + VP8BORDERINPIXELS;
+ int lastuv = (pc->yv12_fb[ref_fb_idx].y_width>>1) + (VP8BORDERINPIXELS>>1);
- y_ptr += 16;
- u_ptr += 8;
- v_ptr += 8;
+ for (i = 0; i < 4; i++)
+ {
+ pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
+ pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
+ pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
+ }
+ }
+ }else
+ vp8_extend_mb_row(&pc->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
- mbd->mode_info_context++; // step to next MB
- pbi->current_mb_col[mb_row] = mb_col;
+ ++xd->mode_info_context; /* skip prediction column */
}
- mbd->mode_info_context++; // Skip border mb
-
- //update for multi-thread
- y_ptr += post->y_stride * 16 * (pbi->decoding_thread_count + 1) - post->y_width;
- u_ptr += post->uv_stride * 8 * (pbi->decoding_thread_count + 1) - post->uv_width;
- v_ptr += post->uv_stride * 8 * (pbi->decoding_thread_count + 1) - post->uv_width;
- mbd->mode_info_context += pbi->decoding_thread_count * mbd->mode_info_stride;
+ xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
}
- sem_wait(&pbi->h_event_end_lpf);
+ sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
#else
(void) pbi;
+ (void) xd;
#endif
}
diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/decoder/x86/dequantize_mmx.asm
index 150d090b6..0d6133a46 100644
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/decoder/x86/dequantize_mmx.asm
@@ -98,11 +98,11 @@ sym(vp8_dequant_idct_add_mmx):
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
- pmulhw mm5, [x_s1sqr2 GLOBAL];
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
- pmulhw mm7, [x_c1sqr2less1 GLOBAL];
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
@@ -110,10 +110,10 @@ sym(vp8_dequant_idct_add_mmx):
movq mm5, mm1
movq mm4, mm3
- pmulhw mm5, [x_c1sqr2less1 GLOBAL]
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
- pmulhw mm3, [x_s1sqr2 GLOBAL]
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
@@ -153,11 +153,11 @@ sym(vp8_dequant_idct_add_mmx):
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
- pmulhw mm5, [x_s1sqr2 GLOBAL];
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
- pmulhw mm7, [x_c1sqr2less1 GLOBAL];
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
@@ -165,16 +165,16 @@ sym(vp8_dequant_idct_add_mmx):
movq mm5, mm1
movq mm4, mm3
- pmulhw mm5, [x_c1sqr2less1 GLOBAL]
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
- pmulhw mm3, [x_s1sqr2 GLOBAL]
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
- paddw mm0, [fours GLOBAL]
+ paddw mm0, [GLOBAL(fours)]
- paddw mm2, [fours GLOBAL]
+ paddw mm2, [GLOBAL(fours)]
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
@@ -288,7 +288,7 @@ sym(vp8_dequant_dc_idct_add_mmx):
psrlq mm0, 16
movzx rcx, word ptr arg(6) ;Dc
psllq mm0, 16
- movd mm7, rcx
+ movq mm7, rcx
por mm0, mm7
movsxd rax, dword ptr arg(4) ;pitch
@@ -300,11 +300,11 @@ sym(vp8_dequant_dc_idct_add_mmx):
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
- pmulhw mm5, [x_s1sqr2 GLOBAL];
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
- pmulhw mm7, [x_c1sqr2less1 GLOBAL];
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
@@ -312,10 +312,10 @@ sym(vp8_dequant_dc_idct_add_mmx):
movq mm5, mm1
movq mm4, mm3
- pmulhw mm5, [x_c1sqr2less1 GLOBAL]
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
- pmulhw mm3, [x_s1sqr2 GLOBAL]
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
@@ -355,11 +355,11 @@ sym(vp8_dequant_dc_idct_add_mmx):
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
- pmulhw mm5, [x_s1sqr2 GLOBAL];
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
- pmulhw mm7, [x_c1sqr2less1 GLOBAL];
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
@@ -367,16 +367,16 @@ sym(vp8_dequant_dc_idct_add_mmx):
movq mm5, mm1
movq mm4, mm3
- pmulhw mm5, [x_c1sqr2less1 GLOBAL]
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
- pmulhw mm3, [x_s1sqr2 GLOBAL]
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
- paddw mm0, [fours GLOBAL]
+ paddw mm0, [GLOBAL(fours)]
- paddw mm2, [fours GLOBAL]
+ paddw mm2, [GLOBAL(fours)]
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
diff --git a/vp8/decoder/xprintf.c b/vp8/decoder/xprintf.c
deleted file mode 100644
index e3b953ef3..000000000
--- a/vp8/decoder/xprintf.c
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-* Module Title : xprintf.cpp
-*
-* Description : Display a printf style message on the current video frame.
-*
-****************************************************************************/
-
-/****************************************************************************
-* Header Files
-****************************************************************************/
-
-#include <stdio.h>
-#include <stdarg.h>
-#ifdef _WIN32_WCE
-#include <windows.h>
-#endif
-#include "xprintf.h"
-
-/****************************************************************************
- *
- * ROUTINE : xprintf
- *
- * INPUTS : const PB_INSTANCE *ppbi : Pointer to decoder instance.
- * long n_pixel : Offset into buffer to write text.
- * const char *format : Format string for print.
- * ... : Variable length argument list.
- *
- * OUTPUTS : None.
- *
- * RETURNS : int: Size (in bytes) of the formatted text.
- *
- * FUNCTION : Display a printf style message on the current video frame.
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-int onyx_xprintf(unsigned char *ppbuffer, long n_pixel, long n_size, long n_stride, const char *format, ...)
-{
- BOOL b_rc;
- va_list arglist;
- HFONT hfont, hfonto;
-
- int rc = 0;
- char sz_formatted[256] = "";
- unsigned char *p_dest = &ppbuffer[n_pixel];
-
-#ifdef _WIN32_WCE
- // Set up temporary bitmap
- HDC hdc_memory = NULL;
- HBITMAP hbm_temp = NULL;
- HBITMAP hbm_orig = NULL;
-
- RECT rect;
-
- // Copy bitmap to video frame
- long x;
- long y;
-
- // Format text
- va_start(arglist, format);
- _vsnprintf(sz_formatted, sizeof(sz_formatted), format, arglist);
- va_end(arglist);
-
- rect.left = 0;
- rect.top = 0;
- rect.right = 8 * strlen(sz_formatted);
- rect.bottom = 8;
-
- hdc_memory = create_compatible_dc(NULL);
-
- if (hdc_memory == NULL)
- goto Exit;
-
- hbm_temp = create_bitmap(rect.right, rect.bottom, 1, 1, NULL);
-
- if (hbm_temp == NULL)
- goto Exit;
-
- hbm_orig = (HBITMAP)(select_object(hdc_memory, hbm_temp));
-
- if (!hbm_orig)
- goto Exit;
-
- // Write text into bitmap
- // font?
- hfont = create_font(8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, VARIABLE_PITCH | FF_SWISS, "");
-
- if (hfont == NULL)
- goto Exit;
-
- hfonto = (HFONT)(select_object(hdc_memory, hbm_temp));
-
- if (!hfonto)
- goto Exit;
-
- select_object(hdc_memory, hfont);
- set_text_color(hdc_memory, 1);
- set_bk_color(hdc_memory, 0);
- set_bk_mode(hdc_memory, TRANSPARENT);
-
- b_rc = bit_blt(hdc_memory, rect.left, rect.top, rect.right, rect.bottom, hdc_memory, rect.left, rect.top, BLACKNESS);
-
- if (!b_rc)
- goto Exit;
-
- b_rc = ext_text_out(hdc_memory, 0, 0, ETO_CLIPPED, &rect, sz_formatted, strlen(sz_formatted), NULL);
-
- if (!b_rc)
- goto Exit;
-
- for (y = rect.top; y < rect.bottom; ++y)
- {
- for (x = rect.left; x < rect.right; ++x)
- {
- if (get_pixel(hdc_memory, x, rect.bottom - 1 - y))
- p_dest[x] = 255;
- }
-
- p_dest += n_stride;
- }
-
- rc = strlen(sz_formatted);
-
-Exit:
-
- if (hbm_temp != NULL)
- {
- if (hbm_orig != NULL)
- {
- select_object(hdc_memory, hbm_orig);
- }
-
- delete_object(hbm_temp);
- }
-
- if (hfont != NULL)
- {
- if (hfonto != NULL)
- select_object(hdc_memory, hfonto);
-
- delete_object(hfont);
- }
-
- if (hdc_memory != NULL)
- delete_dc(hdc_memory);
-
- hdc_memory = 0;
-
-#endif
-
- return rc;
-}
diff --git a/vp8/decoder/xprintf.h b/vp8/decoder/xprintf.h
deleted file mode 100644
index f83dd39c6..000000000
--- a/vp8/decoder/xprintf.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-* Module Title : xprintf.h
-*
-* Description : Debug print interface header file.
-*
-****************************************************************************/
-#ifndef __INC_XPRINTF_H
-#define __INC_XPRINTF_H
-
-/****************************************************************************
-* Header Files
-****************************************************************************/
-
-/****************************************************************************
-* Functions
-****************************************************************************/
-
-// Display a printf style message on the current video frame
-extern int onyx_xprintf(unsigned char *ppbuffer, long n_pixel, long n_size, long n_stride, const char *format, ...);
-
-#endif
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
new file mode 100644
index 000000000..a1f110260
--- /dev/null
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "variance.h"
+#include "onyx_int.h"
+
+extern void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+
+void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+ int flags = cpi->common.rtcd.flags;
+ int has_edsp = flags & HAS_EDSP;
+ int has_media = flags & HAS_MEDIA;
+ int has_neon = flags & HAS_NEON;
+
+#if HAVE_ARMV6
+ if (has_media)
+ {
+ /*cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c;*/
+
+ /*cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;*/
+
+ /*cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
+ cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;*/
+
+ /*cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
+ cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
+
+ /*cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
+ cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;;
+ cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
+
+ /*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;*/
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_armv6;
+
+ /*cpi->rtcd.encodemb.berr = vp8_block_error_c;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_c;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_c;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;*/
+
+ /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;*/
+ }
+#endif
+
+#if HAVE_ARMV7
+ if (has_neon)
+ {
+ cpi->rtcd.variance.sad16x16 = vp8_sad16x16_neon;
+ cpi->rtcd.variance.sad16x8 = vp8_sad16x8_neon;
+ cpi->rtcd.variance.sad8x16 = vp8_sad8x16_neon;
+ cpi->rtcd.variance.sad8x8 = vp8_sad8x8_neon;
+ cpi->rtcd.variance.sad4x4 = vp8_sad4x4_neon;
+
+ /*cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;*/
+ cpi->rtcd.variance.var8x8 = vp8_variance8x8_neon;
+ cpi->rtcd.variance.var8x16 = vp8_variance8x16_neon;
+ cpi->rtcd.variance.var16x8 = vp8_variance16x8_neon;
+ cpi->rtcd.variance.var16x16 = vp8_variance16x16_neon;
+
+ /*cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;*/
+ cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_neon;
+ /*cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;*/
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_neon;
+ cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_neon;
+ cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_neon;
+ cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_neon;
+
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_neon;
+ /*cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
+
+ cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_neon;
+ /*cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
+ cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;*/
+ cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_neon;
+
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_neon;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_neon;
+ cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_neon;
+ cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_neon;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_neon;
+
+ /*cpi->rtcd.encodemb.berr = vp8_block_error_c;
+ cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c;
+ cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;*/
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_neon;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_neon;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_neon;
+
+ /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;*/
+ /* The neon quantizer has not been updated to match the new exact
+ * quantizer introduced in commit e04e2935
+ */
+ /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;*/
+ }
+#endif
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (has_neon)
+#endif
+ {
+ vp8_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
+ }
+#endif
+#endif
+}
diff --git a/vp8/encoder/arm/neon/boolhuff_armv7.asm b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
index 9c4823c51..e78dc3322 100644
--- a/vp8/encoder/arm/neon/boolhuff_armv7.asm
+++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -205,17 +205,10 @@ token_count_lt_zero_se
ldr r5, [r0, #vp8_writer_range]
ldr r3, [r0, #vp8_writer_count]
- ; reverse the stream of bits to be packed. Normally
- ; the most significant bit is peeled off and compared
- ; in the form of (v >> --n) & 1. ARM architecture has
- ; the ability to set a flag based on the value of the
- ; bit shifted off the bottom of the register. To make
- ; that happen the bitstream is reversed.
- rbit r11, r1
rsb r4, r10, #32 ; 32-n
; v is kept in r1 during the token pack loop
- lsr r1, r11, r4 ; v >>= 32 - n
+ lsl r1, r1, r4 ; r1 = v << 32 - n
encode_value_loop
sub r7, r5, #1 ; range-1
@@ -223,7 +216,7 @@ encode_value_loop
; Decisions are made based on the bit value shifted
; off of v, so set a flag here based on this.
; This value is refered to as "bb"
- lsrs r1, r1, #1 ; bit = v >> n
+ lsls r1, r1, #1 ; bit = v >> n
mov r4, r7, lsl #7 ; ((range-1) * 128)
mov r7, #1
diff --git a/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
index c19ac8250..3233d2a96 100644
--- a/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -9,7 +9,7 @@
;
- EXPORT |vp8cx_pack_tokens_armv7|
+ EXPORT |vp8cx_pack_tokens_armv5|
INCLUDE vpx_vp8_enc_asm_offsets.asm
@@ -25,7 +25,7 @@
; r3 vp8_coef_encodings
; s0 vp8_extra_bits
; s1 vp8_coef_tree
-|vp8cx_pack_tokens_armv7| PROC
+|vp8cx_pack_tokens_armv5| PROC
push {r4-r11, lr}
; Add size of xcount * sizeof (TOKENEXTRA) to get stop
@@ -57,18 +57,11 @@ while_p_lt_stop
movne lr, #2 ; i = 2
subne r8, r8, #1 ; --n
- ; reverse the stream of bits to be packed. Normally
- ; the most significant bit is peeled off and compared
- ; in the form of (v >> --n) & 1. ARM architecture has
- ; the ability to set a flag based on the value of the
- ; bit shifted off the bottom of the register. To make
- ; that happen the bitstream is reversed.
- rbit r12, r6
rsb r4, r8, #32 ; 32-n
ldr r10, [sp, #52] ; vp8_coef_tree
; v is kept in r12 during the token pack loop
- lsr r12, r12, r4 ; v >>= 32 - n
+ lsl r12, r6, r4 ; r12 = v << 32 - n
; loop start
token_loop
@@ -78,7 +71,7 @@ token_loop
; Decisions are made based on the bit value shifted
; off of v, so set a flag here based on this.
; This value is refered to as "bb"
- lsrs r12, r12, #1 ; bb = v >> n
+ lsls r12, r12, #1 ; bb = v >> n
mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
@@ -172,16 +165,15 @@ token_count_lt_zero
ldr r10, [r12, #vp8_extra_bit_struct_tree]
str r10, [sp, #4] ; b->tree
- rbit r12, r7 ; reverse v
rsb r4, r8, #32
- lsr r12, r12, r4
+ lsl r12, r7, r4
mov lr, #0 ; i = 0
extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
- lsrs r12, r12, #1 ; v >> n
+ lsls r12, r12, #1 ; v >> n
mul r4, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
diff --git a/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
index 075645586..a9b552ae1 100644
--- a/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -9,7 +9,7 @@
;
- EXPORT |vp8cx_pack_mb_row_tokens_armv7|
+ EXPORT |vp8cx_pack_mb_row_tokens_armv5|
INCLUDE vpx_vp8_enc_asm_offsets.asm
@@ -25,7 +25,7 @@
; r3 vp8_extra_bits
; s0 vp8_coef_tree
-|vp8cx_pack_mb_row_tokens_armv7| PROC
+|vp8cx_pack_mb_row_tokens_armv5| PROC
push {r4-r11, lr}
sub sp, sp, #24
@@ -78,18 +78,11 @@ while_p_lt_stop
movne lr, #2 ; i = 2
subne r8, r8, #1 ; --n
- ; reverse the stream of bits to be packed. Normally
- ; the most significant bit is peeled off and compared
- ; in the form of (v >> --n) & 1. ARM architecture has
- ; the ability to set a flag based on the value of the
- ; bit shifted off the bottom of the register. To make
- ; that happen the bitstream is reversed.
- rbit r12, r6
rsb r4, r8, #32 ; 32-n
ldr r10, [sp, #60] ; vp8_coef_tree
; v is kept in r12 during the token pack loop
- lsr r12, r12, r4 ; v >>= 32 - n
+ lsl r12, r6, r4 ; r12 = v << 32 - n
; loop start
token_loop
@@ -99,7 +92,7 @@ token_loop
; Decisions are made based on the bit value shifted
; off of v, so set a flag here based on this.
; This value is refered to as "bb"
- lsrs r12, r12, #1 ; bb = v >> n
+ lsls r12, r12, #1 ; bb = v >> n
mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
@@ -193,16 +186,15 @@ token_count_lt_zero
ldr r10, [r12, #vp8_extra_bit_struct_tree]
str r10, [sp, #4] ; b->tree
- rbit r12, r7 ; reverse v
rsb r4, r8, #32
- lsr r12, r12, r4
+ lsl r12, r7, r4
mov lr, #0 ; i = 0
extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
- lsrs r12, r12, #1 ; v >> n
+ lsls r12, r12, #1 ; v >> n
mul r4, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
diff --git a/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
index 10a3d9851..0835164e5 100644
--- a/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -9,7 +9,7 @@
;
- EXPORT |vp8cx_pack_tokens_into_partitions_armv7|
+ EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
INCLUDE vpx_vp8_enc_asm_offsets.asm
@@ -27,7 +27,7 @@
; s1 vp8_extra_bits,
; s2 const vp8_tree_index *,
-|vp8cx_pack_tokens_into_partitions_armv7| PROC
+|vp8cx_pack_tokens_into_partitions_armv5| PROC
push {r4-r11, lr}
sub sp, sp, #44
@@ -106,18 +106,11 @@ while_p_lt_stop
movne lr, #2 ; i = 2
subne r8, r8, #1 ; --n
- ; reverse the stream of bits to be packed. Normally
- ; the most significant bit is peeled off and compared
- ; in the form of (v >> --n) & 1. ARM architecture has
- ; the ability to set a flag based on the value of the
- ; bit shifted off the bottom of the register. To make
- ; that happen the bitstream is reversed.
- rbit r12, r6
rsb r4, r8, #32 ; 32-n
ldr r10, [sp, #88] ; vp8_coef_tree
; v is kept in r12 during the token pack loop
- lsr r12, r12, r4 ; v >>= 32 - n
+ lsl r12, r6, r4 ; r12 = v << 32 - n
; loop start
token_loop
@@ -127,7 +120,7 @@ token_loop
; Decisions are made based on the bit value shifted
; off of v, so set a flag here based on this.
; This value is refered to as "bb"
- lsrs r12, r12, #1 ; bb = v >> n
+ lsls r12, r12, #1 ; bb = v >> n
mul r4, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
@@ -221,16 +214,15 @@ token_count_lt_zero
ldr r10, [r12, #vp8_extra_bit_struct_tree]
str r10, [sp, #4] ; b->tree
- rbit r12, r7 ; reverse v
rsb r4, r8, #32
- lsr r12, r12, r4
+ lsl r12, r7, r4
mov lr, #0 ; i = 0
extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
- lsrs r12, r12, #1 ; v >> n
+ lsls r12, r12, #1 ; v >> n
mul r4, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
diff --git a/vp8/encoder/arm/csystemdependent.c b/vp8/encoder/arm/csystemdependent.c
deleted file mode 100644
index 8d70d635a..000000000
--- a/vp8/encoder/arm/csystemdependent.c
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "variance.h"
-#include "onyx_int.h"
-
-void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-
-void vp8_cmachine_specific_config(VP8_COMP *cpi)
-{
-#if CONFIG_RUNTIME_CPU_DETECT
- cpi->rtcd.common = &cpi->common.rtcd;
-
-#if HAVE_ARMV7
- cpi->rtcd.variance.sad16x16 = vp8_sad16x16_neon;
- cpi->rtcd.variance.sad16x8 = vp8_sad16x8_neon;
- cpi->rtcd.variance.sad8x16 = vp8_sad8x16_neon;
- cpi->rtcd.variance.sad8x8 = vp8_sad8x8_neon;
- cpi->rtcd.variance.sad4x4 = vp8_sad4x4_neon;
-
- cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
- cpi->rtcd.variance.var8x8 = vp8_variance8x8_neon;
- cpi->rtcd.variance.var8x16 = vp8_variance8x16_neon;
- cpi->rtcd.variance.var16x8 = vp8_variance16x8_neon;
- cpi->rtcd.variance.var16x16 = vp8_variance16x16_neon;
-
- cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
- cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_neon;
- cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
- cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
- cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_neon;
-
- cpi->rtcd.variance.mse16x16 = vp8_mse16x16_neon;
- cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;
-
- cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_neon;
- cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
- cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;;
- cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_neon;
-
- cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_neon;
- cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_neon;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_neon;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_neon;
- cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_neon;
-
- cpi->rtcd.encodemb.berr = vp8_block_error_c;
- cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c;
- cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;
- cpi->rtcd.encodemb.subb = vp8_subtract_b_neon;
- cpi->rtcd.encodemb.submby = vp8_subtract_mby_neon;
- cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_neon;
-
- cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
- cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
- /* The neon quantizer has not been updated to match the new exact
- * quantizer introduced in commit e04e2935
- */
- /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;*/
-#elif HAVE_ARMV6
- cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c;
- cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c;
- cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c;
- cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c;
- cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c;
-
- cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
- cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
- cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
- cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;
- cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;
-
- cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
- cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
- cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
- cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
- cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;
-
- cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
- cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;
-
- cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
- cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
- cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;;
- cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;
-
- cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
- cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;
- cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_armv6;
-
- cpi->rtcd.encodemb.berr = vp8_block_error_c;
- cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c;
- cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;
- cpi->rtcd.encodemb.subb = vp8_subtract_b_c;
- cpi->rtcd.encodemb.submby = vp8_subtract_mby_c;
- cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;
-
- cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
- cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
-#else
- //pure c
- cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c;
- cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c;
- cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c;
- cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c;
- cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c;
-
- cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
- cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
- cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
- cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;
- cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;
-
- cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
- cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
- cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
- cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
- cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;
-
- cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
- cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;
-
- cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
- cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
- cpi->rtcd.variance.get16x16var = vp8_get16x16var_c;;
- cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;
-
- cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
- cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;
- cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
-
- cpi->rtcd.encodemb.berr = vp8_block_error_c;
- cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c;
- cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;
- cpi->rtcd.encodemb.subb = vp8_subtract_b_c;
- cpi->rtcd.encodemb.submby = vp8_subtract_mby_c;
- cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;
-
- cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
- cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
-#endif
-#endif
-
-#if HAVE_ARMV7
- vp8_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
-#else
- vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
-#endif
-}
diff --git a/vp8/encoder/arm/dct_arm.h b/vp8/encoder/arm/dct_arm.h
index 774599bf0..41fa5d192 100644
--- a/vp8/encoder/arm/dct_arm.h
+++ b/vp8/encoder/arm/dct_arm.h
@@ -15,9 +15,11 @@
#if HAVE_ARMV6
extern prototype_fdct(vp8_short_walsh4x4_armv6);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
#endif
+#endif
#if HAVE_ARMV7
extern prototype_fdct(vp8_short_fdct4x4_neon);
@@ -26,6 +28,7 @@ extern prototype_fdct(vp8_fast_fdct4x4_neon);
extern prototype_fdct(vp8_fast_fdct8x4_neon);
extern prototype_fdct(vp8_short_walsh4x4_neon);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_neon
@@ -40,6 +43,7 @@ extern prototype_fdct(vp8_short_walsh4x4_neon);
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
+#endif
#endif
diff --git a/vp8/encoder/arm/encodemb_arm.h b/vp8/encoder/arm/encodemb_arm.h
index eb699433f..8fe453735 100644
--- a/vp8/encoder/arm/encodemb_arm.h
+++ b/vp8/encoder/arm/encodemb_arm.h
@@ -30,6 +30,7 @@ extern prototype_submbuv(vp8_subtract_mbuv_neon);
//#undef vp8_encodemb_mbuverr
//#define vp8_encodemb_mbuverr vp8_mbuverror_c
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_encodemb_subb
#define vp8_encodemb_subb vp8_subtract_b_neon
@@ -38,6 +39,7 @@ extern prototype_submbuv(vp8_subtract_mbuv_neon);
#undef vp8_encodemb_submbuv
#define vp8_encodemb_submbuv vp8_subtract_mbuv_neon
+#endif
#endif
diff --git a/vp8/encoder/arm/mcomp_arm.c b/vp8/encoder/arm/mcomp_arm.c
deleted file mode 100644
index 4e95c47ac..000000000
--- a/vp8/encoder/arm/mcomp_arm.c
+++ /dev/null
@@ -1,1663 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "mcomp.h"
-#include "vpx_mem/vpx_mem.h"
-
-#include <stdio.h>
-#include <limits.h>
-#include <math.h>
-
-#ifdef ENTROPY_STATS
-static int mv_ref_ct [31] [4] [2];
-static int mv_mode_cts [4] [2];
-#endif
-
-static int mv_bits_sadcost[256];
-
-extern unsigned int vp8_sub_pixel_variance16x16s_neon
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-);
-extern unsigned int vp8_sub_pixel_variance16x16s_4_0_neon
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-);
-extern unsigned int vp8_sub_pixel_variance16x16s_0_4_neon
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-);
-extern unsigned int vp8_sub_pixel_variance16x16s_4_4_neon
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-);
-
-void vp8cx_init_mv_bits_sadcost()
-{
- int i;
-
- for (i = 0; i < 256; i++)
- {
- mv_bits_sadcost[i] = (int)sqrt(i * 16);
- }
-}
-
-
-int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight)
-{
- // MV costing is based on the distribution of vectors in the previous frame and as such will tend to
- // over state the cost of vectors. In addition coding a new vector can have a knock on effect on the
- // cost of subsequent vectors and the quality of prediction from NEAR and NEAREST for subsequent blocks.
- // The "Weight" parameter allows, to a limited extent, for some account to be taken of these factors.
- return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * Weight) >> 7;
-}
-
-int vp8_mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
-{
- //int i;
- //return ((mvcost[0][(mv->row - ref->row)>>1] + mvcost[1][(mv->col - ref->col)>>1] + 128) * error_per_bit) >> 8;
- //return ( (vp8_mv_bit_cost(mv, ref, mvcost, 100) + 128) * error_per_bit) >> 8;
-
- //i = (vp8_mv_bit_cost(mv, ref, mvcost, 100) * error_per_bit + 128) >> 8;
- return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * error_per_bit + 128) >> 8;
- //return (vp8_mv_bit_cost(mv, ref, mvcost, 128) * error_per_bit + 128) >> 8;
-}
-
-
-static int mv_bits(MV *mv, MV *ref, int *mvcost[2])
-{
- // get the estimated number of bits for a motion vector, to be used for costing in SAD based
- // motion estimation
- return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col)>> 1]) + 128) >> 8;
-}
-
-void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride)
-{
- int Len;
- int search_site_count = 0;
-
-
- // Generate offsets for 4 search sites per step.
- Len = MAX_FIRST_STEP;
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = 0;
- search_site_count++;
-
- while (Len > 0)
- {
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = -Len;
- x->ss[search_site_count].offset = -Len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = Len;
- x->ss[search_site_count].offset = Len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -Len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = -Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = Len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = Len;
- search_site_count++;
-
- // Contract.
- Len /= 2;
- }
-
- x->ss_count = search_site_count;
- x->searches_per_step = 4;
-}
-
-void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
-{
- int Len;
- int search_site_count = 0;
-
- // Generate offsets for 8 search sites per step.
- Len = MAX_FIRST_STEP;
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = 0;
- search_site_count++;
-
- while (Len > 0)
- {
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = -Len;
- x->ss[search_site_count].offset = -Len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = Len;
- x->ss[search_site_count].offset = Len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -Len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = -Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = Len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -Len;
- x->ss[search_site_count].mv.row = -Len;
- x->ss[search_site_count].offset = -Len * stride - Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = Len;
- x->ss[search_site_count].mv.row = -Len;
- x->ss[search_site_count].offset = -Len * stride + Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -Len;
- x->ss[search_site_count].mv.row = Len;
- x->ss[search_site_count].offset = Len * stride - Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = Len;
- x->ss[search_site_count].mv.row = Len;
- x->ss[search_site_count].offset = Len * stride + Len;
- search_site_count++;
-
-
- // Contract.
- Len /= 2;
- }
-
- x->ss_count = search_site_count;
- x->searches_per_step = 8;
-}
-
-
-#define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
-#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motionvector
-#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
-#define DIST(r,c) svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
-#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
-#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
-#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
-
-//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
-
-int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
-{
- unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
- unsigned char *z = (*(b->base_src) + b->src);
-
- int rr = ref_mv->row >> 1, rc = ref_mv->col >> 1;
- int br = bestmv->row << 2, bc = bestmv->col << 2;
- int tr = br, tc = bc;
- unsigned int besterr = INT_MAX;
- unsigned int left, right, up, down, diag;
- unsigned int sse;
- unsigned int whichdir;
- unsigned int halfiters = 4;
- unsigned int quarteriters = 4;
-
- int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1));
- int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1));
- int minr = MAX(x->mv_row_min << 2, (ref_mv->row >> 1) - ((1 << mvlong_width) - 1));
- int maxr = MIN(x->mv_row_max << 2, (ref_mv->row >> 1) + ((1 << mvlong_width) - 1));
-
- // central mv
- bestmv->row <<= 3;
- bestmv->col <<= 3;
-
- // calculate central point error
- besterr = vf(y, d->pre_stride, z, b->src_stride, &sse);
- besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
-
- // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
- while (--halfiters)
- {
- // 1/2 pel
- CHECK_BETTER(left, tr, tc - 2);
- CHECK_BETTER(right, tr, tc + 2);
- CHECK_BETTER(up, tr - 2, tc);
- CHECK_BETTER(down, tr + 2, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
- switch (whichdir)
- {
- case 0:
- CHECK_BETTER(diag, tr - 2, tc - 2);
- break;
- case 1:
- CHECK_BETTER(diag, tr - 2, tc + 2);
- break;
- case 2:
- CHECK_BETTER(diag, tr + 2, tc - 2);
- break;
- case 3:
- CHECK_BETTER(diag, tr + 2, tc + 2);
- break;
- }
-
- // no reason to check the same one again.
- if (tr == br && tc == bc)
- break;
-
- tr = br;
- tc = bc;
- }
-
- // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
- // 1/4 pel
- while (--quarteriters)
- {
- CHECK_BETTER(left, tr, tc - 1);
- CHECK_BETTER(right, tr, tc + 1);
- CHECK_BETTER(up, tr - 1, tc);
- CHECK_BETTER(down, tr + 1, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
- switch (whichdir)
- {
- case 0:
- CHECK_BETTER(diag, tr - 1, tc - 1);
- break;
- case 1:
- CHECK_BETTER(diag, tr - 1, tc + 1);
- break;
- case 2:
- CHECK_BETTER(diag, tr + 1, tc - 1);
- break;
- case 3:
- CHECK_BETTER(diag, tr + 1, tc + 1);
- break;
- }
-
- // no reason to check the same one again.
- if (tr == br && tc == bc)
- break;
-
- tr = br;
- tc = bc;
- }
-
- bestmv->row = br << 1;
- bestmv->col = bc << 1;
-
- if ((abs(bestmv->col - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs(bestmv->row - ref_mv->row) > MAX_FULL_PEL_VAL))
- return INT_MAX;
-
- return besterr;
-}
-#undef MVC
-#undef PRE
-#undef SP
-#undef DIST
-#undef ERR
-#undef CHECK_BETTER
-#undef MIN
-#undef MAX
-int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
-{
- int bestmse = INT_MAX;
- MV startmv;
- //MV this_mv;
- MV this_mv;
- unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
- unsigned char *z = (*(b->base_src) + b->src);
- int left, right, up, down, diag;
- unsigned int sse;
- int whichdir ;
-
-
- // Trap uncodable vectors
- if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
- {
- bestmv->row <<= 3;
- bestmv->col <<= 3;
- return INT_MAX;
- }
-
- // central mv
- bestmv->row <<= 3;
- bestmv->col <<= 3;
- startmv = *bestmv;
-
- // calculate central point error
- bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
- bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
-
- // go left then right and check error
- this_mv.row = startmv.row;
- this_mv.col = ((startmv.col - 8) | 4);
- left = vp8_sub_pixel_variance16x16s_4_0_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
- left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (left < bestmse)
- {
- *bestmv = this_mv;
- bestmse = left;
- }
-
- this_mv.col += 8;
- right = vp8_sub_pixel_variance16x16s_4_0_neon(y, d->pre_stride, z, b->src_stride, &sse);
- right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (right < bestmse)
- {
- *bestmv = this_mv;
- bestmse = right;
- }
-
- // go up then down and check error
- this_mv.col = startmv.col;
- this_mv.row = ((startmv.row - 8) | 4);
- up = vp8_sub_pixel_variance16x16s_0_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
- up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (up < bestmse)
- {
- *bestmv = this_mv;
- bestmse = up;
- }
-
- this_mv.row += 8;
- down = vp8_sub_pixel_variance16x16s_0_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
- down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (down < bestmse)
- {
- *bestmv = this_mv;
- bestmse = down;
- }
-
-
- // now check 1 more diagonal
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
- //for(whichdir =0;whichdir<4;whichdir++)
- //{
- this_mv = startmv;
-
- switch (whichdir)
- {
- case 0:
- this_mv.col = (this_mv.col - 8) | 4;
- this_mv.row = (this_mv.row - 8) | 4;
- diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
- break;
- case 1:
- this_mv.col += 4;
- this_mv.row = (this_mv.row - 8) | 4;
- diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
- break;
- case 2:
- this_mv.col = (this_mv.col - 8) | 4;
- this_mv.row += 4;
- diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
- break;
- case 3:
- this_mv.col += 4;
- this_mv.row += 4;
- diag = vp8_sub_pixel_variance16x16s_4_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
- break;
- }
-
- diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (diag < bestmse)
- {
- *bestmv = this_mv;
- bestmse = diag;
- }
-
-// }
-
-
- // time to check quarter pels.
- if (bestmv->row < startmv.row)
- y -= d->pre_stride;
-
- if (bestmv->col < startmv.col)
- y--;
-
- startmv = *bestmv;
-
-
-
- // go left then right and check error
- this_mv.row = startmv.row;
-
- if (startmv.col & 7)
- {
- this_mv.col = startmv.col - 2;
- left = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
- }
- else
- {
- this_mv.col = (startmv.col - 8) | 6;
- left = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
- }
-
- left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (left < bestmse)
- {
- *bestmv = this_mv;
- bestmse = left;
- }
-
- this_mv.col += 4;
- right = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
- right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (right < bestmse)
- {
- *bestmv = this_mv;
- bestmse = right;
- }
-
- // go up then down and check error
- this_mv.col = startmv.col;
-
- if (startmv.row & 7)
- {
- this_mv.row = startmv.row - 2;
- up = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
- }
- else
- {
- this_mv.row = (startmv.row - 8) | 6;
- up = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
- }
-
- up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (up < bestmse)
- {
- *bestmv = this_mv;
- bestmse = up;
- }
-
- this_mv.row += 4;
- down = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
- down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (down < bestmse)
- {
- *bestmv = this_mv;
- bestmse = down;
- }
-
-
- // now check 1 more diagonal
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-// for(whichdir=0;whichdir<4;whichdir++)
-// {
- this_mv = startmv;
-
- switch (whichdir)
- {
- case 0:
-
- if (startmv.row & 7)
- {
- this_mv.row -= 2;
-
- if (startmv.col & 7)
- {
- this_mv.col -= 2;
- diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
- }
- else
- {
- this_mv.col = (startmv.col - 8) | 6;
- diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
- }
- }
- else
- {
- this_mv.row = (startmv.row - 8) | 6;
-
- if (startmv.col & 7)
- {
- this_mv.col -= 2;
- diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
- }
- else
- {
- this_mv.col = (startmv.col - 8) | 6;
- diag = svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
- }
- }
-
- break;
- case 1:
- this_mv.col += 2;
-
- if (startmv.row & 7)
- {
- this_mv.row -= 2;
- diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
- }
- else
- {
- this_mv.row = (startmv.row - 8) | 6;
- diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
- }
-
- break;
- case 2:
- this_mv.row += 2;
-
- if (startmv.col & 7)
- {
- this_mv.col -= 2;
- diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
- }
- else
- {
- this_mv.col = (startmv.col - 8) | 6;
- diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
- }
-
- break;
- case 3:
- this_mv.col += 2;
- this_mv.row += 2;
- diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
- break;
- }
-
- diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (diag < bestmse)
- {
- *bestmv = this_mv;
- bestmse = diag;
- }
-
-// }
-
- return bestmse;
-}
-
-int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
-{
- int bestmse = INT_MAX;
- MV startmv;
- //MV this_mv;
- MV this_mv;
- unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
- unsigned char *z = (*(b->base_src) + b->src);
- int left, right, up, down, diag;
- unsigned int sse;
-
- // Trap uncodable vectors
- if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
- {
- bestmv->row <<= 3;
- bestmv->col <<= 3;
- return INT_MAX;
- }
-
- // central mv
- bestmv->row <<= 3;
- bestmv->col <<= 3;
- startmv = *bestmv;
-
- // calculate central point error
- bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
- bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
-
- // go left then right and check error
- this_mv.row = startmv.row;
- this_mv.col = ((startmv.col - 8) | 4);
- left = vp8_sub_pixel_variance16x16s_4_0_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
- left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (left < bestmse)
- {
- *bestmv = this_mv;
- bestmse = left;
- }
-
- this_mv.col += 8;
- right = vp8_sub_pixel_variance16x16s_4_0_neon(y, d->pre_stride, z, b->src_stride, &sse);
- right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (right < bestmse)
- {
- *bestmv = this_mv;
- bestmse = right;
- }
-
- // go up then down and check error
- this_mv.col = startmv.col;
- this_mv.row = ((startmv.row - 8) | 4);
- up = vp8_sub_pixel_variance16x16s_0_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
- up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (up < bestmse)
- {
- *bestmv = this_mv;
- bestmse = up;
- }
-
- this_mv.row += 8;
- down = vp8_sub_pixel_variance16x16s_0_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
- down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (down < bestmse)
- {
- *bestmv = this_mv;
- bestmse = down;
- }
-
- // somewhat strangely not doing all the diagonals for half pel is slower than doing them.
-#if 0
- // now check 1 more diagonal -
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
- this_mv = startmv;
-
- switch (whichdir)
- {
- case 0:
- this_mv.col = (this_mv.col - 8) | 4;
- this_mv.row = (this_mv.row - 8) | 4;
- diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
- break;
- case 1:
- this_mv.col += 4;
- this_mv.row = (this_mv.row - 8) | 4;
- diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
- break;
- case 2:
- this_mv.col = (this_mv.col - 8) | 4;
- this_mv.row += 4;
- diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
- break;
- case 3:
- this_mv.col += 4;
- this_mv.row += 4;
- diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
- break;
- }
-
- diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (diag < bestmse)
- {
- *bestmv = this_mv;
- bestmse = diag;
- }
-
-#else
- this_mv.col = (this_mv.col - 8) | 4;
- this_mv.row = (this_mv.row - 8) | 4;
- diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
- diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (diag < bestmse)
- {
- *bestmv = this_mv;
- bestmse = diag;
- }
-
- this_mv.col += 8;
- diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
- diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (diag < bestmse)
- {
- *bestmv = this_mv;
- bestmse = diag;
- }
-
- this_mv.col = (this_mv.col - 8) | 4;
- this_mv.row = startmv.row + 4;
- diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
- diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (diag < bestmse)
- {
- *bestmv = this_mv;
- bestmse = diag;
- }
-
- this_mv.col += 8;
- diag = vp8_sub_pixel_variance16x16s_4_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
- diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
- if (diag < bestmse)
- {
- *bestmv = this_mv;
- bestmse = diag;
- }
-
-#endif
- return bestmse;
-}
-
-#if 1
-
-#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
-#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motionvector
-#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
-#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
-#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
-const MV next_chkpts[6][3] =
-{
- {{ -2, 0}, { -1, -2}, {1, -2}},
- {{ -1, -2}, {1, -2}, {2, 0}},
- {{1, -2}, {2, 0}, {1, 2}},
- {{2, 0}, {1, 2}, { -1, 2}},
- {{1, 2}, { -1, 2}, { -2, 0}},
- {{ -1, 2}, { -2, 0}, { -1, -2}}
-};
-int vp8_hex_search
-(
- MACROBLOCK *x,
- BLOCK *b,
- BLOCKD *d,
- MV *ref_mv,
- MV *best_mv,
- int search_param,
- int error_per_bit,
- int *num00,
- vp8_variance_fn_t vf,
- vp8_sad_fn_t sf,
- int *mvsadcost[2],
- int *mvcost[2]
-)
-{
- MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
- MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
- int i, j;
- unsigned char *src = (*(b->base_src) + b->src);
- int src_stride = b->src_stride;
- int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
- unsigned int besterr, thiserr = 0x7fffffff;
- int k = -1, tk;
-
- if (bc < x->mv_col_min) bc = x->mv_col_min;
-
- if (bc > x->mv_col_max) bc = x->mv_col_max;
-
- if (br < x->mv_row_min) br = x->mv_row_min;
-
- if (br > x->mv_row_max) br = x->mv_row_max;
-
- rr >>= 1;
- rc >>= 1;
-
- besterr = ERR(br, bc, thiserr);
-
- // hex search
- //j=0
- tr = br;
- tc = bc;
-
- for (i = 0; i < 6; i++)
- {
- int nr = tr + hex[i].row, nc = tc + hex[i].col;
-
- if (nc < x->mv_col_min) continue;
-
- if (nc > x->mv_col_max) continue;
-
- if (nr < x->mv_row_min) continue;
-
- if (nr > x->mv_row_max) continue;
-
- //CHECK_BETTER(thiserr,nr,nc);
- if ((thiserr = ERR(nr, nc, besterr)) < besterr)
- {
- besterr = thiserr;
- br = nr;
- bc = nc;
- k = i;
- }
- }
-
- if (tr == br && tc == bc)
- goto cal_neighbors;
-
- for (j = 1; j < 127; j++)
- {
- tr = br;
- tc = bc;
- tk = k;
-
- for (i = 0; i < 3; i++)
- {
- int nr = tr + next_chkpts[tk][i].row, nc = tc + next_chkpts[tk][i].col;
-
- if (nc < x->mv_col_min) continue;
-
- if (nc > x->mv_col_max) continue;
-
- if (nr < x->mv_row_min) continue;
-
- if (nr > x->mv_row_max) continue;
-
- //CHECK_BETTER(thiserr,nr,nc);
- if ((thiserr = ERR(nr, nc, besterr)) < besterr)
- {
- besterr = thiserr;
- br = nr;
- bc = nc; //k=(tk+5+i)%6;}
- k = tk + 5 + i;
-
- if (k >= 12) k -= 12;
- else if (k >= 6) k -= 6;
- }
- }
-
- if (tr == br && tc == bc)
- break;
- }
-
- // check 8 1 away neighbors
-cal_neighbors:
- tr = br;
- tc = bc;
-
- for (i = 0; i < 8; i++)
- {
- int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
-
- if (nc < x->mv_col_min) continue;
-
- if (nc > x->mv_col_max) continue;
-
- if (nr < x->mv_row_min) continue;
-
- if (nr > x->mv_row_max) continue;
-
- CHECK_BETTER(thiserr, nr, nc);
- }
-
- best_mv->row = br;
- best_mv->col = bc;
-
- return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
-}
-#undef MVC
-#undef PRE
-#undef SP
-#undef DIST
-#undef ERR
-#undef CHECK_BETTER
-
-#else
-
-#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
-#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motionvector
-#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
-#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
-#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
-
-int vp8_hex_search
-(
- MACROBLOCK *x,
- BLOCK *b,
- BLOCKD *d,
- MV *ref_mv,
- MV *best_mv,
- int search_param,
- int error_per_bit,
- int *num00,
- vp8_variance_fn_t vf,
- vp8_sad_fn_t sf,
- int *mvsadcost[2],
- int *mvcost[2]
-)
-{
- MV hex[6] = { { -2, 0}, { -1, -2}, { -1, 2}, {2, 0}, {1, 2}, {1, -2} } ;
- MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
- int i, j;
- unsigned char *src = (*(b->base_src) + b->src);
- int src_stride = b->src_stride;
- //int rr= ref_mv->row,rc= ref_mv->col,br=rr,bc=rc,tr,tc;
- int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
- unsigned int besterr, thiserr = 0x7fffffff;
-
- /*
- if ( rc < x->mv_col_min) bc = x->mv_col_min;
- if ( rc > x->mv_col_max) bc = x->mv_col_max;
- if ( rr < x->mv_row_min) br = x->mv_row_min;
- if ( rr > x->mv_row_max) br = x->mv_row_max;
- rr>>=1;
- rc>>=1;
- br>>=3;
- bc>>=3;
- */
- if (bc < x->mv_col_min) bc = x->mv_col_min;
-
- if (bc > x->mv_col_max) bc = x->mv_col_max;
-
- if (br < x->mv_row_min) br = x->mv_row_min;
-
- if (br > x->mv_row_max) br = x->mv_row_max;
-
- rr >>= 1;
- rc >>= 1;
-
- besterr = ERR(br, bc, thiserr);
-
- // hex search jbb changed to 127 to avoid max 256 problem steping by 2.
- for (j = 0; j < 127; j++)
- {
- tr = br;
- tc = bc;
-
- for (i = 0; i < 6; i++)
- {
- int nr = tr + hex[i].row, nc = tc + hex[i].col;
-
- if (nc < x->mv_col_min) continue;
-
- if (nc > x->mv_col_max) continue;
-
- if (nr < x->mv_row_min) continue;
-
- if (nr > x->mv_row_max) continue;
-
- CHECK_BETTER(thiserr, nr, nc);
- }
-
- if (tr == br && tc == bc)
- break;
- }
-
- // check 8 1 away neighbors
- tr = br;
- tc = bc;
-
- for (i = 0; i < 8; i++)
- {
- int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
-
- if (nc < x->mv_col_min) continue;
-
- if (nc > x->mv_col_max) continue;
-
- if (nr < x->mv_row_min) continue;
-
- if (nr > x->mv_row_max) continue;
-
- CHECK_BETTER(thiserr, nr, nc);
- }
-
- best_mv->row = br;
- best_mv->col = bc;
-
- return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
-}
-#undef MVC
-#undef PRE
-#undef SP
-#undef DIST
-#undef ERR
-#undef CHECK_BETTER
-
-#endif
-
-int vp8_diamond_search_sad
-(
- MACROBLOCK *x,
- BLOCK *b,
- BLOCKD *d,
- MV *ref_mv,
- MV *best_mv,
- int search_param,
- int error_per_bit,
- int *num00,
- vp8_variance_fn_ptr_t *fn_ptr,
- int *mvsadcost[2],
- int *mvcost[2]
-)
-{
- int i, j, step;
-
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- unsigned char *in_what;
- int in_what_stride = d->pre_stride;
- unsigned char *best_address;
-
- int tot_steps;
- MV this_mv;
-
- int bestsad = INT_MAX;
- int best_site = 0;
- int last_site = 0;
-
- int ref_row = ref_mv->row >> 3;
- int ref_col = ref_mv->col >> 3;
- int this_row_offset;
- int this_col_offset;
- search_site *ss;
-
- unsigned char *check_here;
- int thissad;
-
- // Work out the start point for the search
- in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
- best_address = in_what;
-
- // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
- if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
- (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
- {
- // Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
- }
-
- // search_param determines the length of the initial step and hence the number of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
- ss = &x->ss[search_param * x->searches_per_step];
- tot_steps = (x->ss_count / x->searches_per_step) - search_param;
-
- i = 1;
- best_mv->row = ref_row;
- best_mv->col = ref_col;
-
- *num00 = 0;
-
- for (step = 0; step < tot_steps ; step++)
- {
- for (j = 0 ; j < x->searches_per_step ; j++)
- {
- // Trap illegal vectors
- this_row_offset = best_mv->row + ss[i].mv.row;
- this_col_offset = best_mv->col + ss[i].mv.col;
-
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
-
- {
- check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
-
- if (thissad < bestsad)
- {
- this_mv.row = this_row_offset << 3;
- this_mv.col = this_col_offset << 3;
- thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
-
- if (thissad < bestsad)
- {
- bestsad = thissad;
- best_site = i;
- }
- }
- }
-
- i++;
- }
-
- if (best_site != last_site)
- {
- best_mv->row += ss[best_site].mv.row;
- best_mv->col += ss[best_site].mv.col;
- best_address += ss[best_site].offset;
- last_site = best_site;
- }
- else if (best_address == in_what)
- (*num00)++;
- }
-
- this_mv.row = best_mv->row << 3;
- this_mv.col = best_mv->col << 3;
-
- if (bestsad == INT_MAX)
- return INT_MAX;
-
- return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
- + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-}
-
-int vp8_diamond_search_sadx4
-(
- MACROBLOCK *x,
- BLOCK *b,
- BLOCKD *d,
- MV *ref_mv,
- MV *best_mv,
- int search_param,
- int error_per_bit,
- int *num00,
- vp8_variance_fn_ptr_t *fn_ptr,
- int *mvsadcost[2],
- int *mvcost[2]
-)
-{
- int i, j, step;
-
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- unsigned char *in_what;
- int in_what_stride = d->pre_stride;
- unsigned char *best_address;
-
- int tot_steps;
- MV this_mv;
-
- int bestsad = INT_MAX;
- int best_site = 0;
- int last_site = 0;
-
- int ref_row = ref_mv->row >> 3;
- int ref_col = ref_mv->col >> 3;
- int this_row_offset;
- int this_col_offset;
- search_site *ss;
-
- unsigned char *check_here;
- int thissad;
-
- // Work out the start point for the search
- in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
- best_address = in_what;
-
- // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
- if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
- (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
- {
- // Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
- }
-
- // search_param determines the length of the initial step and hence the number of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
- ss = &x->ss[search_param * x->searches_per_step];
- tot_steps = (x->ss_count / x->searches_per_step) - search_param;
-
- i = 1;
- best_mv->row = ref_row;
- best_mv->col = ref_col;
-
- *num00 = 0;
-
- for (step = 0; step < tot_steps ; step++)
- {
- int check_row_min, check_col_min, check_row_max, check_col_max;
-
- check_row_min = x->mv_row_min - best_mv->row;
- check_row_max = x->mv_row_max - best_mv->row;
- check_col_min = x->mv_col_min - best_mv->col;
- check_col_max = x->mv_col_max - best_mv->col;
-
- for (j = 0 ; j < x->searches_per_step ; j += 4)
- {
- char *block_offset[4];
- unsigned int valid_block[4];
- int all_in = 1, t;
-
- for (t = 0; t < 4; t++)
- {
- valid_block [t] = (ss[t+i].mv.col > check_col_min);
- valid_block [t] &= (ss[t+i].mv.col < check_col_max);
- valid_block [t] &= (ss[t+i].mv.row > check_row_min);
- valid_block [t] &= (ss[t+i].mv.row < check_row_max);
-
- all_in &= valid_block[t];
- block_offset[t] = ss[i+t].offset + best_address;
- }
-
- if (all_in)
- {
- int sad_array[4];
-
- fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
-
- for (t = 0; t < 4; t++, i++)
- {
- thissad = sad_array[t];
-
- if (thissad < bestsad)
- {
- this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
- this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
- thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
-
- if (thissad < bestsad)
- {
- bestsad = thissad;
- best_site = i;
- }
- }
- }
- }
- else
- {
- int t;
-
- for (t = 0; t < 4; i++, t++)
- {
- // Trap illegal vectors
- if (valid_block[t])
-
- {
- check_here = block_offset[t];
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
-
- if (thissad < bestsad)
- {
- this_row_offset = best_mv->row + ss[i].mv.row;
- this_col_offset = best_mv->col + ss[i].mv.col;
-
- this_mv.row = this_row_offset << 3;
- this_mv.col = this_col_offset << 3;
- thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
-
- if (thissad < bestsad)
- {
- bestsad = thissad;
- best_site = i;
- }
- }
- }
- }
- }
- }
-
- if (best_site != last_site)
- {
- best_mv->row += ss[best_site].mv.row;
- best_mv->col += ss[best_site].mv.col;
- best_address += ss[best_site].offset;
- last_site = best_site;
- }
- else if (best_address == in_what)
- (*num00)++;
- }
-
- this_mv.row = best_mv->row << 3;
- this_mv.col = best_mv->col << 3;
-
- if (bestsad == INT_MAX)
- return INT_MAX;
-
- return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
- + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-}
-
-
-#if !(CONFIG_REALTIME_ONLY)
-int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
-{
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- unsigned char *in_what;
- int in_what_stride = d->pre_stride;
- int mv_stride = d->pre_stride;
- unsigned char *bestaddress;
- MV *best_mv = &d->bmi.mv.as_mv;
- MV this_mv;
- int bestsad = INT_MAX;
- int r, c;
-
- unsigned char *check_here;
- int thissad;
-
- int ref_row = ref_mv->row >> 3;
- int ref_col = ref_mv->col >> 3;
-
- int row_min = ref_row - distance;
- int row_max = ref_row + distance;
- int col_min = ref_col - distance;
- int col_max = ref_col + distance;
-
- // Work out the mid point for the search
- in_what = *(d->base_pre) + d->pre;
- bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
- best_mv->row = ref_row;
- best_mv->col = ref_col;
-
- // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
- if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
- (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
- {
- // Baseline value at the centre
-
- //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
- }
-
- // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
- if (col_min < x->mv_col_min)
- col_min = x->mv_col_min;
-
- if (col_max > x->mv_col_max)
- col_max = x->mv_col_max;
-
- if (row_min < x->mv_row_min)
- row_min = x->mv_row_min;
-
- if (row_max > x->mv_row_max)
- row_max = x->mv_row_max;
-
- for (r = row_min; r < row_max ; r++)
- {
- this_mv.row = r << 3;
- check_here = r * mv_stride + in_what + col_min;
-
- for (c = col_min; c < col_max; c++)
- {
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
-
- this_mv.col = c << 3;
- //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
- //thissad += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
- thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
-
- if (thissad < bestsad)
- {
- bestsad = thissad;
- best_mv->row = r;
- best_mv->col = c;
- bestaddress = check_here;
- }
-
- check_here++;
- }
- }
-
- this_mv.row = best_mv->row << 3;
- this_mv.col = best_mv->col << 3;
-
- if (bestsad < INT_MAX)
- return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
- + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
- else
- return INT_MAX;
-}
-
-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
-{
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- unsigned char *in_what;
- int in_what_stride = d->pre_stride;
- int mv_stride = d->pre_stride;
- unsigned char *bestaddress;
- MV *best_mv = &d->bmi.mv.as_mv;
- MV this_mv;
- int bestsad = INT_MAX;
- int r, c;
-
- unsigned char *check_here;
- int thissad;
-
- int ref_row = ref_mv->row >> 3;
- int ref_col = ref_mv->col >> 3;
-
- int row_min = ref_row - distance;
- int row_max = ref_row + distance;
- int col_min = ref_col - distance;
- int col_max = ref_col + distance;
-
- int sad_array[3];
-
- // Work out the mid point for the search
- in_what = *(d->base_pre) + d->pre;
- bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
- best_mv->row = ref_row;
- best_mv->col = ref_col;
-
- // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
- if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
- (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
- {
- // Baseline value at the centre
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
- }
-
- // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
- if (col_min < x->mv_col_min)
- col_min = x->mv_col_min;
-
- if (col_max > x->mv_col_max)
- col_max = x->mv_col_max;
-
- if (row_min < x->mv_row_min)
- row_min = x->mv_row_min;
-
- if (row_max > x->mv_row_max)
- row_max = x->mv_row_max;
-
- for (r = row_min; r < row_max ; r++)
- {
- this_mv.row = r << 3;
- check_here = r * mv_stride + in_what + col_min;
- c = col_min;
-
- while ((c + 3) < col_max)
- {
- int i;
-
- fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
-
- for (i = 0; i < 3; i++)
- {
- thissad = sad_array[i];
-
- if (thissad < bestsad)
- {
- this_mv.col = c << 3;
- thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
-
- if (thissad < bestsad)
- {
- bestsad = thissad;
- best_mv->row = r;
- best_mv->col = c;
- bestaddress = check_here;
- }
- }
-
- check_here++;
- c++;
- }
- }
-
- while (c < col_max)
- {
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
-
- if (thissad < bestsad)
- {
- this_mv.col = c << 3;
- thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
-
- if (thissad < bestsad)
- {
- bestsad = thissad;
- best_mv->row = r;
- best_mv->col = c;
- bestaddress = check_here;
- }
- }
-
- check_here ++;
- c ++;
- }
-
- }
-
- this_mv.row = best_mv->row << 3;
- this_mv.col = best_mv->col << 3;
-
- if (bestsad < INT_MAX)
- return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
- + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
- else
- return INT_MAX;
-}
-#endif
-
-#ifdef ENTROPY_STATS
-void print_mode_context(void)
-{
- FILE *f = fopen("modecont.c", "w");
- int i, j;
-
- fprintf(f, "#include \"entropy.h\"\n");
- fprintf(f, "const int vp8_mode_contexts[6][4] =\n");
- fprintf(f, "{\n");
-
- for (j = 0; j < 6; j++)
- {
- fprintf(f, " { // %d \n", j);
- fprintf(f, " ");
-
- for (i = 0; i < 4; i++)
- {
- int overal_prob;
- int this_prob;
- int count; // = mv_ref_ct[j][i][0]+mv_ref_ct[j][i][1];
-
- // Overall probs
- count = mv_mode_cts[i][0] + mv_mode_cts[i][1];
-
- if (count)
- overal_prob = 256 * mv_mode_cts[i][0] / count;
- else
- overal_prob = 128;
-
- if (overal_prob == 0)
- overal_prob = 1;
-
- // context probs
- count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
-
- if (count)
- this_prob = 256 * mv_ref_ct[j][i][0] / count;
- else
- this_prob = 128;
-
- if (this_prob == 0)
- this_prob = 1;
-
- fprintf(f, "%5d, ", this_prob);
- //fprintf(f,"%5d, %5d, %8d,", this_prob, overal_prob, (this_prob << 10)/overal_prob);
- //fprintf(f,"%8d, ", (this_prob << 10)/overal_prob);
- }
-
- fprintf(f, " },\n");
- }
-
- fprintf(f, "};\n");
- fclose(f);
-}
-
-/* MV ref count ENTROPY_STATS stats code */
-#ifdef ENTROPY_STATS
-void init_mv_ref_counts()
-{
- vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
- vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
-}
-
-void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
-{
- if (m == ZEROMV)
- {
- ++mv_ref_ct [ct[0]] [0] [0];
- ++mv_mode_cts[0][0];
- }
- else
- {
- ++mv_ref_ct [ct[0]] [0] [1];
- ++mv_mode_cts[0][1];
-
- if (m == NEARESTMV)
- {
- ++mv_ref_ct [ct[1]] [1] [0];
- ++mv_mode_cts[1][0];
- }
- else
- {
- ++mv_ref_ct [ct[1]] [1] [1];
- ++mv_mode_cts[1][1];
-
- if (m == NEARMV)
- {
- ++mv_ref_ct [ct[2]] [2] [0];
- ++mv_mode_cts[2][0];
- }
- else
- {
- ++mv_ref_ct [ct[2]] [2] [1];
- ++mv_mode_cts[2][1];
-
- if (m == NEWMV)
- {
- ++mv_ref_ct [ct[3]] [3] [0];
- ++mv_mode_cts[3][0];
- }
- else
- {
- ++mv_ref_ct [ct[3]] [3] [1];
- ++mv_mode_cts[3][1];
- }
- }
- }
- }
-}
-
-#endif/* END MV ref count ENTROPY_STATS stats code */
-
-#endif
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
index 1c1441cc2..0a2b71c49 100644
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -9,9 +9,9 @@
;
- EXPORT |vp8_sub_pixel_variance16x16s_4_0_neon|
- EXPORT |vp8_sub_pixel_variance16x16s_0_4_neon|
- EXPORT |vp8_sub_pixel_variance16x16s_4_4_neon|
+ EXPORT |vp8_variance_halfpixvar16x16_h_neon|
+ EXPORT |vp8_variance_halfpixvar16x16_v_neon|
+ EXPORT |vp8_variance_halfpixvar16x16_hv_neon|
EXPORT |vp8_sub_pixel_variance16x16s_neon|
ARM
REQUIRE8
@@ -20,7 +20,7 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
;================================================
-;unsigned int vp8_sub_pixel_variance16x16s_4_0_neon
+;unsigned int vp8_variance_halfpixvar16x16_h_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
@@ -29,7 +29,7 @@
; unsigned int *sse
;);
;================================================
-|vp8_sub_pixel_variance16x16s_4_0_neon| PROC
+|vp8_variance_halfpixvar16x16_h_neon| PROC
push {lr}
mov r12, #4 ;loop counter
@@ -120,7 +120,7 @@ vp8_filt_fpo16x16s_4_0_loop_neon
ENDP
;================================================
-;unsigned int vp8_sub_pixel_variance16x16s_0_4_neon
+;unsigned int vp8_variance_halfpixvar16x16_v_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
@@ -129,7 +129,7 @@ vp8_filt_fpo16x16s_4_0_loop_neon
; unsigned int *sse
;);
;================================================
-|vp8_sub_pixel_variance16x16s_0_4_neon| PROC
+|vp8_variance_halfpixvar16x16_v_neon| PROC
push {lr}
mov r12, #4 ;loop counter
@@ -216,7 +216,7 @@ vp8_filt_spo16x16s_0_4_loop_neon
ENDP
;================================================
-;unsigned int vp8_sub_pixel_variance16x16s_4_4_neon
+;unsigned int vp8_variance_halfpixvar16x16_hv_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
@@ -225,7 +225,7 @@ vp8_filt_spo16x16s_0_4_loop_neon
; unsigned int *sse
;);
;================================================
-|vp8_sub_pixel_variance16x16s_4_4_neon| PROC
+|vp8_variance_halfpixvar16x16_hv_neon| PROC
push {lr}
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c
index 50f58bf08..65c616614 100644
--- a/vp8/encoder/arm/quantize_arm.c
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -29,7 +29,7 @@ extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, shor
void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
{
- d->eob = vp8_fast_quantize_b_neon_func(b->coeff, &b->zbin[0][0], d->qcoeff, d->dqcoeff, d->dequant[0], vp8_rvsplus1_default_zig_zag1d, &b->round[0][0], &b->quant[0][0]);
+ d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant);
}
/*
diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
index 859e43f51..0e5f62fcf 100644
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -30,6 +30,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_neon);
//extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c);
//extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_neon);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_neon);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_neon);
//extern prototype_getmbss(vp8_get_mb_ss_c);
extern prototype_variance(vp8_mse16x16_neon);
@@ -38,6 +41,7 @@ extern prototype_sad(vp8_get16x16pred_error_neon);
//extern prototype_variance2(vp8_get16x16var_c);
extern prototype_sad(vp8_get4x4sse_cs_neon);
+#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_sad4x4
#define vp8_variance_sad4x4 vp8_sad4x4_neon
@@ -83,6 +87,15 @@ extern prototype_sad(vp8_get4x4sse_cs_neon);
#undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_neon
+#undef vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_neon
+
+#undef vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_neon
+
+#undef vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_neon
+
//#undef vp8_variance_getmbss
//#define vp8_variance_getmbss vp8_get_mb_ss_c
@@ -100,6 +113,7 @@ extern prototype_sad(vp8_get4x4sse_cs_neon);
#undef vp8_variance_get4x4sse_cs
#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon
+#endif
#endif
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 929c17841..412542d10 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1490,9 +1490,11 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
if (xd->mode_ref_lf_delta_enabled)
{
// Do the deltas need to be updated
- vp8_write_bit(bc, (xd->mode_ref_lf_delta_update) ? 1 : 0);
+ int send_update = xd->mode_ref_lf_delta_update
+ || cpi->oxcf.error_resilient_mode;
- if (xd->mode_ref_lf_delta_update)
+ vp8_write_bit(bc, send_update);
+ if (send_update)
{
int Data;
@@ -1502,8 +1504,10 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
Data = xd->ref_lf_deltas[i];
// Frame level data
- if (Data)
+ if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]
+ || cpi->oxcf.error_resilient_mode)
{
+ xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
vp8_write_bit(bc, 1);
if (Data > 0)
@@ -1527,8 +1531,10 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
{
Data = xd->mode_lf_deltas[i];
- if (Data)
+ if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]
+ || cpi->oxcf.error_resilient_mode)
{
+ xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
vp8_write_bit(bc, 1);
if (Data > 0)
diff --git a/vp8/encoder/bitstream.h b/vp8/encoder/bitstream.h
index 559631338..f5d148ea4 100644
--- a/vp8/encoder/bitstream.h
+++ b/vp8/encoder/bitstream.h
@@ -12,25 +12,25 @@
#ifndef __INC_BITSTREAM_H
#define __INC_BITSTREAM_H
-#if HAVE_ARMV7
-void vp8cx_pack_tokens_armv7(vp8_writer *w, const TOKENEXTRA *p, int xcount,
+#if HAVE_ARMV5TE
+void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
vp8_token *,
vp8_extra_bit_struct *,
const vp8_tree_index *);
-void vp8cx_pack_tokens_into_partitions_armv7(VP8_COMP *, unsigned char *, int , int *,
+void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *, unsigned char *, int , int *,
vp8_token *,
vp8_extra_bit_struct *,
const vp8_tree_index *);
-void vp8cx_pack_mb_row_tokens_armv7(VP8_COMP *cpi, vp8_writer *w,
+void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
vp8_token *,
vp8_extra_bit_struct *,
const vp8_tree_index *);
# define pack_tokens(a,b,c) \
- vp8cx_pack_tokens_armv7(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+ vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
# define pack_tokens_into_partitions(a,b,c,d) \
- vp8cx_pack_tokens_into_partitions_armv7(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+ vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
# define pack_mb_row_tokens(a,b) \
- vp8cx_pack_mb_row_tokens_armv7(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+ vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
#else
# define pack_tokens(a,b,c) pack_tokens_c(a,b,c)
# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d)
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index ffb88904e..e94e54976 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -32,11 +32,11 @@ typedef struct
short *coeff;
// 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
- short(*quant)[4];
- short(*quant_shift)[4];
- short(*zbin)[4];
- short(*zrun_zbin_boost);
- short(*round)[4];
+ short *quant;
+ short *quant_shift;
+ short *zbin;
+ short *zrun_zbin_boost;
+ short *round;
// Zbin Over Quant value
short zbin_extra;
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index d8a76d5b5..85e121be3 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -160,7 +160,6 @@ static void vp8cx_invert_quant(short *quant, short *shift, short d)
void vp8cx_init_quantizer(VP8_COMP *cpi)
{
- int r, c;
int i;
int quant_val;
int Q;
@@ -171,58 +170,56 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
{
// dc values
quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
- vp8cx_invert_quant(cpi->Y1quant[Q][0] + 0,
- cpi->Y1quant_shift[Q][0] + 0, quant_val);
- cpi->Y1zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
- cpi->Y1round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
- cpi->common.Y1dequant[Q][0][0] = quant_val;
+ vp8cx_invert_quant(cpi->Y1quant[Q] + 0,
+ cpi->Y1quant_shift[Q] + 0, quant_val);
+ cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][0] = quant_val;
cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
- vp8cx_invert_quant(cpi->Y2quant[Q][0] + 0,
- cpi->Y2quant_shift[Q][0] + 0, quant_val);
- cpi->Y2zbin[Q][0][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
- cpi->Y2round[Q][0][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
- cpi->common.Y2dequant[Q][0][0] = quant_val;
+ vp8cx_invert_quant(cpi->Y2quant[Q] + 0,
+ cpi->Y2quant_shift[Q] + 0, quant_val);
+ cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][0] = quant_val;
cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
- vp8cx_invert_quant(cpi->UVquant[Q][0] + 0,
- cpi->UVquant_shift[Q][0] + 0, quant_val);
- cpi->UVzbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
- cpi->UVround[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
- cpi->common.UVdequant[Q][0][0] = quant_val;
+ vp8cx_invert_quant(cpi->UVquant[Q] + 0,
+ cpi->UVquant_shift[Q] + 0, quant_val);
+ cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
+ cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][0] = quant_val;
cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
// all the ac values = ;
for (i = 1; i < 16; i++)
{
int rc = vp8_default_zig_zag1d[i];
- r = (rc >> 2);
- c = (rc & 3);
quant_val = vp8_ac_yquant(Q);
- vp8cx_invert_quant(cpi->Y1quant[Q][r] + c,
- cpi->Y1quant_shift[Q][r] + c, quant_val);
- cpi->Y1zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
- cpi->Y1round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
- cpi->common.Y1dequant[Q][r][c] = quant_val;
+ vp8cx_invert_quant(cpi->Y1quant[Q] + rc,
+ cpi->Y1quant_shift[Q] + rc, quant_val);
+ cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][rc] = quant_val;
cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
- vp8cx_invert_quant(cpi->Y2quant[Q][r] + c,
- cpi->Y2quant_shift[Q][r] + c, quant_val);
- cpi->Y2zbin[Q][r][c] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
- cpi->Y2round[Q][r][c] = (qrounding_factors_y2[Q] * quant_val) >> 7;
- cpi->common.Y2dequant[Q][r][c] = quant_val;
+ vp8cx_invert_quant(cpi->Y2quant[Q] + rc,
+ cpi->Y2quant_shift[Q] + rc, quant_val);
+ cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][rc] = quant_val;
cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
- vp8cx_invert_quant(cpi->UVquant[Q][r] + c,
- cpi->UVquant_shift[Q][r] + c, quant_val);
- cpi->UVzbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
- cpi->UVround[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
- cpi->common.UVdequant[Q][r][c] = quant_val;
+ vp8cx_invert_quant(cpi->UVquant[Q] + rc,
+ cpi->UVquant_shift[Q] + rc, quant_val);
+ cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][rc] = quant_val;
cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
}
}
@@ -230,7 +227,6 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
#else
void vp8cx_init_quantizer(VP8_COMP *cpi)
{
- int r, c;
int i;
int quant_val;
int Q;
@@ -241,52 +237,50 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
{
// dc values
quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
- cpi->Y1quant[Q][0][0] = (1 << 16) / quant_val;
- cpi->Y1zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
- cpi->Y1round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
- cpi->common.Y1dequant[Q][0][0] = quant_val;
+ cpi->Y1quant[Q][0] = (1 << 16) / quant_val;
+ cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][0] = quant_val;
cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
- cpi->Y2quant[Q][0][0] = (1 << 16) / quant_val;
- cpi->Y2zbin[Q][0][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
- cpi->Y2round[Q][0][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
- cpi->common.Y2dequant[Q][0][0] = quant_val;
+ cpi->Y2quant[Q][0] = (1 << 16) / quant_val;
+ cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][0] = quant_val;
cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
- cpi->UVquant[Q][0][0] = (1 << 16) / quant_val;
- cpi->UVzbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
- cpi->UVround[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
- cpi->common.UVdequant[Q][0][0] = quant_val;
+ cpi->UVquant[Q][0] = (1 << 16) / quant_val;
+ cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
+ cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][0] = quant_val;
cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
// all the ac values = ;
for (i = 1; i < 16; i++)
{
int rc = vp8_default_zig_zag1d[i];
- r = (rc >> 2);
- c = (rc & 3);
quant_val = vp8_ac_yquant(Q);
- cpi->Y1quant[Q][r][c] = (1 << 16) / quant_val;
- cpi->Y1zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
- cpi->Y1round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
- cpi->common.Y1dequant[Q][r][c] = quant_val;
+ cpi->Y1quant[Q][rc] = (1 << 16) / quant_val;
+ cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][rc] = quant_val;
cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
- cpi->Y2quant[Q][r][c] = (1 << 16) / quant_val;
- cpi->Y2zbin[Q][r][c] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
- cpi->Y2round[Q][r][c] = (qrounding_factors_y2[Q] * quant_val) >> 7;
- cpi->common.Y2dequant[Q][r][c] = quant_val;
+ cpi->Y2quant[Q][rc] = (1 << 16) / quant_val;
+ cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][rc] = quant_val;
cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
- cpi->UVquant[Q][r][c] = (1 << 16) / quant_val;
- cpi->UVzbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
- cpi->UVround[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
- cpi->common.UVdequant[Q][r][c] = quant_val;
+ cpi->UVquant[Q][rc] = (1 << 16) / quant_val;
+ cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][rc] = quant_val;
cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
}
}
@@ -317,7 +311,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
QIndex = cpi->common.base_qindex;
// Y
- zbin_extra = (cpi->common.Y1dequant[QIndex][0][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
+ zbin_extra = (cpi->common.Y1dequant[QIndex][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
for (i = 0; i < 16; i++)
{
@@ -331,7 +325,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
}
// UV
- zbin_extra = (cpi->common.UVdequant[QIndex][0][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
+ zbin_extra = (cpi->common.UVdequant[QIndex][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
for (i = 16; i < 24; i++)
{
@@ -345,7 +339,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
}
// Y2
- zbin_extra = (cpi->common.Y2dequant[QIndex][0][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
+ zbin_extra = (cpi->common.Y2dequant[QIndex][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
x->block[24].quant = cpi->Y2quant[QIndex];
x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
x->block[24].zbin = cpi->Y2zbin[QIndex];
@@ -400,21 +394,31 @@ void encode_mb_row(VP8_COMP *cpi,
cpi->tplist[mb_row].start = *tp;
//printf("Main mb_row = %d\n", mb_row);
+ // Distance of Mb to the top & bottom edges, specified in 1/8th pel
+ // units as they are always compared to values that are in 1/8th pel units
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+ // Set up limit values for vertical motion vector components
+ // to prevent them extending beyond the UMV borders
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
+ + (VP8BORDERINPIXELS - 16);
+
// for each macroblock col in image
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
- // Distance of Mb to the various image edges.
- // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+ // Distance of Mb to the left & right edges, specified in
+ // 1/8th pel units as they are always compared to values
+ // that are in 1/8th pel units
xd->mb_to_left_edge = -((mb_col * 16) << 3);
xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
- xd->mb_to_top_edge = -((mb_row * 16) << 3);
- xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
- // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
+ // Set up limit values for horizontal motion vector components
+ // to prevent them extending beyond the UMV borders
x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
- x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
- x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
- x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
+ + (VP8BORDERINPIXELS - 16);
xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
@@ -545,31 +549,29 @@ void vp8_encode_frame(VP8_COMP *cpi)
int segment_counts[MAX_MB_SEGMENTS];
int totalrate;
- if (cm->frame_type != KEY_FRAME)
+ // Functions setup for all frame types so we can use MC in AltRef
+ if (cm->mcomp_filter_type == SIXTAP)
{
- if (cm->mcomp_filter_type == SIXTAP)
- {
- xd->subpixel_predict = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap4x4);
- xd->subpixel_predict8x4 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap8x4);
- xd->subpixel_predict8x8 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap8x8);
- xd->subpixel_predict16x16 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap16x16);
- }
- else
- {
- xd->subpixel_predict = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear4x4);
- xd->subpixel_predict8x4 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear8x4);
- xd->subpixel_predict8x8 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear8x8);
- xd->subpixel_predict16x16 = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear16x16);
- }
+ xd->subpixel_predict = SUBPIX_INVOKE(
+ &cpi->common.rtcd.subpix, sixtap4x4);
+ xd->subpixel_predict8x4 = SUBPIX_INVOKE(
+ &cpi->common.rtcd.subpix, sixtap8x4);
+ xd->subpixel_predict8x8 = SUBPIX_INVOKE(
+ &cpi->common.rtcd.subpix, sixtap8x8);
+ xd->subpixel_predict16x16 = SUBPIX_INVOKE(
+ &cpi->common.rtcd.subpix, sixtap16x16);
+ }
+ else
+ {
+ xd->subpixel_predict = SUBPIX_INVOKE(
+ &cpi->common.rtcd.subpix, bilinear4x4);
+ xd->subpixel_predict8x4 = SUBPIX_INVOKE(
+ &cpi->common.rtcd.subpix, bilinear8x4);
+ xd->subpixel_predict8x8 = SUBPIX_INVOKE(
+ &cpi->common.rtcd.subpix, bilinear8x8);
+ xd->subpixel_predict16x16 = SUBPIX_INVOKE(
+ &cpi->common.rtcd.subpix, bilinear16x16);
}
-
- //else // Key Frame
- //{
- // For key frames make sure the intra ref frame probability value
- // is set to "all intra"
- //cpi->prob_intra_coded = 255;
- //}
-
x->gf_active_ptr = (signed char *)cpi->gf_active_flags; // Point to base of GF active flags data structure
@@ -1063,8 +1065,6 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-
vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
rate += rateuv;
@@ -1131,8 +1131,6 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
else
Error4x4 = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-
if (Error4x4 < Error16x16)
{
x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
@@ -1229,8 +1227,6 @@ int vp8cx_encode_inter_macroblock
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
{
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-
vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
if (xd->mode_info_context->mbmi.mode == B_PRED)
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index af80857d2..1c72b90f1 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -53,8 +53,6 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK
x->quantize_b(be, b);
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff &= (!b->eob);
-
vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32);
RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
@@ -70,8 +68,6 @@ void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BL
x->quantize_b(be, b);
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff &= (!b->eob);
-
IDCT_INVOKE(&rtcd->common->idct, idct16)(b->dqcoeff, b->diff, 32);
RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
@@ -109,7 +105,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
#if !(CONFIG_REALTIME_ONLY)
#if 1
- if (x->optimize && x->rddiv > 1)
+ if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
vp8_optimize_mby(x, rtcd);
#endif
@@ -117,7 +113,8 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
- vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+ RECON_INVOKE(&rtcd->common->recon, recon_mby)
+ (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
// make sure block modes are set the way we want them for context updates
for (b = 0; b < 16; b++)
@@ -157,13 +154,12 @@ void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_transform_intra_mby(x);
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = 1;
-
vp8_quantize_mby(x);
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
- vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+ RECON_INVOKE(&rtcd->common->recon, recon_mby)
+ (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
// make sure block modes are set the way we want them for context updates
for (b = 0; b < 16; b++)
@@ -206,7 +202,7 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
#if !(CONFIG_REALTIME_ONLY)
#if 1
- if (x->optimize && x->rddiv > 1)
+ if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
vp8_optimize_mbuv(x, rtcd);
#endif
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index e10b5159a..043eac219 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -242,7 +242,20 @@ struct vp8_token_state{
short qc;
};
-void vp8_optimize_b(MACROBLOCK *mb, int i, int type,
+// TODO: experiments to find optimal multiple numbers
+#define Y1_RD_MULT 1
+#define UV_RD_MULT 1
+#define Y2_RD_MULT 4
+
+static const int plane_rd_mult[4]=
+{
+ Y1_RD_MULT,
+ Y2_RD_MULT,
+ UV_RD_MULT,
+ Y1_RD_MULT
+};
+
+void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
const VP8_ENCODER_RTCD *rtcd)
{
@@ -275,9 +288,11 @@ void vp8_optimize_b(MACROBLOCK *mb, int i, int type,
int best;
int band;
int pt;
+ int i;
+ int err_mult = plane_rd_mult[type];
- b = &mb->block[i];
- d = &mb->e_mbd.block[i];
+ b = &mb->block[ib];
+ d = &mb->e_mbd.block[ib];
/* Enable this to test the effect of RDO as a replacement for the dynamic
* zero bin instead of an augmentation of it.
@@ -286,8 +301,8 @@ void vp8_optimize_b(MACROBLOCK *mb, int i, int type,
vp8_strict_quantize_b(b, d);
#endif
- dequant_ptr = &d->dequant[0][0];
- coeff_ptr = &b->coeff[0];
+ dequant_ptr = d->dequant;
+ coeff_ptr = b->coeff;
qcoeff_ptr = d->qcoeff;
dqcoeff_ptr = d->dqcoeff;
i0 = !type;
@@ -295,7 +310,7 @@ void vp8_optimize_b(MACROBLOCK *mb, int i, int type,
/* Now set up a Viterbi trellis to evaluate alternative roundings. */
/* TODO: These should vary with the block type, since the quantizer does. */
- rdmult = mb->rdmult << 2;
+ rdmult = (mb->rdmult << 2)*err_mult;
rddiv = mb->rddiv;
best_mask[0] = best_mask[1] = 0;
/* Initialize the sentinel node of the trellis. */
@@ -523,42 +538,11 @@ void vp8_optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
}
- /*
if (has_2nd_order)
{
- vp8_setup_temp_context(&t, x->e_mbd.above_context[Y2CONTEXT],
- x->e_mbd.left_context[Y2CONTEXT], 1);
- vp8_optimize_b(x, 24, 1, t.a, t.l, rtcd);
- }
- */
-}
-
-
-
-static void vp8_find_mb_skip_coef(MACROBLOCK *x)
-{
- int i;
-
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = 1;
-
- if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
- {
- for (i = 0; i < 16; i++)
- {
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
- }
-
- for (i = 16; i < 25; i++)
- {
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
- }
- }
- else
- {
- for (i = 0; i < 24; i++)
- {
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
- }
+ b=24;
+ vp8_optimize_b(x, b, vp8_block2type[b],
+ ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
}
}
@@ -595,14 +579,13 @@ void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
}
- /*
+
if (has_2nd_order)
{
- vp8_setup_temp_context(&t, x->e_mbd.above_context[Y2CONTEXT],
- x->e_mbd.left_context[Y2CONTEXT], 1);
- vp8_optimize_b(x, 24, 1, t.a, t.l, rtcd);
+ b=24;
+ vp8_optimize_b(x, b, vp8_block2type[b],
+ ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
}
- */
}
void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
@@ -650,16 +633,14 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_quantize_mb(x);
#if !(CONFIG_REALTIME_ONLY)
- if (x->optimize && x->rddiv > 1)
- {
+ if (x->optimize==2 ||(x->optimize && x->rddiv > 1))
vp8_optimize_mb(x, rtcd);
- vp8_find_mb_skip_coef(x);
- }
#endif
vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
- vp8_recon16x16mb(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+ RECON_INVOKE(&rtcd->common->recon, recon_mb)
+ (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}
@@ -676,7 +657,8 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
- vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+ RECON_INVOKE(&rtcd->common->recon, recon_mby)
+ (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index a65bce6e1..8a94fa369 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -30,7 +30,6 @@
#include "encodemv.h"
//#define OUTPUT_FPF 1
-//#define FIRSTPASS_MM 1
#if CONFIG_RUNTIME_CPU_DETECT
#define IF_RTCD(x) (x)
@@ -108,15 +107,6 @@ static void reset_fpf_position(VP8_COMP *cpi, FIRSTPASS_STATS *Position)
static int lookup_next_frame_stats(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
{
- /*FIRSTPASS_STATS * start_pos;
- int ret_val;
-
- start_pos = cpi->stats_in;
- ret_val = vp8_input_stats(cpi, next_frame);
- reset_fpf_position(cpi, start_pos);
-
- return ret_val;*/
-
if (cpi->stats_in >= cpi->stats_in_end)
return EOF;
@@ -127,7 +117,7 @@ static int lookup_next_frame_stats(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
// Calculate a modified Error used in distributing bits between easier and harder frames
static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
- double av_err = cpi->total_stats.ssim_weighted_pred_err;
+ double av_err = cpi->total_stats->ssim_weighted_pred_err;
double this_err = this_frame->ssim_weighted_pred_err;
double modified_err;
@@ -238,7 +228,7 @@ int frame_max_bits(VP8_COMP *cpi)
else
{
// For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
- max_bits = (int)(((double)cpi->bits_left / (cpi->total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+ max_bits = (int)(((double)cpi->bits_left / (cpi->total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
}
// Trap case where we are out of bits
@@ -248,17 +238,35 @@ int frame_max_bits(VP8_COMP *cpi)
return max_bits;
}
-void vp8_output_stats(struct vpx_codec_pkt_list *pktlist,
+
+extern size_t vp8_firstpass_stats_sz(unsigned int mb_count)
+{
+ /* Calculate the size of a stats packet, which is dependent on the frame
+ * resolution. The FIRSTPASS_STATS struct has a single element array,
+ * motion_map, which is virtually expanded to have one element per
+ * macroblock.
+ */
+ size_t stats_sz;
+ FIRSTPASS_STATS stats;
+
+ stats_sz = sizeof(FIRSTPASS_STATS) + mb_count;
+ stats_sz = (stats_sz + 7) & ~7;
+ return stats_sz;
+}
+
+
+void vp8_output_stats(const VP8_COMP *cpi,
+ struct vpx_codec_pkt_list *pktlist,
FIRSTPASS_STATS *stats)
{
struct vpx_codec_cx_pkt pkt;
pkt.kind = VPX_CODEC_STATS_PKT;
pkt.data.twopass_stats.buf = stats;
- pkt.data.twopass_stats.sz = sizeof(*stats);
+ pkt.data.twopass_stats.sz = vp8_firstpass_stats_sz(cpi->common.MBs);
vpx_codec_pkt_list_add(pktlist, &pkt);
// TEMP debug code
-#ifdef OUTPUT_FPF
+#if OUTPUT_FPF
{
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
@@ -280,16 +288,24 @@ void vp8_output_stats(struct vpx_codec_pkt_list *pktlist,
stats->mv_in_out_count,
stats->count);
fclose(fpfile);
+
+
+ fpfile = fopen("fpmotionmap.stt", "a");
+ if(fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, fpfile));
+ fclose(fpfile);
}
#endif
}
int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
{
+ size_t stats_sz = vp8_firstpass_stats_sz(cpi->common.MBs);
+
if (cpi->stats_in >= cpi->stats_in_end)
return EOF;
- *fps = *cpi->stats_in++;
+ *fps = *cpi->stats_in;
+ cpi->stats_in = (void*)((char *)cpi->stats_in + stats_sz);
return 1;
}
@@ -352,76 +368,47 @@ void vp8_avg_stats(FIRSTPASS_STATS *section)
section->duration /= section->count;
}
-int vp8_fpmm_get_pos(VP8_COMP *cpi)
+unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi)
{
- return ftell(cpi->fp_motion_mapfile);
+ return cpi->fp_motion_map_stats;
}
-void vp8_fpmm_reset_pos(VP8_COMP *cpi, int target_pos)
+void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos)
{
int Offset;
- if (cpi->fp_motion_mapfile)
- {
- Offset = ftell(cpi->fp_motion_mapfile) - target_pos;
- fseek(cpi->fp_motion_mapfile, (int) - Offset, SEEK_CUR);
- }
+ cpi->fp_motion_map_stats = target_pos;
}
void vp8_advance_fpmm(VP8_COMP *cpi, int count)
{
-#ifdef FIRSTPASS_MM
- fseek(cpi->fp_motion_mapfile, (int)(count * cpi->common.MBs), SEEK_CUR);
-#endif
+ cpi->fp_motion_map_stats = (void*)((char*)cpi->fp_motion_map_stats +
+ count * vp8_firstpass_stats_sz(cpi->common.MBs));
}
-void vp8_input_fpmm(VP8_COMP *cpi, int count)
+void vp8_input_fpmm(VP8_COMP *cpi)
{
-#ifdef FIRSTPASS_MM
-
- unsigned char *tmp_motion_map;
- int i, j;
-
- if (!cpi->fp_motion_mapfile)
- return; // Error
-
- // Create the first pass motion map structure and set to 0
- CHECK_MEM_ERROR(tmp_motion_map, vpx_calloc(cpi->common.MBs, 1));
-
- // Reset the state of the global map
- vpx_memset(cpi->fp_motion_map, 0, cpi->common.MBs);
+ unsigned char *fpmm = cpi->fp_motion_map;
+ int MBs = cpi->common.MBs;
+ int max_frames = cpi->active_arnr_frames;
+ int i;
- // Read the specified number of frame maps and set the global map to the highest value seen for each mb.
- for (i = 0; i < count; i++)
+ for (i=0; i<max_frames; i++)
{
- if (fread(tmp_motion_map, 1, cpi->common.MBs, cpi->fp_motion_mapfile) == cpi->common.MBs)
- {
- for (j = 0; j < cpi->common.MBs; j++)
- {
- if (tmp_motion_map[j] > 1)
- cpi->fp_motion_map[j] += 5; // Intra is flagged
- else
- cpi->fp_motion_map[j] += tmp_motion_map[j];
- }
- }
- else
- break; // Read error
+ char *motion_map = (char*)cpi->fp_motion_map_stats
+ + sizeof(FIRSTPASS_STATS);
+ memcpy(fpmm, motion_map, MBs);
+ fpmm += MBs;
+ vp8_advance_fpmm(cpi, 1);
}
- if (tmp_motion_map != 0)
- vpx_free(tmp_motion_map);
-
-#endif
-
+ // Flag the use of weights in the temporal filter
+ cpi->use_weighted_temporal_filter = 1;
}
void vp8_init_first_pass(VP8_COMP *cpi)
{
- vp8_zero_stats(&cpi->total_stats);
-
-#ifdef FIRSTPASS_MM
- cpi->fp_motion_mapfile = fopen("fpmotionmap.stt", "wb");
-#endif
+ vp8_zero_stats(cpi->total_stats);
// TEMP debug code
#ifdef OUTPUT_FPF
@@ -429,6 +416,8 @@ void vp8_init_first_pass(VP8_COMP *cpi)
FILE *fpfile;
fpfile = fopen("firstpass.stt", "w");
fclose(fpfile);
+ fpfile = fopen("fpmotionmap.stt", "wb");
+ fclose(fpfile);
}
#endif
@@ -436,16 +425,10 @@ void vp8_init_first_pass(VP8_COMP *cpi)
void vp8_end_first_pass(VP8_COMP *cpi)
{
- vp8_output_stats(cpi->output_pkt_list, &cpi->total_stats);
-
-#ifdef FIRSTPASS_MM
-
- if (cpi->fp_motion_mapfile)
- fclose(cpi->fp_motion_mapfile);
+ vp8_output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
+}
-#endif
-}
void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
{
MACROBLOCKD * const xd = & x->e_mbd;
@@ -479,12 +462,11 @@ void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *
int step_param = 3; //3; // Dont search over full range for first pass
int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; //3;
int n;
- vp8_variance_fn_ptr_t v_fn_ptr;
+ vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
int new_mv_mode_penalty = 256;
+ // override the default variance function to use MSE
v_fn_ptr.vf = VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16);
- v_fn_ptr.sdf = cpi->fn_ptr.sdf;
- v_fn_ptr.sdx4df = cpi->fn_ptr.sdx4df;
// Set up pointers for this macro block recon buffer
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
@@ -603,6 +585,8 @@ void vp8_first_pass(VP8_COMP *cpi)
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int this_error;
+ int zero_error;
+ int zz_to_best_ratio;
int gf_motion_error = INT_MAX;
int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
@@ -624,7 +608,7 @@ void vp8_first_pass(VP8_COMP *cpi)
intra_error += this_error;
// Indicate default assumption of intra in the motion map
- *fp_motion_map_ptr = 2;
+ *fp_motion_map_ptr = 0;
// Set up limit values for motion vectors to prevent them extending outside the UMV borders
x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
@@ -646,6 +630,9 @@ void vp8_first_pass(VP8_COMP *cpi)
d->bmi.mv.as_mv.row = 0;
d->bmi.mv.as_mv.col = 0;
+ // Save (0,0) error for later use
+ zero_error = motion_error;
+
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search
vp8_first_pass_motion_search(cpi, x, &best_ref_mv,
@@ -719,8 +706,6 @@ void vp8_first_pass(VP8_COMP *cpi)
{
mvcount++;
- *fp_motion_map_ptr = 1;
-
// Does the Row vector point inwards or outwards
if (mb_row < cm->mb_rows / 2)
{
@@ -752,12 +737,30 @@ void vp8_first_pass(VP8_COMP *cpi)
else if (d->bmi.mv.as_mv.col < 0)
sum_in_vectors--;
}
+
+ // Compute how close (0,0) predictor is to best
+ // predictor in terms of their prediction error
+ zz_to_best_ratio = (10*zero_error + this_error/2)
+ / (this_error+!this_error);
+
+ if ((zero_error < 50000) &&
+ (zz_to_best_ratio <= 11) )
+ *fp_motion_map_ptr = 1;
+ else
+ *fp_motion_map_ptr = 0;
}
else
- *fp_motion_map_ptr = 0; // 0,0 mv was best
+ {
+ // 0,0 mv was best
+ if( zero_error<50000 )
+ *fp_motion_map_ptr = 2;
+ else
+ *fp_motion_map_ptr = 1;
+ }
}
else
{
+ // Intra was best
best_ref_mv.row = 0;
best_ref_mv.col = 0;
}
@@ -835,19 +838,20 @@ void vp8_first_pass(VP8_COMP *cpi)
fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp;
// don't want to do outputstats with a stack variable!
- cpi->this_frame_stats = fps;
- vp8_output_stats(cpi->output_pkt_list, &cpi->this_frame_stats);
- vp8_accumulate_stats(&cpi->total_stats, &fps);
-
-#ifdef FIRSTPASS_MM
- fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, cpi->fp_motion_mapfile);
-#endif
+ memcpy(cpi->this_frame_stats,
+ &fps,
+ sizeof(FIRSTPASS_STATS));
+ memcpy((char*)cpi->this_frame_stats + sizeof(FIRSTPASS_STATS),
+ cpi->fp_motion_map,
+ sizeof(cpi->fp_motion_map[0]) * cpi->common.MBs);
+ vp8_output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats);
+ vp8_accumulate_stats(cpi->total_stats, &fps);
}
// Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met
if ((cm->current_video_frame > 0) &&
- (cpi->this_frame_stats.pcnt_inter > 0.20) &&
- ((cpi->this_frame_stats.intra_error / cpi->this_frame_stats.coded_error) > 2.0))
+ (cpi->this_frame_stats->pcnt_inter > 0.20) &&
+ ((cpi->this_frame_stats->intra_error / cpi->this_frame_stats->coded_error) > 2.0))
{
vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
}
@@ -875,7 +879,7 @@ void vp8_first_pass(VP8_COMP *cpi)
else
recon_file = fopen(filename, "ab");
- fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
+ if(fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file));
fclose(recon_file);
}
@@ -1116,33 +1120,33 @@ void vp8_init_second_pass(VP8_COMP *cpi)
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
- vp8_zero_stats(&cpi->total_stats);
+ vp8_zero_stats(cpi->total_stats);
if (!cpi->stats_in_end)
return;
- cpi->total_stats = *cpi->stats_in_end;
+ *cpi->total_stats = *cpi->stats_in_end;
- cpi->total_error_left = cpi->total_stats.ssim_weighted_pred_err;
- cpi->total_intra_error_left = cpi->total_stats.intra_error;
- cpi->total_coded_error_left = cpi->total_stats.coded_error;
+ cpi->total_error_left = cpi->total_stats->ssim_weighted_pred_err;
+ cpi->total_intra_error_left = cpi->total_stats->intra_error;
+ cpi->total_coded_error_left = cpi->total_stats->coded_error;
cpi->start_tot_err_left = cpi->total_error_left;
- //cpi->bits_left = (long long)(cpi->total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
- //cpi->bits_left -= (long long)(cpi->total_stats.count * two_pass_min_rate / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+ //cpi->bits_left = (long long)(cpi->total_stats->count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+ //cpi->bits_left -= (long long)(cpi->total_stats->count * two_pass_min_rate / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
// each frame can have a different duration, as the frame rate in the source
// isn't guaranteed to be constant. The frame rate prior to the first frame
// encoded in the second pass is a guess. However the sum duration is not.
// Its calculated based on the actual durations of all frames from the first
// pass.
- vp8_new_frame_rate(cpi, 10000000.0 * cpi->total_stats.count / cpi->total_stats.duration);
+ vp8_new_frame_rate(cpi, 10000000.0 * cpi->total_stats->count / cpi->total_stats->duration);
cpi->output_frame_rate = cpi->oxcf.frame_rate;
- cpi->bits_left = (long long)(cpi->total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
- cpi->bits_left -= (long long)(cpi->total_stats.duration * two_pass_min_rate / 10000000.0);
+ cpi->bits_left = (long long)(cpi->total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
+ cpi->bits_left -= (long long)(cpi->total_stats->duration * two_pass_min_rate / 10000000.0);
- vp8_avg_stats(&cpi->total_stats);
+ vp8_avg_stats(cpi->total_stats);
// Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
{
@@ -1158,7 +1162,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
sum_iiratio += IIRatio;
}
- cpi->avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->total_stats.count);
+ cpi->avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->total_stats->count);
// Reset file position
reset_fpf_position(cpi, start_pos);
@@ -1180,21 +1184,11 @@ void vp8_init_second_pass(VP8_COMP *cpi)
}
-#ifdef FIRSTPASS_MM
- cpi->fp_motion_mapfile = 0;
- cpi->fp_motion_mapfile = fopen("fpmotionmap.stt", "rb");
-#endif
-
+ cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in;
}
void vp8_end_second_pass(VP8_COMP *cpi)
{
-#ifdef FIRSTPASS_MM
-
- if (cpi->fp_motion_mapfile)
- fclose(cpi->fp_motion_mapfile);
-
-#endif
}
// Analyse and define a gf/arf group .
@@ -1214,10 +1208,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double mv_accumulator_rabs = 0.0;
double mv_accumulator_cabs = 0.0;
- double this_mv_rabs;
- double this_mv_cabs;
double mv_ratio_accumulator = 0.0;
- double distance_factor = 0.0;
double decay_accumulator = 1.0;
double boost_factor = IIFACTOR;
@@ -1230,18 +1221,14 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
int max_bits = frame_max_bits(cpi); // Max for a single frame
-#ifdef FIRSTPASS_MM
- int fpmm_pos;
-#endif
+ unsigned char *fpmm_pos;
cpi->gf_group_bits = 0;
cpi->gf_decay_rate = 0;
vp8_clear_system_state(); //__asm emms;
-#ifdef FIRSTPASS_MM
fpmm_pos = vp8_fpmm_get_pos(cpi);
-#endif
start_pos = cpi->stats_in;
@@ -1266,9 +1253,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
while (((i < cpi->max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key))
{
double r;
- double motion_factor;
double this_frame_mvr_ratio;
double this_frame_mvc_ratio;
+ double motion_decay;
+ double motion_pct = next_frame.pcnt_motion;
i++; // Increment the loop counter
@@ -1283,12 +1271,8 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
break;
// Accumulate motion stats.
- motion_factor = next_frame.pcnt_motion;
- this_mv_rabs = fabs(next_frame.mvr_abs * motion_factor);
- this_mv_cabs = fabs(next_frame.mvc_abs * motion_factor);
-
- mv_accumulator_rabs += fabs(next_frame.mvr_abs * motion_factor);
- mv_accumulator_cabs += fabs(next_frame.mvc_abs * motion_factor);
+ mv_accumulator_rabs += fabs(next_frame.mvr_abs * motion_pct);
+ mv_accumulator_cabs += fabs(next_frame.mvc_abs * motion_pct);
//Accumulate Motion In/Out of frame stats
this_frame_mv_in_out = next_frame.mv_in_out_count * next_frame.pcnt_motion;
@@ -1296,13 +1280,23 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
abs_mv_in_out_accumulator += fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
// If there is a significant amount of motion
- if (motion_factor > 0.05)
+ if (motion_pct > 0.05)
{
- this_frame_mvr_ratio = fabs(next_frame.mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(next_frame.MVr));
- this_frame_mvc_ratio = fabs(next_frame.mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(next_frame.MVc));
+ this_frame_mvr_ratio = fabs(next_frame.mvr_abs) /
+ DOUBLE_DIVIDE_CHECK(fabs(next_frame.MVr));
+
+ this_frame_mvc_ratio = fabs(next_frame.mvc_abs) /
+ DOUBLE_DIVIDE_CHECK(fabs(next_frame.MVc));
- mv_ratio_accumulator += (this_frame_mvr_ratio < next_frame.mvr_abs) ? (this_frame_mvr_ratio * motion_factor) : next_frame.mvr_abs * motion_factor;
- mv_ratio_accumulator += (this_frame_mvc_ratio < next_frame.mvc_abs) ? (this_frame_mvc_ratio * motion_factor) : next_frame.mvc_abs * motion_factor;
+ mv_ratio_accumulator +=
+ (this_frame_mvr_ratio < next_frame.mvr_abs)
+ ? (this_frame_mvr_ratio * motion_pct)
+ : next_frame.mvr_abs * motion_pct;
+
+ mv_ratio_accumulator +=
+ (this_frame_mvc_ratio < next_frame.mvc_abs)
+ ? (this_frame_mvc_ratio * motion_pct)
+ : next_frame.mvc_abs * motion_pct;
}
else
{
@@ -1330,14 +1324,26 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
loop_decay_rate = next_frame.pcnt_inter;
// High % motion -> somewhat higher decay rate
- if ((1.0 - (next_frame.pcnt_motion / 10.0)) < loop_decay_rate)
- loop_decay_rate = (1.0 - (next_frame.pcnt_motion / 10.0));
-
- distance_factor = sqrt((this_mv_rabs * this_mv_rabs) + (this_mv_cabs * this_mv_cabs)) / 300.0;
- distance_factor = ((distance_factor > 1.0) ? 0.0 : (1.0 - distance_factor));
+ motion_decay = (1.0 - (motion_pct / 20.0));
+ if (motion_decay < loop_decay_rate)
+ loop_decay_rate = motion_decay;
- if (distance_factor < loop_decay_rate)
- loop_decay_rate = distance_factor;
+ // Adjustment to decay rate based on speed of motion
+ {
+ double this_mv_rabs;
+ double this_mv_cabs;
+ double distance_factor;
+
+ this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
+ this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
+
+ distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
+ (this_mv_cabs * this_mv_cabs)) / 250.0;
+ distance_factor = ((distance_factor > 1.0)
+ ? 0.0 : (1.0 - distance_factor));
+ if (distance_factor < loop_decay_rate)
+ loop_decay_rate = distance_factor;
+ }
// Cumulative effect of decay
decay_accumulator = decay_accumulator * loop_decay_rate;
@@ -1452,6 +1458,11 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames.
if (tmp_q < cpi->worst_quality)
{
+ int half_gf_int;
+ int frames_after_arf;
+ int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+ int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+
cpi->source_alt_ref_pending = TRUE;
// For alt ref frames the error score for the end frame of the group (the alt ref frame) should not contribute to the group total and hence
@@ -1462,22 +1473,63 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// The future frame itself is part of the next group
cpi->baseline_gf_interval = i - 1;
-#ifdef FIRSTPASS_MM
- // Read through the motion map to load up the entry for the ARF
+ // Define the arnr filter width for this group of frames:
+ // We only filter frames that lie within a distance of half
+ // the GF interval from the ARF frame. We also have to trap
+ // cases where the filter extends beyond the end of clip.
+ // Note: this_frame->frame has been updated in the loop
+ // so it now points at the ARF frame.
+ half_gf_int = cpi->baseline_gf_interval >> 1;
+ frames_after_arf = cpi->total_stats->count - this_frame->frame - 1;
+
+ switch (cpi->oxcf.arnr_type)
{
- int j;
+ case 1: // Backward filter
+ frames_fwd = 0;
+ if (frames_bwd > half_gf_int)
+ frames_bwd = half_gf_int;
+ break;
+
+ case 2: // Forward filter
+ if (frames_fwd > half_gf_int)
+ frames_fwd = half_gf_int;
+ if (frames_fwd > frames_after_arf)
+ frames_fwd = frames_after_arf;
+ frames_bwd = 0;
+ break;
+
+ case 3: // Centered filter
+ default:
+ frames_fwd >>= 1;
+ if (frames_fwd > frames_after_arf)
+ frames_fwd = frames_after_arf;
+ if (frames_fwd > half_gf_int)
+ frames_fwd = half_gf_int;
+
+ frames_bwd = frames_fwd;
+
+ // For even length filter there is one more frame backward
+ // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+ if (frames_bwd < half_gf_int)
+ frames_bwd += (cpi->oxcf.arnr_max_frames+1) & 0x1;
+ break;
+ }
+
+ cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
- // Advance to the region of interest
- // Current default 2 frames before to 2 frames after the ARF frame itsef
- vp8_fpmm_reset_pos(cpi, cpi->fpmm_pos);
+ {
+ // Advance to & read in the motion map for those frames
+ // to be considered for filtering based on the position
+ // of the ARF
+ vp8_fpmm_reset_pos(cpi, cpi->fp_motion_map_stats_save);
- for (j = 0; j < cpi->baseline_gf_interval - 2; j++)
- vp8_advance_fpmm(cpi, 1);
+ // Position at the 'earliest' frame to be filtered
+ vp8_advance_fpmm(cpi,
+ cpi->baseline_gf_interval - frames_bwd);
// Read / create a motion map for the region of interest
- vp8_input_fpmm(cpi, 5);
+ vp8_input_fpmm(cpi);
}
-#endif
}
else
{
@@ -1513,7 +1565,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
// Now decide how many bits should be allocated to the GF group as a proportion of those remaining in the kf group.
// The final key frame group in the clip is treated as a special case where cpi->kf_group_bits is tied to cpi->bits_left.
// This is also important for short clips where there may only be one key frame.
- if (cpi->frames_to_key >= (int)(cpi->total_stats.count - cpi->common.current_video_frame))
+ if (cpi->frames_to_key >= (int)(cpi->total_stats->count - cpi->common.current_video_frame))
{
cpi->kf_group_bits = (cpi->bits_left > 0) ? cpi->bits_left : 0;
}
@@ -1713,10 +1765,8 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
reset_fpf_position(cpi, start_pos);
}
-#ifdef FIRSTPASS_MM
// Reset the First pass motion map file position
vp8_fpmm_reset_pos(cpi, fpmm_pos);
-#endif
}
// Allocate bits to a normal frame that is neither a gf an arf or a key frame.
@@ -1730,7 +1780,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
int max_bits = frame_max_bits(cpi); // Max for a single frame
// The final few frames have special treatment
- if (cpi->frames_till_gf_update_due >= (int)(cpi->total_stats.count - cpi->common.current_video_frame))
+ if (cpi->frames_till_gf_update_due >= (int)(cpi->total_stats->count - cpi->common.current_video_frame))
{
cpi->gf_group_bits = (cpi->bits_left > 0) ? cpi->bits_left : 0;;
}
@@ -1775,7 +1825,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
void vp8_second_pass(VP8_COMP *cpi)
{
int tmp_q;
- int frames_left = (int)(cpi->total_stats.count - cpi->common.current_video_frame);
+ int frames_left = (int)(cpi->total_stats->count - cpi->common.current_video_frame);
FIRSTPASS_STATS this_frame;
FIRSTPASS_STATS this_frame_copy;
@@ -1798,11 +1848,12 @@ void vp8_second_pass(VP8_COMP *cpi)
if (EOF == vp8_input_stats(cpi, &this_frame))
return;
-#ifdef FIRSTPASS_MM
- vpx_memset(cpi->fp_motion_map, 0, cpi->common.MBs);
- cpi->fpmm_pos = vp8_fpmm_get_pos(cpi);
- vp8_advance_fpmm(cpi, 1); // Read this frame's first pass motion map
-#endif
+ vpx_memset(cpi->fp_motion_map, 0,
+ cpi->oxcf.arnr_max_frames*cpi->common.MBs);
+ cpi->fp_motion_map_stats_save = vp8_fpmm_get_pos(cpi);
+
+ // Step over this frame's first pass motion map
+ vp8_advance_fpmm(cpi, 1);
this_frame_error = this_frame.ssim_weighted_pred_err;
this_frame_intra_error = this_frame.intra_error;
@@ -2226,6 +2277,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
for (i = 0 ; i < cpi->frames_to_key ; i++)
{
double r;
+ double motion_decay;
+ double motion_pct = next_frame.pcnt_motion;
if (EOF == vp8_input_stats(cpi, &next_frame))
break;
@@ -2239,10 +2292,30 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
//if ( next_frame.pcnt_inter < loop_decay_rate )
loop_decay_rate = next_frame.pcnt_inter;
- if ((1.0 - (next_frame.pcnt_motion / 10.0)) < loop_decay_rate)
- loop_decay_rate = (1.0 - (next_frame.pcnt_motion / 10.0));
+ // High % motion -> somewhat higher decay rate
+ motion_decay = (1.0 - (motion_pct / 20.0));
+ if (motion_decay < loop_decay_rate)
+ loop_decay_rate = motion_decay;
+
+ // Adjustment to decay rate based on speed of motion
+ {
+ double this_mv_rabs;
+ double this_mv_cabs;
+ double distance_factor;
+
+ this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
+ this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
+
+ distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
+ (this_mv_cabs * this_mv_cabs)) / 250.0;
+ distance_factor = ((distance_factor > 1.0)
+ ? 0.0 : (1.0 - distance_factor));
+ if (distance_factor < loop_decay_rate)
+ loop_decay_rate = distance_factor;
+ }
decay_accumulator = decay_accumulator * loop_decay_rate;
+ decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
boost_score += (decay_accumulator * r);
@@ -2469,7 +2542,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
cpi->common.vert_scale = NORMAL;
// Calculate Average bits per frame.
- //av_bits_per_frame = cpi->bits_left/(double)(cpi->total_stats.count - cpi->common.current_video_frame);
+ //av_bits_per_frame = cpi->bits_left/(double)(cpi->total_stats->count - cpi->common.current_video_frame);
av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate);
//if ( av_bits_per_frame < 0.0 )
// av_bits_per_frame = 0.0
@@ -2513,7 +2586,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (0)
{
FILE *f = fopen("Subsamle.stt", "a");
- fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->frames_to_key, cpi->kf_group_bits / cpi->frames_to_key, new_height, new_width);
+ fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->frames_to_key, (int)(cpi->kf_group_bits / cpi->frames_to_key), new_height, new_width);
fclose(f);
}
@@ -2532,7 +2605,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
}
else
{
- long long clip_bits = (long long)(cpi->total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+ long long clip_bits = (long long)(cpi->total_stats->count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
long long over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
long long over_spend2 = cpi->oxcf.starting_buffer_level - projected_buffer_level;
@@ -2571,7 +2644,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (0)
{
FILE *f = fopen("Subsamle.stt", "a");
- fprintf(f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->frames_to_key, cpi->kf_group_bits / cpi->frames_to_key, new_height, new_width);
+ fprintf(f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->frames_to_key, (int)(cpi->kf_group_bits / cpi->frames_to_key), new_height, new_width);
fclose(f);
}
}
diff --git a/vp8/encoder/firstpass.h b/vp8/encoder/firstpass.h
index c7f3e0e45..95e1e5463 100644
--- a/vp8/encoder/firstpass.h
+++ b/vp8/encoder/firstpass.h
@@ -20,4 +20,5 @@ extern void vp8_init_second_pass(VP8_COMP *cpi);
extern void vp8_second_pass(VP8_COMP *cpi);
extern void vp8_end_second_pass(VP8_COMP *cpi);
+extern size_t vp8_firstpass_stats_sz(unsigned int mb_count);
#endif
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 1acb73d9c..824af5e46 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -15,6 +15,7 @@
void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
+void vp8_arch_arm_encoder_init(VP8_COMP *cpi);
void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
@@ -39,6 +40,12 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c;
cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c;
+ cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_c;
+ cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_c;
+ cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_c;
+ cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_c;
+ cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_c;
+
cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c;
cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c;
cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c;
@@ -56,6 +63,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;
+ cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c;
+ cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c;
+ cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c;
cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_c;
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
@@ -94,4 +104,8 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
vp8_arch_x86_encoder_init(cpi);
#endif
+#if ARCH_ARM
+ vp8_arch_arm_encoder_init(cpi);
+#endif
+
}
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index b89354eaa..bb85afa6f 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -186,7 +186,7 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
#define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motionvector
#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
-#define DIST(r,c) svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
+#define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
@@ -195,7 +195,7 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
-int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
{
unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
unsigned char *z = (*(b->base_src) + b->src);
@@ -220,7 +220,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
bestmv->col <<= 3;
// calculate central point error
- besterr = vf(y, d->pre_stride, z, b->src_stride, &sse);
+ besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
// TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
@@ -309,7 +309,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
#undef CHECK_BETTER
#undef MIN
#undef MAX
-int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
{
int bestmse = INT_MAX;
MV startmv;
@@ -336,13 +336,13 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
startmv = *bestmv;
// calculate central point error
- bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+ bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
// go left then right and check error
this_mv.row = startmv.row;
this_mv.col = ((startmv.col - 8) | 4);
- left = svf(y - 1, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+ left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse)
@@ -352,7 +352,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
}
this_mv.col += 8;
- right = svf(y, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+ right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse)
@@ -364,7 +364,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
// go up then down and check error
this_mv.col = startmv.col;
this_mv.row = ((startmv.row - 8) | 4);
- up = svf(y - d->pre_stride, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+ up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse)
@@ -374,7 +374,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
}
this_mv.row += 8;
- down = svf(y, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+ down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse)
@@ -386,10 +386,6 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
// now check 1 more diagonal
whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
- // whichdir must be 0-4. Therefore, one of the cases below
- // must run through. However, because there is no default
- // and diag is not set elsewhere, we get a compile warning
- diag = 0;
//for(whichdir =0;whichdir<4;whichdir++)
//{
this_mv = startmv;
@@ -399,22 +395,22 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
case 0:
this_mv.col = (this_mv.col - 8) | 4;
this_mv.row = (this_mv.row - 8) | 4;
- diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
break;
case 1:
this_mv.col += 4;
this_mv.row = (this_mv.row - 8) | 4;
- diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
break;
case 2:
this_mv.col = (this_mv.col - 8) | 4;
this_mv.row += 4;
- diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
break;
case 3:
this_mv.col += 4;
this_mv.row += 4;
- diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
break;
}
@@ -446,12 +442,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
if (startmv.col & 7)
{
this_mv.col = startmv.col - 2;
- left = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ left = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
}
else
{
this_mv.col = (startmv.col - 8) | 6;
- left = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+ left = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
}
left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
@@ -463,7 +459,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
}
this_mv.col += 4;
- right = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ right = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse)
@@ -478,12 +474,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
if (startmv.row & 7)
{
this_mv.row = startmv.row - 2;
- up = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ up = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
}
else
{
this_mv.row = (startmv.row - 8) | 6;
- up = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+ up = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
}
up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
@@ -495,7 +491,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
}
this_mv.row += 4;
- down = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ down = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse)
@@ -523,12 +519,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
if (startmv.col & 7)
{
this_mv.col -= 2;
- diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
}
else
{
this_mv.col = (startmv.col - 8) | 6;
- diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
+ diag = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
}
}
else
@@ -538,12 +534,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
if (startmv.col & 7)
{
this_mv.col -= 2;
- diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+ diag = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
}
else
{
this_mv.col = (startmv.col - 8) | 6;
- diag = svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
+ diag = vfp->svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
}
}
@@ -554,12 +550,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
if (startmv.row & 7)
{
this_mv.row -= 2;
- diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
}
else
{
this_mv.row = (startmv.row - 8) | 6;
- diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+ diag = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
}
break;
@@ -569,19 +565,19 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
if (startmv.col & 7)
{
this_mv.col -= 2;
- diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
}
else
{
this_mv.col = (startmv.col - 8) | 6;
- diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
+ diag = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
}
break;
case 3:
this_mv.col += 2;
this_mv.row += 2;
- diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+ diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
break;
}
@@ -598,7 +594,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
return bestmse;
}
-int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
{
int bestmse = INT_MAX;
MV startmv;
@@ -623,13 +619,13 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
startmv = *bestmv;
// calculate central point error
- bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+ bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
// go left then right and check error
this_mv.row = startmv.row;
this_mv.col = ((startmv.col - 8) | 4);
- left = svf(y - 1, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+ left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse)
@@ -639,7 +635,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
}
this_mv.col += 8;
- right = svf(y, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+ right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse)
@@ -651,7 +647,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
// go up then down and check error
this_mv.col = startmv.col;
this_mv.row = ((startmv.row - 8) | 4);
- up = svf(y - d->pre_stride, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+ up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse)
@@ -661,7 +657,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
}
this_mv.row += 8;
- down = svf(y, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+ down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse)
@@ -681,22 +677,22 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
case 0:
this_mv.col = (this_mv.col - 8) | 4;
this_mv.row = (this_mv.row - 8) | 4;
- diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
break;
case 1:
this_mv.col += 4;
this_mv.row = (this_mv.row - 8) | 4;
- diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
break;
case 2:
this_mv.col = (this_mv.col - 8) | 4;
this_mv.row += 4;
- diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
break;
case 3:
this_mv.col += 4;
this_mv.row += 4;
- diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
break;
}
@@ -711,7 +707,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
#else
this_mv.col = (this_mv.col - 8) | 4;
this_mv.row = (this_mv.row - 8) | 4;
- diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
@@ -721,7 +717,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
}
this_mv.col += 8;
- diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
@@ -732,7 +728,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
this_mv.col = (this_mv.col - 8) | 4;
this_mv.row = startmv.row + 4;
- diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
@@ -742,7 +738,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
}
this_mv.col += 8;
- diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+ diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (diag < bestmse)
@@ -758,10 +754,18 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motionvector
-#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
+#define DIST(r,c,v) vfp->sdf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
-
+static const MV next_chkpts[6][3] =
+{
+ {{ -2, 0}, { -1, -2}, {1, -2}},
+ {{ -1, -2}, {1, -2}, {2, 0}},
+ {{1, -2}, {2, 0}, {1, 2}},
+ {{2, 0}, {1, 2}, { -1, 2}},
+ {{1, 2}, { -1, 2}, { -2, 0}},
+ {{ -1, 2}, { -2, 0}, { -1, -2}}
+};
int vp8_hex_search
(
MACROBLOCK *x,
@@ -772,44 +776,72 @@ int vp8_hex_search
int search_param,
int error_per_bit,
int *num00,
- vp8_variance_fn_t vf,
- vp8_sad_fn_t sf,
+ const vp8_variance_fn_ptr_t *vfp,
int *mvsadcost[2],
int *mvcost[2]
)
{
- MV hex[6] = { { -2, 0}, { -1, -2}, { -1, 2}, {2, 0}, {1, 2}, {1, -2} } ;
+ MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
int i, j;
unsigned char *src = (*(b->base_src) + b->src);
int src_stride = b->src_stride;
- int rr = ref_mv->row, rc = ref_mv->col, br = rr, bc = rc, tr, tc;
+ int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
unsigned int besterr, thiserr = 0x7fffffff;
+ int k = -1, tk;
- if (rc < x->mv_col_min) bc = x->mv_col_min;
+ if (bc < x->mv_col_min) bc = x->mv_col_min;
- if (rc > x->mv_col_max) bc = x->mv_col_max;
+ if (bc > x->mv_col_max) bc = x->mv_col_max;
- if (rr < x->mv_row_min) br = x->mv_row_min;
+ if (br < x->mv_row_min) br = x->mv_row_min;
- if (rr > x->mv_row_max) br = x->mv_row_max;
+ if (br > x->mv_row_max) br = x->mv_row_max;
rr >>= 1;
rc >>= 1;
- br >>= 3;
- bc >>= 3;
besterr = ERR(br, bc, thiserr);
- // hex search jbb changed to 127 to avoid max 256 problem steping by 2.
- for (j = 0; j < 127; j++)
+ // hex search
+ //j=0
+ tr = br;
+ tc = bc;
+
+ for (i = 0; i < 6; i++)
+ {
+ int nr = tr + hex[i].row, nc = tc + hex[i].col;
+
+ if (nc < x->mv_col_min) continue;
+
+ if (nc > x->mv_col_max) continue;
+
+ if (nr < x->mv_row_min) continue;
+
+ if (nr > x->mv_row_max) continue;
+
+ //CHECK_BETTER(thiserr,nr,nc);
+ if ((thiserr = ERR(nr, nc, besterr)) < besterr)
+ {
+ besterr = thiserr;
+ br = nr;
+ bc = nc;
+ k = i;
+ }
+ }
+
+ if (tr == br && tc == bc)
+ goto cal_neighbors;
+
+ for (j = 1; j < 127; j++)
{
tr = br;
tc = bc;
+ tk = k;
- for (i = 0; i < 6; i++)
+ for (i = 0; i < 3; i++)
{
- int nr = tr + hex[i].row, nc = tc + hex[i].col;
+ int nr = tr + next_chkpts[tk][i].row, nc = tc + next_chkpts[tk][i].col;
if (nc < x->mv_col_min) continue;
@@ -819,7 +851,17 @@ int vp8_hex_search
if (nr > x->mv_row_max) continue;
- CHECK_BETTER(thiserr, nr, nc);
+ //CHECK_BETTER(thiserr,nr,nc);
+ if ((thiserr = ERR(nr, nc, besterr)) < besterr)
+ {
+ besterr = thiserr;
+ br = nr;
+ bc = nc; //k=(tk+5+i)%6;}
+ k = tk + 5 + i;
+
+ if (k >= 12) k -= 12;
+ else if (k >= 6) k -= 6;
+ }
}
if (tr == br && tc == bc)
@@ -827,6 +869,7 @@ int vp8_hex_search
}
// check 8 1 away neighbors
+cal_neighbors:
tr = br;
tc = bc;
@@ -848,7 +891,7 @@ int vp8_hex_search
best_mv->row = br;
best_mv->col = bc;
- return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
+ return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
}
#undef MVC
#undef PRE
@@ -856,6 +899,8 @@ int vp8_hex_search
#undef DIST
#undef ERR
#undef CHECK_BETTER
+
+
int vp8_diamond_search_sad
(
MACROBLOCK *x,
@@ -1035,84 +1080,73 @@ int vp8_diamond_search_sadx4
for (step = 0; step < tot_steps ; step++)
{
- int check_row_min, check_col_min, check_row_max, check_col_max;
+ int all_in = 1, t;
- check_row_min = x->mv_row_min - best_mv->row;
- check_row_max = x->mv_row_max - best_mv->row;
- check_col_min = x->mv_col_min - best_mv->col;
- check_col_max = x->mv_col_max - best_mv->col;
+ // To know if all neighbor points are within the bounds, 4 bounds checking are enough instead of
+ // checking 4 bounds for each points.
+ all_in &= ((best_mv->row + ss[i].mv.row)> x->mv_row_min);
+ all_in &= ((best_mv->row + ss[i+1].mv.row) < x->mv_row_max);
+ all_in &= ((best_mv->col + ss[i+2].mv.col) > x->mv_col_min);
+ all_in &= ((best_mv->col + ss[i+3].mv.col) < x->mv_col_max);
- for (j = 0 ; j < x->searches_per_step ; j += 4)
+ if (all_in)
{
- unsigned char *block_offset[4];
- unsigned int valid_block[4];
- int all_in = 1, t;
+ unsigned int sad_array[4];
- for (t = 0; t < 4; t++)
+ for (j = 0 ; j < x->searches_per_step ; j += 4)
{
- valid_block [t] = (ss[t+i].mv.col > check_col_min);
- valid_block [t] &= (ss[t+i].mv.col < check_col_max);
- valid_block [t] &= (ss[t+i].mv.row > check_row_min);
- valid_block [t] &= (ss[t+i].mv.row < check_row_max);
-
- all_in &= valid_block[t];
- block_offset[t] = ss[i+t].offset + best_address;
- }
+ unsigned char *block_offset[4];
- if (all_in)
- {
- unsigned int sad_array[4];
+ for (t = 0; t < 4; t++)
+ block_offset[t] = ss[i+t].offset + best_address;
fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
for (t = 0; t < 4; t++, i++)
{
- thissad = sad_array[t];
-
- if (thissad < bestsad)
+ if (sad_array[t] < bestsad)
{
this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
- thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+ sad_array[t] += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
- if (thissad < bestsad)
+ if (sad_array[t] < bestsad)
{
- bestsad = thissad;
+ bestsad = sad_array[t];
best_site = i;
}
}
}
}
- else
+ }
+ else
+ {
+ for (j = 0 ; j < x->searches_per_step ; j++)
{
- int t;
+ // Trap illegal vectors
+ this_row_offset = best_mv->row + ss[i].mv.row;
+ this_col_offset = best_mv->col + ss[i].mv.col;
- for (t = 0; t < 4; i++, t++)
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
{
- // Trap illegal vectors
- if (valid_block[t])
+ check_here = ss[i].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+ if (thissad < bestsad)
{
- check_here = block_offset[t];
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+ this_mv.row = this_row_offset << 3;
+ this_mv.col = this_col_offset << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
- this_row_offset = best_mv->row + ss[i].mv.row;
- this_col_offset = best_mv->col + ss[i].mv.col;
-
- this_mv.row = this_row_offset << 3;
- this_mv.col = this_col_offset << 3;
- thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
-
- if (thissad < bestsad)
- {
- bestsad = thissad;
- best_site = i;
- }
+ bestsad = thissad;
+ best_site = i;
}
}
}
+ i++;
}
}
@@ -1138,6 +1172,7 @@ int vp8_diamond_search_sadx4
}
+#if !(CONFIG_REALTIME_ONLY)
int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
{
unsigned char *what = (*(b->base_src) + b->src);
@@ -1288,7 +1323,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
check_here = r * mv_stride + in_what + col_min;
c = col_min;
- while ((c + 3) < col_max)
+ while ((c + 2) < col_max)
{
int i;
@@ -1350,6 +1385,160 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
else
return INT_MAX;
}
+#endif
+
+
+int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ int mv_stride = d->pre_stride;
+ unsigned char *bestaddress;
+ MV *best_mv = &d->bmi.mv.as_mv;
+ MV this_mv;
+ int bestsad = INT_MAX;
+ int r, c;
+
+ unsigned char *check_here;
+ unsigned int thissad;
+
+ int ref_row = ref_mv->row >> 3;
+ int ref_col = ref_mv->col >> 3;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ unsigned short sad_array8[8];
+ unsigned int sad_array[3];
+
+ // Work out the mid point for the search
+ in_what = *(d->base_pre) + d->pre;
+ bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+ if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+ (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+ {
+ // Baseline value at the centre
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+ }
+
+ // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.row = r << 3;
+ check_here = r * mv_stride + in_what + col_min;
+ c = col_min;
+
+ while ((c + 7) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx8f(what, what_stride, check_here , in_what_stride, sad_array8);
+
+ for (i = 0; i < 8; i++)
+ {
+ thissad = (unsigned int)sad_array8[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.col = c << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while ((c + 2) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
+
+ for (i = 0; i < 3; i++)
+ {
+ thissad = sad_array[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.col = c << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while (c < col_max)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.col = c << 3;
+ thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->row = r;
+ best_mv->col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here ++;
+ c ++;
+ }
+ }
+
+ this_mv.row = best_mv->row << 3;
+ this_mv.col = best_mv->col << 3;
+
+ if (bestsad < INT_MAX)
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+ else
+ return INT_MAX;
+}
#ifdef ENTROPY_STATS
void print_mode_context(void)
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 7cc924279..7d6036248 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -42,14 +42,15 @@ extern int vp8_hex_search
int search_param,
int error_per_bit,
int *num00,
- vp8_variance_fn_t vf,
- vp8_sad_fn_t sf,
+ const vp8_variance_fn_ptr_t *vf,
int *mvsadcost[2],
int *mvcost[2]
);
-typedef int (fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]);
+typedef int (fractional_mv_step_fp)
+ (MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv,
+ int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2]);
extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
@@ -92,6 +93,7 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
typedef prototype_full_search_sad(*vp8_full_search_fn_t);
extern prototype_full_search_sad(vp8_full_search_sad);
extern prototype_full_search_sad(vp8_full_search_sadx3);
+extern prototype_full_search_sad(vp8_full_search_sadx8);
typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
extern prototype_diamond_search_sad(vp8_diamond_search_sad);
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 56bac0aaa..5f02a5a02 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -30,6 +30,10 @@
#include "threading.h"
#include "vpx_ports/vpx_timer.h"
#include "vpxerrors.h"
+#include "temporal_filter.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
#include <math.h>
#include <stdio.h>
@@ -70,7 +74,7 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const
int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
-static void mode_ref_lf_test_function(VP8_COMP *cpi);
+static void set_default_lf_deltas(VP8_COMP *cpi);
extern const int vp8_gf_interval_table[101];
@@ -147,6 +151,95 @@ extern const int qzbin_factors[129];
extern void vp8cx_init_quantizer(VP8_COMP *cpi);
extern const int vp8cx_base_skip_false_prob[128];
+// Tables relating active max Q to active min Q
+static const int kf_low_motion_minq[QINDEX_RANGE] =
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4,
+ 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 10,10,
+ 11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,
+ 19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,
+ 27,27,28,28,29,29,30,30,31,32,33,34,35,36,37,38,
+};
+static const int kf_high_motion_minq[QINDEX_RANGE] =
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+ 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
+ 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10,10,
+ 11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,
+ 19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,
+ 27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,
+ 35,35,36,36,37,38,39,40,41,42,43,44,45,46,47,48,
+};
+/*static const int kf_minq[QINDEX_RANGE] =
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10,10,11,11,12,12,13,13,14,14,
+ 15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,
+ 23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,
+ 31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38
+};*/
+static const int gf_low_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
+ 3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,
+ 7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,
+ 11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,
+ 19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,
+ 27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,
+ 35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,
+ 43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
+};
+static const int gf_mid_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,1,1,1,1,2,2,3,3,3,4,
+ 4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
+ 9,10,10,10,10,11,11,11,12,12,12,12,13,13,13,14,
+ 14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,
+ 22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,
+ 30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,
+ 38,39,39,40,40,41,41,42,42,43,43,44,45,46,47,48,
+ 49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,
+};
+static const int gf_high_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,1,1,1,2,2,2,3,3,3,4,
+ 4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
+ 9,10,10,10,11,11,12,12,13,13,14,14,15,15,16,16,
+ 17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,
+ 25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,
+ 33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,
+ 41,41,42,42,43,44,45,46,47,48,49,50,51,52,53,54,
+ 55,56,57,58,59,60,62,64,66,68,70,72,74,76,78,80,
+};
+/*static const int gf_arf_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,1,1,1,1,2,2,3,3,3,4,
+ 4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
+ 9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,
+ 15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,
+ 23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,
+ 31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,39,
+ 39,40,40,41,41,42,42,43,43,44,45,46,47,48,49,50,
+ 51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66
+};*/
+static const int inter_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,2,3,3,4,4,5,6,6,7,7,
+ 8,8,9,9,10,11,11,12,12,13,13,14,14,15,15,16,
+ 16,17,17,17,18,18,19,19,20,20,21,21,22,22,22,23,
+ 23,24,24,24,25,25,26,27,28,28,29,30,31,32,33,34,
+ 35,35,36,37,38,39,39,40,41,42,43,43,44,45,46,47,
+ 47,48,49,49,51,52,53,54,54,55,56,56,57,57,58,58,
+ 59,59,60,61,61,62,62,63,64,64,65,66,67,67,68,69,
+ 69,70,71,71,72,73,74,75,76,76,77,78,79,80,81,81,
+};
void vp8_initialize()
{
@@ -180,9 +273,10 @@ static void setup_features(VP8_COMP *cpi)
cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
- // jbb trial !
- mode_ref_lf_test_function(cpi);
+ set_default_lf_deltas(cpi);
}
@@ -237,6 +331,8 @@ void vp8_dealloc_compressor_data(VP8_COMP *cpi)
cpi->mb.pip = 0;
+ vpx_free(cpi->total_stats);
+ vpx_free(cpi->this_frame_stats);
}
static void enable_segmentation(VP8_PTR ptr)
@@ -440,7 +536,7 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
}
-static void mode_ref_lf_test_function(VP8_COMP *cpi)
+static void set_default_lf_deltas(VP8_COMP *cpi)
{
cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
@@ -569,7 +665,6 @@ void vp8_set_speed_features(VP8_COMP *cpi)
break;
case 1:
case 3:
- sf->optimize_coefficients = 0;
sf->thresh_mult[THR_NEARESTMV] = 0;
sf->thresh_mult[THR_ZEROMV ] = 0;
sf->thresh_mult[THR_DC ] = 0;
@@ -630,6 +725,9 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (Speed > 0)
{
+ // Disable coefficient optimization above speed 0
+ sf->optimize_coefficients = 0;
+
cpi->mode_check_freq[THR_SPLITG] = 4;
cpi->mode_check_freq[THR_SPLITA] = 4;
cpi->mode_check_freq[THR_SPLITMV] = 2;
@@ -1187,7 +1285,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
}
if (cpi->sf.optimize_coefficients == 1)
- cpi->mb.optimize = 1;
+ cpi->mb.optimize = 1 + cpi->is_next_src_alt_ref;
else
cpi->mb.optimize = 0;
@@ -1297,6 +1395,12 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
CHECK_MEM_ERROR(cpi->gf_active_flags, vpx_calloc(1, cm->mb_rows * cm->mb_cols));
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+ cpi->total_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
+ cpi->this_frame_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
+ if(!cpi->total_stats || !cpi->this_frame_stats)
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate firstpass stats");
}
@@ -1325,16 +1429,14 @@ int vp8_reverse_trans(int x)
};
void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
{
+ if(framerate < .1)
+ framerate = 30;
+
cpi->oxcf.frame_rate = framerate;
cpi->output_frame_rate = cpi->oxcf.frame_rate;
cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
- cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
- cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
-
- cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
- cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->max_gf_interval = (int)(cpi->output_frame_rate / 2) + 2;
//cpi->max_gf_interval = (int)(cpi->output_frame_rate * 2 / 3) + 1;
@@ -1580,6 +1682,10 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cpi->active_best_quality = cpi->oxcf.best_allowed_q;
cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+ cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->total_actual_bits = 0;
cpi->total_target_vs_actual = 0;
@@ -1655,6 +1761,7 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
// YX Temp
cpi->last_alt_ref_sei = -1;
cpi->is_src_frame_alt_ref = 0;
+ cpi->is_next_src_alt_ref = 0;
#if 0
// Experimental RD Code
@@ -1663,13 +1770,16 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
#endif
#if VP8_TEMPORAL_ALT_REF
+
+ cpi->use_weighted_temporal_filter = 0;
+
{
int i;
cpi->fixed_divide[0] = 0;
- for (i = 1; i < 255; i++)
- cpi->fixed_divide[i] = 0x10000 / i;
+ for (i = 1; i < 512; i++)
+ cpi->fixed_divide[i] = 0x80000 / i;
}
#endif
}
@@ -1858,6 +1968,10 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cpi->active_best_quality = cpi->oxcf.best_allowed_q;
cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+ cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->total_actual_bits = 0;
cpi->total_target_vs_actual = 0;
@@ -1933,6 +2047,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
// YX Temp
cpi->last_alt_ref_sei = -1;
cpi->is_src_frame_alt_ref = 0;
+ cpi->is_next_src_alt_ref = 0;
#if 0
// Experimental RD Code
@@ -1997,8 +2112,8 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
CHECK_MEM_ERROR(cpi->rdtok, vpx_calloc(256 * 3 / 2, sizeof(TOKENEXTRA)));
CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
- vp8_cmachine_specific_config(cpi);
vp8_create_common(&cpi->common);
+ vp8_cmachine_specific_config(cpi);
vp8_init_config((VP8_PTR)cpi, oxcf);
@@ -2039,7 +2154,8 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->active_map_enabled = 0;
// Create the first pass motion map structure and set to 0
- CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(cpi->common.MBs, 1));
+ // Allocate space for maximum of 15 buffers
+ CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(15*cpi->common.MBs, 1));
#if 0
// Experimental code for lagged and one pass
@@ -2081,19 +2197,11 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
// Test function for segmentation
//segmentation_test_function((VP8_PTR) cpi);
- // Loop filter mode / ref deltas test function
- //mode_ref_lf_test_function(cpi);
-
#ifdef ENTROPY_STATS
init_context_counters();
#endif
-#ifdef INTRARDOPT
- cpi->intra_rd_opt = 1;
-
-#endif
-
cpi->frames_since_key = 8; // Give a sensible default for the first frame.
cpi->key_frame_frequency = cpi->oxcf.key_freq;
@@ -2194,10 +2302,12 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
}
else if (cpi->pass == 2)
{
+ size_t packet_sz = vp8_firstpass_stats_sz(cpi->common.MBs);
+ int packets = oxcf->two_pass_stats_in.sz / packet_sz;
+
cpi->stats_in = oxcf->two_pass_stats_in.buf;
- cpi->stats_in_end = cpi->stats_in
- + oxcf->two_pass_stats_in.sz / sizeof(FIRSTPASS_STATS)
- - 1;
+ cpi->stats_in_end = (void*)((char *)cpi->stats_in
+ + (packets - 1) * packet_sz);
vp8_init_second_pass(cpi);
}
@@ -2224,11 +2334,55 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
vp8cx_create_encoder_threads(cpi);
- cpi->fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
- cpi->fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
- cpi->fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16);
- cpi->fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
- cpi->fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
+ cpi->fn_ptr[BLOCK_16X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
+ cpi->fn_ptr[BLOCK_16X16].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
+ cpi->fn_ptr[BLOCK_16X16].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16);
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_h);
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_v);
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_hv);
+ cpi->fn_ptr[BLOCK_16X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
+ cpi->fn_ptr[BLOCK_16X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x8);
+ cpi->fn_ptr[BLOCK_16X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
+
+ cpi->fn_ptr[BLOCK_16X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
+ cpi->fn_ptr[BLOCK_16X8].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x8);
+ cpi->fn_ptr[BLOCK_16X8].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x8);
+ cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_16X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
+ cpi->fn_ptr[BLOCK_16X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x8);
+ cpi->fn_ptr[BLOCK_16X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
+
+ cpi->fn_ptr[BLOCK_8X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
+ cpi->fn_ptr[BLOCK_8X16].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x16);
+ cpi->fn_ptr[BLOCK_8X16].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x16);
+ cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_8X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
+ cpi->fn_ptr[BLOCK_8X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x8);
+ cpi->fn_ptr[BLOCK_8X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
+
+ cpi->fn_ptr[BLOCK_8X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
+ cpi->fn_ptr[BLOCK_8X8].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x8);
+ cpi->fn_ptr[BLOCK_8X8].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x8);
+ cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_8X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
+ cpi->fn_ptr[BLOCK_8X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x8);
+ cpi->fn_ptr[BLOCK_8X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
+
+ cpi->fn_ptr[BLOCK_4X4].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
+ cpi->fn_ptr[BLOCK_4X4].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var4x4);
+ cpi->fn_ptr[BLOCK_4X4].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar4x4);
+ cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_4X4].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
+ cpi->fn_ptr[BLOCK_4X4].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8);
+ cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
#if !(CONFIG_REALTIME_ONLY)
cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
@@ -2422,6 +2576,7 @@ void vp8_remove_compressor(VP8_PTR *ptr)
}
fprintf(fmode, "};\n");
+ fclose(fmode);
}
#endif
@@ -2676,6 +2831,8 @@ int vp8_update_entropy(VP8_PTR comp, int update)
return 0;
}
+
+#if OUTPUT_YUV_SRC
void vp8_write_yuv_frame(const char *name, YV12_BUFFER_CONFIG *s)
{
FILE *yuv_file = fopen(name, "ab");
@@ -2711,6 +2868,8 @@ void vp8_write_yuv_frame(const char *name, YV12_BUFFER_CONFIG *s)
fclose(yuv_file);
}
+#endif
+
static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
{
@@ -2744,9 +2903,20 @@ static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
{
//vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
#if HAVE_ARMV7
- vp8_yv12_copy_src_frame_func_neon(sd, &cpi->scaled_source);
-#else
- vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_copy_src_frame_func_neon(sd, &cpi->scaled_source);
+ }
+#if CONFIG_RUNTIME_CPU_DETECT
+ else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+ {
+ vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
+ }
#endif
cpi->Source = &cpi->scaled_source;
@@ -3261,324 +3431,15 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
}
#endif
// return of 0 means drop frame
-#define USE_FILTER_LUT 1
-#if VP8_TEMPORAL_ALT_REF
-#if USE_FILTER_LUT
-static int modifier_lut[7][19] =
-{
- // Strength=0
- {16, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- // Strength=1
- {16, 15, 10, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- // Strength=2
- {16, 15, 13, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- // Strength=3
- {16, 16, 15, 13, 10, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- // Strength=4
- {16, 16, 15, 14, 13, 11, 9, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- // Strength=5
- {16, 16, 16, 15, 15, 14, 13, 11, 10, 8, 7, 5, 3, 0, 0, 0, 0, 0, 0},
- // Strength=6
- {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 2, 1}
-};
-#endif
-static void vp8cx_temp_blur1_c
+static void encode_frame_to_data_rate
(
VP8_COMP *cpi,
- unsigned char **frames,
- int frame_count,
- unsigned char *src,
- unsigned char *dst,
- int width,
- int stride,
- int height,
- int strength,
- int *fixed_divide,
- unsigned char *motion_map_ptr,
- unsigned char block_size
+ unsigned long *size,
+ unsigned char *dest,
+ unsigned int *frame_flags
)
{
- int byte = 0; // Buffer offset for current pixel being filtered
- int frame = 0;
- int modifier = 0;
- int i, j, k;
- int block_ofset;
- int cols;
- unsigned char Shift = (block_size == 16) ? 4 : 3;
-#if USE_FILTER_LUT
- int *lut = modifier_lut[strength];
-#endif
-
- cols = cpi->common.mb_cols;
-
- for (i = 0; i < height; i++)
- {
- block_ofset = (i >> Shift) * cols;
-
- for (j = 0; j < cols; j ++)
- {
- if (motion_map_ptr[block_ofset] > 2)
- {
- vpx_memcpy(&dst[byte], &src[byte], block_size);
- byte += block_size;
- }
- else
- {
- for (k = 0; k < block_size; k++)
- {
- int accumulator = 0;
- int count = 0;
- int src_byte = src[byte];
-
- for (frame = 0; frame < frame_count; frame++)
- {
- // get current frame pixel value
- int pixel_value = frames[frame][byte];
-#if USE_FILTER_LUT
- // LUT implementation --
- // improves precision of filter
- modifier = abs(src_byte-pixel_value);
- modifier = modifier>18 ? 0 : lut[modifier];
-#else
- modifier = src_byte;
- modifier -= pixel_value;
- modifier *= modifier;
- modifier >>= strength;
- modifier *= 3;
-
- if (modifier > 16)
- modifier = 16;
-
- modifier = 16 - modifier;
-#endif
- accumulator += modifier * pixel_value;
-
- count += modifier;
- }
-
- accumulator += (count >> 1);
- accumulator *= fixed_divide[count];
- accumulator >>= 16;
-
- dst[byte] = accumulator;
-
- // move to next pixel
- byte++;
- }
- }
-
- block_ofset++;
- }
-
- // Step byte on over the UMV border to the start of the next line
- byte += stride - width;
- }
-}
-
-static void vp8cx_temp_filter_c
-(
- VP8_COMP *cpi
-)
-{
- YV12_BUFFER_CONFIG *temp_source_buffer;
- int *fixed_divide = cpi->fixed_divide;
-
- int frame = 0;
- int max_frames = 11;
-
- int num_frames_backward = 0;
- int num_frames_forward = 0;
- int frames_to_blur_backward = 0;
- int frames_to_blur_forward = 0;
- int frames_to_blur = 0;
- int start_frame = 0;
-
- int strength = cpi->oxcf.arnr_strength;
-
- int blur_type = cpi->oxcf.arnr_type;
-
- int new_max_frames = cpi->oxcf.arnr_max_frames;
-
- if (new_max_frames > 0)
- max_frames = new_max_frames;
-
- num_frames_backward = cpi->last_alt_ref_sei - cpi->source_encode_index;
-
- if (num_frames_backward < 0)
- num_frames_backward += cpi->oxcf.lag_in_frames;
-
- num_frames_forward = cpi->oxcf.lag_in_frames - (num_frames_backward + 1);
-
- switch (blur_type)
- {
- case 1:
- /////////////////////////////////////////
- // Backward Blur
-
- frames_to_blur_backward = num_frames_backward;
-
- if (frames_to_blur_backward >= max_frames)
- frames_to_blur_backward = max_frames - 1;
-
- frames_to_blur = frames_to_blur_backward + 1;
- break;
-
- case 2:
- /////////////////////////////////////////
- // Forward Blur
-
- frames_to_blur_forward = num_frames_forward;
-
- if (frames_to_blur_forward >= max_frames)
- frames_to_blur_forward = max_frames - 1;
-
- frames_to_blur = frames_to_blur_forward + 1;
- break;
-
- case 3:
- /////////////////////////////////////////
- // Center Blur
- frames_to_blur_forward = num_frames_forward;
- frames_to_blur_backward = num_frames_backward;
-
- if (frames_to_blur_forward > frames_to_blur_backward)
- frames_to_blur_forward = frames_to_blur_backward;
-
- if (frames_to_blur_backward > frames_to_blur_forward)
- frames_to_blur_backward = frames_to_blur_forward;
-
- if (frames_to_blur_forward > (max_frames / 2))
- frames_to_blur_forward = (max_frames / 2);
-
- if (frames_to_blur_backward > (max_frames / 2))
- frames_to_blur_backward = (max_frames / 2);
-
- frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
- break;
-
- default:
- /////////////////////////////////////////
- // At most 4 frames forward Blur
- frames_to_blur_forward = 4;
- frames_to_blur_backward = num_frames_backward;
-
- if (max_frames > 5)
- {
- if ((frames_to_blur_backward + frames_to_blur_forward) >= max_frames)
- {
- frames_to_blur_backward
- = max_frames - frames_to_blur_forward - 1;
- }
- }
- else
- {
- frames_to_blur_forward = max_frames - 1;
- frames_to_blur_backward = 0;
- }
-
- frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
- break;
- }
-
- start_frame = (cpi->last_alt_ref_sei + frames_to_blur_forward) % cpi->oxcf.lag_in_frames;
-
-#ifdef DEBUGFWG
- // DEBUG FWG
- printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
- , max_frames
- , num_frames_backward
- , num_frames_forward
- , frames_to_blur
- , frames_to_blur_backward
- , frames_to_blur_forward
- , cpi->source_encode_index
- , cpi->last_alt_ref_sei
- , start_frame);
-#endif
-
- for (frame = 0; frame < frames_to_blur; frame++)
- {
- int which_buffer = start_frame - frame;
-
- if (which_buffer < 0)
- which_buffer += cpi->oxcf.lag_in_frames;
-
- cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.y_buffer;
- }
-
- temp_source_buffer = &cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer;
-
- // Blur Y
- vp8cx_temp_blur1_c(
- cpi,
- cpi->frames,
- frames_to_blur,
- temp_source_buffer->y_buffer, // cpi->Source->y_buffer,
- cpi->alt_ref_buffer.source_buffer.y_buffer, // cpi->Source->y_buffer,
- temp_source_buffer->y_width,
- temp_source_buffer->y_stride,
- temp_source_buffer->y_height,
- //temp_source_buffer->y_height * temp_source_buffer->y_stride,
- strength,
- fixed_divide,
- cpi->fp_motion_map, 16);
-
- for (frame = 0; frame < frames_to_blur; frame++)
- {
- int which_buffer = start_frame - frame;
-
- if (which_buffer < 0)
- which_buffer += cpi->oxcf.lag_in_frames;
-
- cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.u_buffer;
- }
-
- // Blur U
- vp8cx_temp_blur1_c(
- cpi,
- cpi->frames,
- frames_to_blur,
- temp_source_buffer->u_buffer,
- cpi->alt_ref_buffer.source_buffer.u_buffer, // cpi->Source->u_buffer,
- temp_source_buffer->uv_width,
- temp_source_buffer->uv_stride,
- temp_source_buffer->uv_height,
- //temp_source_buffer->uv_height * temp_source_buffer->uv_stride,
- strength,
- fixed_divide,
- cpi->fp_motion_map, 8);
-
- for (frame = 0; frame < frames_to_blur; frame++)
- {
- int which_buffer = start_frame - frame;
-
- if (which_buffer < 0)
- which_buffer += cpi->oxcf.lag_in_frames;
-
- cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.v_buffer;
- }
-
- // Blur V
- vp8cx_temp_blur1_c(
- cpi,
- cpi->frames,
- frames_to_blur,
- temp_source_buffer->v_buffer,
- cpi->alt_ref_buffer.source_buffer.v_buffer, // cpi->Source->v_buffer,
- temp_source_buffer->uv_width,
- temp_source_buffer->uv_stride,
- //temp_source_buffer->uv_height * temp_source_buffer->uv_stride,
- temp_source_buffer->uv_height,
- strength,
- fixed_divide,
- cpi->fp_motion_map, 8);
-}
-#endif
-
-
-static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
-{
int Q;
int frame_over_shoot_limit;
int frame_under_shoot_limit;
@@ -3662,6 +3523,9 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsign
{
int i;
+ // Reset the loop filter deltas and segmentation map
+ setup_features(cpi);
+
// If segmentation is enabled force a map update for key frames
if (cpi->mb.e_mbd.segmentation_enabled)
{
@@ -3669,12 +3533,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsign
cpi->mb.e_mbd.update_mb_segmentation_data = 1;
}
- // If mode or reference frame based loop filter deltas are enabled then force an update for key frames.
- if (cpi->mb.e_mbd.mode_ref_lf_delta_enabled)
- {
- cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
- }
-
// The alternate reference frame cannot be active for a key frame
cpi->source_alt_ref_active = FALSE;
@@ -3827,87 +3685,49 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsign
// Set an active best quality and if necessary active worst quality
if (cpi->pass == 2 || (cm->current_video_frame > 150))
{
- //if ( (cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame )
int Q;
int i;
int bpm_target;
+ //int tmp;
+
+ vp8_clear_system_state();
Q = cpi->active_worst_quality;
if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
{
- vp8_clear_system_state();
-
if (cm->frame_type != KEY_FRAME)
{
- // Where a gf overlays an existing arf then allow active max Q to drift to highest allowed value.
- //if ( cpi->common.refresh_golden_frame && cpi->source_alt_ref_active )
- //cpi->active_worst_quality = cpi->worst_quality;
-
if (cpi->avg_frame_qindex < cpi->active_worst_quality)
Q = cpi->avg_frame_qindex;
- if (cpi->section_is_low_motion)
- bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * ((Q * 3 / 2) + 128)) / 64;
- else if (cpi->section_is_fast_motion)
- bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 128)) / 64;
- else
- bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * ((Q * 5 / 4) + 128)) / 64;
- }
- // KEY FRAMES
- else
- {
- if (cpi->section_is_low_motion)
- bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 240)) / 64; // Approx 2.5 to 4.5 where Q has the range 0-127
+ if ( cpi->gfu_boost > 1000 )
+ cpi->active_best_quality = gf_low_motion_minq[Q];
+ else if ( cpi->gfu_boost < 400 )
+ cpi->active_best_quality = gf_high_motion_minq[Q];
else
- bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 160)) / 64;
- }
-
- for (i = Q; i > 0; i--)
- {
- if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i])
- break;
- }
-
- cpi->active_best_quality = i;
-
- // this entire section could be replaced by a look up table
-#if 0
- {
- int Q, best_q[128];
-
- for (Q = 0; Q < 128; Q++)
- {
- bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 160)) / 64; // Approx 2.5 to 4.5 where Q has the range 0-127
-
- for (i = Q; i > 0; i--)
- {
- if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i])
- break;
- }
-
- best_q[Q] = i;
- }
-
- Q += 0;
- }
-#endif
-
+ cpi->active_best_quality = gf_mid_motion_minq[Q];
+
+ /*cpi->active_best_quality = gf_arf_minq[Q];
+ tmp = (cpi->gfu_boost > 1000) ? 600 : cpi->gfu_boost - 400;
+ //tmp = (cpi->gfu_boost > 1000) ? 600 :
+ //(cpi->gfu_boost < 400) ? 0 : cpi->gfu_boost - 400;
+ tmp = 128 - (tmp >> 4);
+ cpi->active_best_quality = (cpi->active_best_quality * tmp)>>7;*/
+
+ }
+ // KEY FRAMES
+ else
+ {
+ if (cpi->gfu_boost > 600)
+ cpi->active_best_quality = kf_low_motion_minq[Q];
+ else
+ cpi->active_best_quality = kf_high_motion_minq[Q];
+ }
}
else
{
- vp8_clear_system_state();
-
- //bpm_target = (vp8_bits_per_mb[cm->frame_type][Q]*(Q+128))/64; // Approx 2 to 4 where Q has the range 0-127
- bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 192)) / 128; // Approx * 1.5 to 2.5 where Q has range 0-127
-
- for (i = Q; i > 0; i--)
- {
- if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i])
- break;
- }
-
- cpi->active_best_quality = i;
+ cpi->active_best_quality = inter_minq[Q];
}
// If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames
@@ -4134,6 +3954,9 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsign
// Clear the Alt reference frame active flag when we have a key frame
cpi->source_alt_ref_active = FALSE;
+ // Reset the loop filter deltas and segmentation map
+ setup_features(cpi);
+
// If segmentation is enabled force a map update for key frames
if (cpi->mb.e_mbd.segmentation_enabled)
{
@@ -4141,12 +3964,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsign
cpi->mb.e_mbd.update_mb_segmentation_data = 1;
}
- // If mode or reference frame based loop filter deltas are enabled then force an update for key frames.
- if (cpi->mb.e_mbd.mode_ref_lf_delta_enabled)
- {
- cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
- }
-
vp8_restore_coding_context(cpi);
Q = vp8_regulate_q(cpi, cpi->this_frame_target);
@@ -4355,12 +4172,13 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsign
if (cm->frame_type == KEY_FRAME)
cm->refresh_last_frame = 1;
- if (0)
+#if 0
{
FILE *f = fopen("gfactive.stt", "a");
fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
fclose(f);
}
+#endif
// For inter frames the current default behaviour is that when cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer
// This is purely an encoder descision at present.
@@ -4604,18 +4422,46 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsign
}
}
-#if CONFIG_PSNR
-
- if (0)
+#if 0 && CONFIG_PSNR
{
FILE *f = fopen("tmp.stt", "a");
vp8_clear_system_state(); //__asm emms;
if (cpi->total_coded_error_left != 0.0)
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f %10.3f %8ld\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), (int)cpi->total_target_vs_actual, (cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality, cpi->avg_frame_qindex, cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->est_max_qcorrection_factor, (int)cpi->bits_left, cpi->total_coded_error_left, (double)cpi->bits_left / cpi->total_coded_error_left, cpi->tot_recode_hits);
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld"
+ "%6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f"
+ "%10.3f %8ld\n",
+ cpi->common.current_video_frame, cpi->this_frame_target,
+ cpi->projected_frame_size,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
+ (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->active_best_quality, cpi->active_worst_quality,
+ cpi->avg_frame_qindex, cpi->zbin_over_quant,
+ cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+ cm->frame_type, cpi->gfu_boost,
+ cpi->est_max_qcorrection_factor, (int)cpi->bits_left,
+ cpi->total_coded_error_left,
+ (double)cpi->bits_left / cpi->total_coded_error_left,
+ cpi->tot_recode_hits);
else
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f %8ld\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), (int)cpi->total_target_vs_actual, (cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality, cpi->avg_frame_qindex, cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->est_max_qcorrection_factor, (int)cpi->bits_left, cpi->total_coded_error_left, cpi->tot_recode_hits);
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld"
+ "%6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f"
+ "%8ld\n",
+ cpi->common.current_video_frame,
+ cpi->this_frame_target, cpi->projected_frame_size,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
+ (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->active_best_quality, cpi->active_worst_quality,
+ cpi->avg_frame_qindex, cpi->zbin_over_quant,
+ cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+ cm->frame_type, cpi->gfu_boost,
+ cpi->est_max_qcorrection_factor, (int)cpi->bits_left,
+ cpi->total_coded_error_left, cpi->tot_recode_hits);
fclose(f);
@@ -4623,7 +4469,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsign
FILE *fmodes = fopen("Modes.stt", "a");
int i;
- fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, cm->frame_type, cm->refresh_golden_frame, cm->refresh_alt_ref_frame);
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ",
+ cpi->common.current_video_frame,
+ cm->frame_type, cm->refresh_golden_frame,
+ cm->refresh_alt_ref_frame);
for (i = 0; i < MAX_MODES; i++)
fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
@@ -4731,7 +4580,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsign
- if (0)
+#if 0
{
char filename[512];
FILE *recon_file;
@@ -4741,6 +4590,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsign
cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
fclose(recon_file);
}
+#endif
// DEBUG
//vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show);
@@ -4800,8 +4650,6 @@ void vp8_check_gf_quality(VP8_COMP *cpi)
}
#if 0
-
- if (0)
{
FILE *f = fopen("gfneeded.stt", "a");
fprintf(f, "%10d %10d %10d %10d %10ld \n",
@@ -4838,10 +4686,10 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
#if HAVE_ARMV7
extern void vp8_push_neon(INT64 *store);
extern void vp8_pop_neon(INT64 *store);
-static INT64 store_reg[8];
#endif
int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time)
{
+ INT64 store_reg[8];
VP8_COMP *cpi = (VP8_COMP *) ptr;
VP8_COMMON *cm = &cpi->common;
struct vpx_usec_timer timer;
@@ -4850,7 +4698,12 @@ int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CON
return -1;
#if HAVE_ARMV7
- vp8_push_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_push_neon(store_reg);
+ }
#endif
vpx_usec_timer_start(&timer);
@@ -4859,7 +4712,12 @@ int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CON
if (cpi->source_buffer_count != 0 && cpi->source_buffer_count >= cpi->oxcf.lag_in_frames)
{
#if HAVE_ARMV7
- vp8_pop_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
#endif
return -1;
}
@@ -4900,9 +4758,20 @@ int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CON
s->source_time_stamp = time_stamp;
s->source_frame_flags = frame_flags;
#if HAVE_ARMV7
- vp8_yv12_copy_src_frame_func_neon(sd, &s->source_buffer);
-#else
- vp8_yv12_copy_frame_ptr(sd, &s->source_buffer);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_copy_src_frame_func_neon(sd, &s->source_buffer);
+ }
+#if CONFIG_RUNTIME_CPU_DETECT
+ else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+ {
+ vp8_yv12_copy_frame_ptr(sd, &s->source_buffer);
+ }
#endif
cpi->source_buffer_count = 1;
}
@@ -4911,14 +4780,19 @@ int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CON
cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
#if HAVE_ARMV7
- vp8_pop_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
#endif
return 0;
}
int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush)
{
-
+ INT64 store_reg[8];
VP8_COMP *cpi = (VP8_COMP *) ptr;
VP8_COMMON *cm = &cpi->common;
struct vpx_usec_timer tsctimer;
@@ -4929,7 +4803,12 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
return -1;
#if HAVE_ARMV7
- vp8_push_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_push_neon(store_reg);
+ }
#endif
vpx_usec_timer_start(&cmptimer);
@@ -5030,6 +4909,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
cm->show_frame = 0;
cpi->source_alt_ref_pending = FALSE; // Clear Pending altf Ref flag.
cpi->is_src_frame_alt_ref = 0;
+ cpi->is_next_src_alt_ref = 0;
}
else
#endif
@@ -5048,6 +4928,11 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
cpi->is_src_frame_alt_ref = 0;
cpi->source_encode_index = (cpi->source_encode_index + 1) % cpi->oxcf.lag_in_frames;
+
+ if(cpi->source_encode_index == cpi->last_alt_ref_sei)
+ cpi->is_next_src_alt_ref = 1;
+ else
+ cpi->is_next_src_alt_ref = 0;
}
#endif
@@ -5075,7 +4960,12 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
#endif
#if HAVE_ARMV7
- vp8_pop_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
#endif
return -1;
}
@@ -5118,7 +5008,12 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
if (!cpi)
{
#if HAVE_ARMV7
- vp8_pop_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
#endif
return 0;
}
@@ -5307,7 +5202,12 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
#endif
#if HAVE_ARMV7
- vp8_pop_neon(store_reg);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
#endif
return 0;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 211f65912..81e32f031 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -29,7 +29,6 @@
#include "vpx/internal/vpx_codec_internal.h"
#include "mcomp.h"
-#define INTRARDOPT
//#define SPEEDSTATS 1
#define MIN_GF_INTERVAL 4
#define DEFAULT_GF_INTERVAL 7
@@ -230,23 +229,33 @@ typedef struct VP8_ENCODER_RTCD
vp8_search_rtcd_vtable_t search;
} VP8_ENCODER_RTCD;
+enum
+{
+ BLOCK_16X8,
+ BLOCK_8X16,
+ BLOCK_8X8,
+ BLOCK_4X4,
+ BLOCK_16X16,
+ BLOCK_MAX_SEGMENTS
+};
+
typedef struct
{
- DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][4][4]);
- DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][4][4]);
+ DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
@@ -274,6 +283,7 @@ typedef struct
int last_alt_ref_sei;
int is_src_frame_alt_ref;
+ int is_next_src_alt_ref;
int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
int alt_is_last; // Alt reference frame same as last ( short circuit altref search)
@@ -310,15 +320,12 @@ typedef struct
int subseqblockweight;
int errthresh;
-#ifdef INTRARDOPT
int RDMULT;
int RDDIV ;
TOKENEXTRA *rdtok;
- int intra_rd_opt;
vp8_writer rdbc;
int intra_mode_costs[10];
-#endif
CODING_CONTEXT coding_context;
@@ -378,6 +385,7 @@ typedef struct
int max_gf_interval;
int baseline_gf_interval;
int gf_decay_rate;
+ int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames
INT64 key_frame_count;
INT64 tot_key_frame_bits;
@@ -463,14 +471,14 @@ typedef struct
int target_bandwidth;
long long bits_left;
- FIRSTPASS_STATS total_stats;
- FIRSTPASS_STATS this_frame_stats;
+ FIRSTPASS_STATS *total_stats;
+ FIRSTPASS_STATS *this_frame_stats;
FIRSTPASS_STATS *stats_in, *stats_in_end;
struct vpx_codec_pkt_list *output_pkt_list;
int first_pass_done;
unsigned char *fp_motion_map;
- FILE *fp_motion_mapfile;
- int fpmm_pos;
+
+ unsigned char *fp_motion_map_stats, *fp_motion_map_stats_save;
#if 0
// Experimental code for lagged and one pass
@@ -593,7 +601,7 @@ typedef struct
fractional_mv_step_fp *find_fractional_mv_step;
vp8_full_search_fn_t full_search_sad;
vp8_diamond_search_fn_t diamond_search_sad;
- vp8_variance_fn_ptr_t fn_ptr;
+ vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
unsigned int time_receive_data;
unsigned int time_compress_data;
unsigned int time_pick_lpf;
@@ -616,9 +624,11 @@ typedef struct
#endif
#if VP8_TEMPORAL_ALT_REF
SOURCE_SAMPLE alt_ref_buffer;
- unsigned char *frames[MAX_LAG_BUFFERS];
- int fixed_divide[255];
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+ int fixed_divide[512];
#endif
+ // Flag to indicate temporal filter method
+ int use_weighted_temporal_filter;
#if CONFIG_PSNR
int count;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index eeeddcce9..2f7dd9c7c 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -50,14 +50,13 @@ extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv);
-int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
{
(void) b;
(void) d;
(void) ref_mv;
(void) error_per_bit;
- (void) svf;
- (void) vf;
+ (void) vfp;
(void) mvcost;
bestmv->row <<= 3;
bestmv->col <<= 3;
@@ -65,7 +64,7 @@ int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv,
}
-static int get_inter_mbpred_error(MACROBLOCK *mb, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, unsigned int *sse)
+static int get_inter_mbpred_error(MACROBLOCK *mb, const vp8_variance_fn_ptr_t *vfp, unsigned int *sse)
{
BLOCK *b = &mb->block[0];
@@ -81,20 +80,20 @@ static int get_inter_mbpred_error(MACROBLOCK *mb, vp8_subpixvariance_fn_t svf, v
if (xoffset | yoffset)
{
- return svf(in_what, in_what_stride, xoffset, yoffset, what, what_stride, sse);
+ return vfp->svf(in_what, in_what_stride, xoffset, yoffset, what, what_stride, sse);
}
else
{
- return vf(what, what_stride, in_what, in_what_stride, sse);
+ return vfp->vf(what, what_stride, in_what, in_what_stride, sse);
}
}
unsigned int vp8_get16x16pred_error_c
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
int max_sad
)
@@ -125,9 +124,9 @@ unsigned int vp8_get16x16pred_error_c
unsigned int vp8_get4x4sse_cs_c
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
int max_sad
)
@@ -719,13 +718,13 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
if (cpi->sf.search_method == HEX)
{
- bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf, x->mvsadcost, x->mvcost);
+ bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost);
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
}
else
{
- bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb < 9
+ bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb < 9
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -744,7 +743,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
num00--;
else
{
- thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb = 9
+ thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb = 9
if (thissme < bestsme)
{
@@ -765,7 +764,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
}
if (bestsme < INT_MAX)
- cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, cpi->fn_ptr.svf, cpi->fn_ptr.vf, cpi->mb.mvcost);
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -795,7 +794,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
x->e_mbd.block[0].bmi.mode = this_mode;
x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mode_info_context->mbmi.mv.as_int;
- distortion2 = get_inter_mbpred_error(x, cpi->fn_ptr.svf, cpi->fn_ptr.vf, (unsigned int *)(&sse));
+ distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], (unsigned int *)(&sse));
this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index 79e07dbc0..09e8b5412 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -16,6 +16,9 @@
#include "vpx_scale/yv12extend.h"
#include "vpx_scale/vpxscale.h"
#include "alloccommon.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val);
extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val, int sharpness_lvl);
@@ -306,9 +309,20 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
// Make a copy of the unfiltered / processed recon buffer
#if HAVE_ARMV7
- vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
-#else
- vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
+ }
+#if CONFIG_RUNTIME_CPU_DETECT
+ else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+ {
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+ }
#endif
if (cm->frame_type == KEY_FRAME)
@@ -343,9 +357,20 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
// Re-instate the unfiltered frame
#if HAVE_ARMV7
- vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-#else
- vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+ }
+#if CONFIG_RUNTIME_CPU_DETECT
+ else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+ {
+ vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ }
#endif
while (filter_step > 0)
@@ -372,9 +397,20 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
// Re-instate the unfiltered frame
#if HAVE_ARMV7
- vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-#else
- vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+ }
+#if CONFIG_RUNTIME_CPU_DETECT
+ else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+ {
+ vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ }
#endif
// If value is close to the best so far then bias towards a lower loop filter value.
@@ -401,9 +437,20 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
// Re-instate the unfiltered frame
#if HAVE_ARMV7
- vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-#else
- vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+ }
+#if CONFIG_RUNTIME_CPU_DETECT
+ else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+ {
+ vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ }
#endif
// Was it better than the previous best?
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 20ec9d11b..5e65fadb3 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -23,14 +23,14 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
int i, rc, eob;
int zbin;
int x, y, z, sz;
- short *coeff_ptr = &b->coeff[0];
- short *zbin_ptr = &b->zbin[0][0];
- short *round_ptr = &b->round[0][0];
- short *quant_ptr = &b->quant[0][0];
- short *quant_shift_ptr = &b->quant_shift[0][0];
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = &d->dequant[0][0];
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
vpx_memset(qcoeff_ptr, 0, 32);
vpx_memset(dqcoeff_ptr, 0, 32);
@@ -69,16 +69,16 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
int i, rc, eob;
int zbin;
int x, y, z, sz;
- short *zbin_boost_ptr = &b->zrun_zbin_boost[0];
- short *coeff_ptr = &b->coeff[0];
- short *zbin_ptr = &b->zbin[0][0];
- short *round_ptr = &b->round[0][0];
- short *quant_ptr = &b->quant[0][0];
- short *quant_shift_ptr = &b->quant_shift[0][0];
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = &d->dequant[0][0];
- short zbin_oq_value = b->zbin_extra;
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
vpx_memset(qcoeff_ptr, 0, 32);
vpx_memset(dqcoeff_ptr, 0, 32);
@@ -118,45 +118,95 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
d->eob = eob + 1;
}
+
+/* Perform regular quantization, with unbiased rounding and no zero bin. */
+void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d)
+{
+ int i;
+ int rc;
+ int eob;
+ int x;
+ int y;
+ int z;
+ int sz;
+ short *coeff_ptr;
+ short *quant_ptr;
+ short *quant_shift_ptr;
+ short *qcoeff_ptr;
+ short *dqcoeff_ptr;
+ short *dequant_ptr;
+
+ coeff_ptr = b->coeff;
+ quant_ptr = b->quant;
+ quant_shift_ptr = b->quant_shift;
+ qcoeff_ptr = d->qcoeff;
+ dqcoeff_ptr = d->dqcoeff;
+ dequant_ptr = d->dequant;
+ eob = - 1;
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+ for (i = 0; i < 16; i++)
+ {
+ int dq;
+ int round;
+
+ /*TODO: These arrays should be stored in zig-zag order.*/
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+ dq = dequant_ptr[rc];
+ round = dq >> 1;
+ /* Sign of z. */
+ sz = -(z < 0);
+ x = (z + sz) ^ sz;
+ x += round;
+ if (x >= dq)
+ {
+ /* Quantize x. */
+ y = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+ /* Put the sign back. */
+ x = (y + sz) ^ sz;
+ /* Save the coefficient and its dequantized value. */
+ qcoeff_ptr[rc] = x;
+ dqcoeff_ptr[rc] = x * dq;
+ /* Remember the last non-zero coefficient. */
+ if (y)
+ eob = i;
+ }
+ }
+
+ d->eob = eob + 1;
+}
+
#else
void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
{
int i, rc, eob;
int zbin;
int x, y, z, sz;
- short *coeff_ptr = &b->coeff[0];
- short *zbin_ptr = &b->zbin[0][0];
- short *round_ptr = &b->round[0][0];
- short *quant_ptr = &b->quant[0][0];
- short *qcoeff_ptr = d->qcoeff;
+ short *coeff_ptr = b->coeff;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = &d->dequant[0][0];
-
- vpx_memset(qcoeff_ptr, 0, 32);
- vpx_memset(dqcoeff_ptr, 0, 32);
+ short *dequant_ptr = d->dequant;
eob = -1;
-
for (i = 0; i < 16; i++)
{
rc = vp8_default_zig_zag1d[i];
z = coeff_ptr[rc];
- zbin = zbin_ptr[rc] ;
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz; // x = abs(z)
- if (x >= zbin)
- {
- y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
- x = (y ^ sz) - sz; // get the sign back
- qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
+ y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
- if (y)
- {
- eob = i; // last nonzero coeffs
- }
+ if (y)
+ {
+ eob = i; // last nonzero coeffs
}
}
d->eob = eob + 1;
@@ -167,15 +217,15 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
int i, rc, eob;
int zbin;
int x, y, z, sz;
- short *zbin_boost_ptr = &b->zrun_zbin_boost[0];
- short *coeff_ptr = &b->coeff[0];
- short *zbin_ptr = &b->zbin[0][0];
- short *round_ptr = &b->round[0][0];
- short *quant_ptr = &b->quant[0][0];
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = &d->dequant[0][0];
- short zbin_oq_value = b->zbin_extra;
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
vpx_memset(qcoeff_ptr, 0, 32);
vpx_memset(dqcoeff_ptr, 0, 32);
@@ -216,64 +266,6 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
#endif
-/* Perform regular quantization, with unbiased rounding and no zero bin. */
-void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d)
-{
- int i;
- int rc;
- int eob;
- int x;
- int y;
- int z;
- int sz;
- short *coeff_ptr;
- short *quant_ptr;
- short *quant_shift_ptr;
- short *qcoeff_ptr;
- short *dqcoeff_ptr;
- short *dequant_ptr;
-
- coeff_ptr = &b->coeff[0];
- quant_ptr = &b->quant[0][0];
- quant_shift_ptr = &b->quant_shift[0][0];
- qcoeff_ptr = d->qcoeff;
- dqcoeff_ptr = d->dqcoeff;
- dequant_ptr = &d->dequant[0][0];
- eob = - 1;
- vpx_memset(qcoeff_ptr, 0, 32);
- vpx_memset(dqcoeff_ptr, 0, 32);
- for (i = 0; i < 16; i++)
- {
- int dq;
- int round;
-
- /*TODO: These arrays should be stored in zig-zag order.*/
- rc = vp8_default_zig_zag1d[i];
- z = coeff_ptr[rc];
- dq = dequant_ptr[rc];
- round = dq >> 1;
- /* Sign of z. */
- sz = -(z < 0);
- x = (z + sz) ^ sz;
- x += round;
- if (x >= dq)
- {
- /* Quantize x. */
- y = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
- /* Put the sign back. */
- x = (y + sz) ^ sz;
- /* Save the coefficient and its dequantized value. */
- qcoeff_ptr[rc] = x;
- dqcoeff_ptr[rc] = x * dq;
- /* Remember the last non-zero coefficient. */
- if (y)
- eob = i;
- }
- }
-
- d->eob = eob + 1;
-}
-
void vp8_quantize_mby(MACROBLOCK *x)
{
int i;
@@ -281,17 +273,10 @@ void vp8_quantize_mby(MACROBLOCK *x)
&& x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
for (i = 0; i < 16; i++)
- {
x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff &=
- (x->e_mbd.block[i].eob <= has_2nd_order);
- }
if(has_2nd_order)
- {
x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff &= (!x->e_mbd.block[24].eob);
- }
}
void vp8_quantize_mb(MACROBLOCK *x)
@@ -300,13 +285,8 @@ void vp8_quantize_mb(MACROBLOCK *x)
int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
&& x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = 1;
for (i = 0; i < 24+has_2nd_order; i++)
- {
x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff &=
- (x->e_mbd.block[i].eob <= (has_2nd_order && i<16));
- }
}
@@ -315,8 +295,5 @@ void vp8_quantize_mbuv(MACROBLOCK *x)
int i;
for (i = 16; i < 24; i++)
- {
x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
- x->e_mbd.mode_info_context->mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
- }
}
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 50f4db0b8..dd324f435 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1481,6 +1481,8 @@ void vp8_adjust_key_frame_context(VP8_COMP *cpi)
// allocated than those following other gfs.
cpi->kf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 7 / 8;
cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 1 / 8;
+ if(!av_key_frame_frequency)
+ av_key_frame_frequency = 60;
// Work out how much to try and recover per frame.
// For one pass we estimate the number of frames to spread it over based upon past history.
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index dbef85b9f..8a753fd44 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -918,21 +918,6 @@ void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv)
}
#if !(CONFIG_REALTIME_ONLY)
-int vp8_count_labels(int const *labelings)
-{
- int i;
- int count = 0;
-
- for (i = 0; i < 16; i++)
- {
- if (labelings[i] > count)
- count = labelings[i];
- }
-
- return count + 1;
-}
-
-
static int labels2mode(
MACROBLOCK *x,
int const *labelings, int which_label,
@@ -1112,15 +1097,19 @@ static void macro_block_yrd(MACROBLOCK *mb, int *Rate, int *Distortion, const vp
*Rate = vp8_rdcost_mby(mb);
}
+unsigned char vp8_mbsplit_offset2[4][16] = {
+ { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+};
static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *mdcounts, int *returntotrate, int *returnyrate, int *returndistortion, int compressor_speed, int *mvcost[2], int mvthresh, int fullpixel)
{
int i, segmentation;
B_PREDICTION_MODE this_mode;
MACROBLOCKD *xc = &x->e_mbd;
- BLOCK *b = &x->block[0];
- BLOCKD *d = &x->e_mbd.block[0];
- BLOCK *c = &x->block[0];
- BLOCKD *e = &x->e_mbd.block[0];
+ BLOCK *c;
+ BLOCKD *e;
int const *labels;
int best_segment_rd = INT_MAX;
int best_seg = 0;
@@ -1130,6 +1119,8 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *bes
int bsd = 0;
int bestsegmentyrate = 0;
+ static const int segmentation_to_sseshift[4] = {3, 3, 2, 0};
+
// FIX TO Rd error outrange bug PGW 9 june 2004
B_PREDICTION_MODE bmodes[16] = {ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4,
ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4,
@@ -1151,10 +1142,10 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *bes
int rate = 0;
int sbr = 0;
int sbd = 0;
- int UNINITIALIZED_IS_SAFE(sseshift);
+ int sseshift;
int segmentyrate = 0;
- vp8_variance_fn_ptr_t v_fn_ptr;
+ vp8_variance_fn_ptr_t *v_fn_ptr;
ENTROPY_CONTEXT_PLANES t_above, t_left;
ENTROPY_CONTEXT *ta;
@@ -1174,44 +1165,10 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *bes
br = 0;
bd = 0;
- switch (segmentation)
- {
- case 0:
- v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x8);
- v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x8);
- v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
- v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
- v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
- sseshift = 3;
- break;
- case 1:
- v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x16);
- v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x16);
- v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
- v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
- v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
- sseshift = 3;
- break;
- case 2:
- v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x8);
- v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x8);
- v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
- v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
- v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
- sseshift = 2;
- break;
- case 3:
- v_fn_ptr.vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var4x4);
- v_fn_ptr.svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar4x4);
- v_fn_ptr.sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
- v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
- v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
- sseshift = 0;
- break;
- }
-
+ v_fn_ptr = &cpi->fn_ptr[segmentation];
+ sseshift = segmentation_to_sseshift[segmentation];
labels = vp8_mbsplits[segmentation];
- label_count = vp8_count_labels(labels);
+ label_count = vp8_mbsplit_count[segmentation];
// 64 makes this threshold really big effectively
// making it so that we very rarely check mvs on
@@ -1235,14 +1192,9 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *bes
int j;
int bestlabelyrate = 0;
- b = &x->block[0];
- d = &x->e_mbd.block[0];
-
// find first label
- for (j = 0; j < 16; j++)
- if (labels[j] == i)
- break;
+ j = vp8_mbsplit_offset2[segmentation][i];
c = &x->block[j];
e = &x->e_mbd.block[j];
@@ -1281,10 +1233,10 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *bes
int sadpb = x->sadperbit4;
if (cpi->sf.search_method == HEX)
- bestsme = vp8_hex_search(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb/*x->errorperbit*/, &num00, v_fn_ptr.vf, v_fn_ptr.sdf, x->mvsadcost, mvcost);
+ bestsme = vp8_hex_search(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost);
else
{
- bestsme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb / 2/*x->errorperbit*/, &num00, &v_fn_ptr, x->mvsadcost, mvcost);
+ bestsme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb / 2/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost);
n = num00;
num00 = 0;
@@ -1297,7 +1249,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *bes
num00--;
else
{
- thissme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &temp_mv, step_param + n, sadpb / 2/*x->errorperbit*/, &num00, &v_fn_ptr, x->mvsadcost, mvcost);
+ thissme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &temp_mv, step_param + n, sadpb / 2/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost);
if (thissme < bestsme)
{
@@ -1312,7 +1264,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *bes
// Should we do a full search (best quality only)
if ((compressor_speed == 0) && (bestsme >> sseshift) > 4000)
{
- thissme = cpi->full_search_sad(x, c, e, best_ref_mv, sadpb / 4, 16, &v_fn_ptr, x->mvcost, x->mvsadcost);
+ thissme = cpi->full_search_sad(x, c, e, best_ref_mv, sadpb / 4, 16, v_fn_ptr, x->mvcost, x->mvsadcost);
if (thissme < bestsme)
{
@@ -1330,9 +1282,9 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *bes
if (bestsme < INT_MAX)
{
if (!fullpixel)
- cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit / 2, v_fn_ptr.svf, v_fn_ptr.vf, mvcost);
+ cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit / 2, v_fn_ptr, mvcost);
else
- vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit, v_fn_ptr.svf, v_fn_ptr.vf, mvcost);
+ vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit, v_fn_ptr, mvcost);
}
}
@@ -1410,46 +1362,20 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *bes
bd->eob = beobs[i];
}
- // Trap cases where the best split mode has all vectors coded 0,0 (or all the same)
- if (FALSE)
- {
- int allsame = 1;
-
- for (i = 1; i < 16; i++)
- {
- if ((bmvs[i].col != bmvs[i-1].col) || (bmvs[i].row != bmvs[i-1].row))
- {
- allsame = 0;
- break;
- }
- }
-
- if (allsame)
- {
- best_segment_rd = INT_MAX;
- }
- }
-
*returntotrate = bsr;
*returndistortion = bsd;
*returnyrate = bestsegmentyrate;
-
-
// save partitions
labels = vp8_mbsplits[best_seg];
x->e_mbd.mode_info_context->mbmi.partitioning = best_seg;
- x->partition_info->count = vp8_count_labels(labels);
+ x->partition_info->count = vp8_mbsplit_count[best_seg];
for (i = 0; i < x->partition_info->count; i++)
{
int j;
- for (j = 0; j < 16; j++)
- {
- if (labels[j] == i)
- break;
- }
+ j = vp8_mbsplit_offset2[best_seg][i];
x->partition_info->bmi[i].mode = x->e_mbd.block[j].bmi.mode;
x->partition_info->bmi[i].mv.as_mv = x->e_mbd.block[j].bmi.mv.as_mv;
@@ -1852,13 +1778,13 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
if (cpi->sf.search_method == HEX)
{
- bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf, x->mvsadcost, x->mvcost);
+ bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost);
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
}
else
{
- bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb < 9
+ bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb < 9
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -1877,7 +1803,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
num00--;
else
{
- thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb = 9
+ thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb = 9
if (thissme < bestsme)
{
@@ -1914,7 +1840,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
search_range = (search_range > cpi->sf.max_fs_radius) ? cpi->sf.max_fs_radius : search_range;
{
int sadpb = x->sadperbit16 >> 2;
- thissme = cpi->full_search_sad(x, b, d, &best_ref_mv, sadpb, search_range, &cpi->fn_ptr, x->mvcost, x->mvsadcost);
+ thissme = cpi->full_search_sad(x, b, d, &best_ref_mv, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, x->mvsadcost);
}
// Barrier threshold to initiating full search
@@ -1939,7 +1865,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
if (bestsme < INT_MAX)
// cpi->find_fractional_mv_step(x,b,d,&d->bmi.mv.as_mv,&best_ref_mv,x->errorperbit/2,cpi->fn_ptr.svf,cpi->fn_ptr.vf,x->mvcost); // normal mvc=11
- cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, cpi->fn_ptr.svf, cpi->fn_ptr.vf, x->mvcost);
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost);
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c
index e63be2bda..5eaca5935 100644
--- a/vp8/encoder/sad_c.c
+++ b/vp8/encoder/sad_c.c
@@ -12,9 +12,9 @@
#include <stdlib.h>
unsigned int vp8_sad16x16_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
int max_sad)
{
@@ -39,9 +39,9 @@ unsigned int vp8_sad16x16_c(
static __inline
unsigned int sad_mx_n_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
int m,
int n)
@@ -66,9 +66,9 @@ unsigned int sad_mx_n_c(
unsigned int vp8_sad8x8_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
int max_sad)
{
@@ -78,9 +78,9 @@ unsigned int vp8_sad8x8_c(
unsigned int vp8_sad16x8_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
int max_sad)
{
@@ -91,9 +91,9 @@ unsigned int vp8_sad16x8_c(
unsigned int vp8_sad8x16_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
int max_sad)
{
@@ -103,9 +103,9 @@ unsigned int vp8_sad8x16_c(
unsigned int vp8_sad4x4_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
int max_sad)
{
@@ -114,9 +114,9 @@ unsigned int vp8_sad4x4_c(
}
void vp8_sad16x16x3_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
unsigned int *sad_array
)
@@ -126,10 +126,28 @@ void vp8_sad16x16x3_c(
sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
+void vp8_sad16x16x8_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+)
+{
+ sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad16x8x3_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
unsigned int *sad_array
)
@@ -139,10 +157,28 @@ void vp8_sad16x8x3_c(
sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
+void vp8_sad16x8x8_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+)
+{
+ sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad8x8x3_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
unsigned int *sad_array
)
@@ -152,10 +188,28 @@ void vp8_sad8x8x3_c(
sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
+void vp8_sad8x8x8_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+)
+{
+ sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad8x16x3_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
unsigned int *sad_array
)
@@ -165,10 +219,28 @@ void vp8_sad8x16x3_c(
sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
+void vp8_sad8x16x8_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+)
+{
+ sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad4x4x3_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride,
unsigned int *sad_array
)
@@ -178,8 +250,26 @@ void vp8_sad4x4x3_c(
sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
+void vp8_sad4x4x8_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+)
+{
+ sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
+ sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+ sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
+ sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+ sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+ sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
+ sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
void vp8_sad16x16x4d_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr[],
int ref_stride,
@@ -193,7 +283,7 @@ void vp8_sad16x16x4d_c(
}
void vp8_sad16x8x4d_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr[],
int ref_stride,
@@ -207,7 +297,7 @@ void vp8_sad16x8x4d_c(
}
void vp8_sad8x8x4d_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr[],
int ref_stride,
@@ -221,7 +311,7 @@ void vp8_sad8x8x4d_c(
}
void vp8_sad8x16x4d_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr[],
int ref_stride,
@@ -235,7 +325,7 @@ void vp8_sad8x16x4d_c(
}
void vp8_sad4x4x4d_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr[],
int ref_stride,
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
new file mode 100644
index 000000000..fd5dd7ede
--- /dev/null
+++ b/vp8/encoder/temporal_filter.c
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "onyx_int.h"
+#include "systemdependent.h"
+#include "quantize.h"
+#include "alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "extend.h"
+#include "ratectrl.h"
+#include "quant_common.h"
+#include "segmentation.h"
+#include "g_common.h"
+#include "vpx_scale/yv12extend.h"
+#include "postproc.h"
+#include "vpx_mem/vpx_mem.h"
+#include "swapyv12buffer.h"
+#include "threading.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpxerrors.h"
+
+#include <math.h>
+#include <limits.h>
+
+#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering
+#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
+
+#define USE_FILTER_LUT 1
+#if VP8_TEMPORAL_ALT_REF
+
+#if USE_FILTER_LUT
+static int modifier_lut[7][19] =
+{
+ // Strength=0
+ {16, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ // Strength=1
+ {16, 15, 10, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ // Strength=2
+ {16, 15, 13, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ // Strength=3
+ {16, 16, 15, 13, 10, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ // Strength=4
+ {16, 16, 15, 14, 13, 11, 9, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ // Strength=5
+ {16, 16, 16, 15, 15, 14, 13, 11, 10, 8, 7, 5, 3, 0, 0, 0, 0, 0, 0},
+ // Strength=6
+ {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 2, 1}
+};
+#endif
+static void build_predictors_mb
+(
+ MACROBLOCKD *x,
+ unsigned char *y_mb_ptr,
+ unsigned char *u_mb_ptr,
+ unsigned char *v_mb_ptr,
+ int stride,
+ int mv_row,
+ int mv_col,
+ unsigned char *pred
+)
+{
+ int offset;
+ unsigned char *yptr, *uptr, *vptr;
+
+ // Y
+ yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+// vp8_sixtap_predict16x16_c(yptr, stride,
+// mv_col & 7, mv_row & 7, &pred[0], 16);
+ x->subpixel_predict16x16(yptr, stride,
+ mv_col & 7, mv_row & 7, &pred[0], 16);
+ }
+ else
+ {
+ //vp8_copy_mem16x16_c (yptr, stride, &pred[0], 16);
+ RECON_INVOKE(&x->rtcd->recon, copy16x16)(yptr, stride, &pred[0], 16);
+ }
+
+ // U & V
+ mv_row >>= 1;
+ mv_col >>= 1;
+ stride >>= 1;
+ offset = (mv_row >> 3) * stride + (mv_col >> 3);
+ uptr = u_mb_ptr + offset;
+ vptr = v_mb_ptr + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, stride,
+ mv_col & 7, mv_row & 7, &pred[256], 8);
+ x->subpixel_predict8x8(vptr, stride,
+ mv_col & 7, mv_row & 7, &pred[320], 8);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, stride, &pred[256], 8);
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, stride, &pred[320], 8);
+ }
+}
+static void apply_temporal_filter
+(
+ unsigned char *frame1,
+ unsigned int stride,
+ unsigned char *frame2,
+ unsigned int block_size,
+ int strength,
+ int filter_weight,
+ unsigned int *accumulator,
+ unsigned int *count
+)
+{
+ int i, j, k;
+ int modifier;
+ int byte = 0;
+
+#if USE_FILTER_LUT
+ int *lut = modifier_lut[strength];
+#endif
+
+ for (i = 0,k = 0; i < block_size; i++)
+ {
+ for (j = 0; j < block_size; j++, k++)
+ {
+
+ int src_byte = frame1[byte];
+ int pixel_value = *frame2++;
+
+#if USE_FILTER_LUT
+ // LUT implementation --
+ // improves precision of filter
+ modifier = abs(src_byte-pixel_value);
+ modifier = modifier>18 ? 0 : lut[modifier];
+#else
+ modifier = src_byte;
+ modifier -= pixel_value;
+ modifier *= modifier;
+ modifier >>= strength;
+ modifier *= 3;
+
+ if (modifier > 16)
+ modifier = 16;
+
+ modifier = 16 - modifier;
+#endif
+ modifier *= filter_weight;
+
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+
+ byte++;
+ }
+
+ byte += stride - block_size;
+ }
+}
+
+#if ALT_REF_MC_ENABLED
+static int dummy_cost[2*mv_max+1];
+
+static int find_matching_mb
+(
+ VP8_COMP *cpi,
+ YV12_BUFFER_CONFIG *arf_frame,
+ YV12_BUFFER_CONFIG *frame_ptr,
+ int mb_offset,
+ int error_thresh
+)
+{
+ MACROBLOCK *x = &cpi->mb;
+ int thissme;
+ int step_param;
+ int further_steps;
+ int n = 0;
+ int sadpb = x->sadperbit16;
+ int bestsme = INT_MAX;
+ int num00 = 0;
+
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ MV best_ref_mv1 = {0,0};
+
+ int *mvcost[2] = { &dummy_cost[mv_max+1], &dummy_cost[mv_max+1] };
+ int *mvsadcost[2] = { &dummy_cost[mv_max+1], &dummy_cost[mv_max+1] };
+
+ // Save input state
+ unsigned char **base_src = b->base_src;
+ int src = b->src;
+ int src_stride = b->src_stride;
+ unsigned char **base_pre = d->base_pre;
+ int pre = d->pre;
+ int pre_stride = d->pre_stride;
+
+ // Setup frame pointers
+ b->base_src = &arf_frame->y_buffer;
+ b->src_stride = arf_frame->y_stride;
+ b->src = mb_offset;
+
+ d->base_pre = &frame_ptr->y_buffer;
+ d->pre_stride = frame_ptr->y_stride;
+ d->pre = mb_offset;
+
+ // Further step/diamond searches as necessary
+ if (cpi->Speed < 8)
+ {
+ step_param = cpi->sf.first_step +
+ ((cpi->Speed > 5) ? 1 : 0);
+ further_steps =
+ (cpi->sf.max_step_search_steps - 1)-step_param;
+ }
+ else
+ {
+ step_param = cpi->sf.first_step + 2;
+ further_steps = 0;
+ }
+
+ if (1/*cpi->sf.search_method == HEX*/)
+ {
+ // TODO Check that the 16x16 vf & sdf are selected here
+ bestsme = vp8_hex_search(x, b, d,
+ &best_ref_mv1, &d->bmi.mv.as_mv,
+ step_param,
+ sadpb/*x->errorperbit*/,
+ &num00, &cpi->fn_ptr[BLOCK_16X16],
+ mvsadcost, mvcost);
+ }
+ else
+ {
+ int mv_x, mv_y;
+
+ bestsme = cpi->diamond_search_sad(x, b, d,
+ &best_ref_mv1, &d->bmi.mv.as_mv,
+ step_param,
+ sadpb / 2/*x->errorperbit*/,
+ &num00, &cpi->fn_ptr[BLOCK_16X16],
+ mvsadcost, mvcost); //sadpb < 9
+
+ // Further step/diamond searches as necessary
+ n = 0;
+ //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme = cpi->diamond_search_sad(x, b, d,
+ &best_ref_mv1, &d->bmi.mv.as_mv,
+ step_param + n,
+ sadpb / 4/*x->errorperbit*/,
+ &num00, &cpi->fn_ptr[BLOCK_16X16],
+ mvsadcost, mvcost); //sadpb = 9
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mv_y = d->bmi.mv.as_mv.row;
+ mv_x = d->bmi.mv.as_mv.col;
+ }
+ else
+ {
+ d->bmi.mv.as_mv.row = mv_y;
+ d->bmi.mv.as_mv.col = mv_x;
+ }
+ }
+ }
+ }
+
+#if ALT_REF_SUBPEL_ENABLED
+ // Try sub-pixel MC?
+ //if (bestsme > error_thresh && bestsme < INT_MAX)
+ {
+ bestsme = cpi->find_fractional_mv_step(x, b, d,
+ &d->bmi.mv.as_mv, &best_ref_mv1,
+ x->errorperbit, &cpi->fn_ptr[BLOCK_16X16],
+ cpi->mb.mvcost);
+ }
+#endif
+
+ // Save input state
+ b->base_src = base_src;
+ b->src = src;
+ b->src_stride = src_stride;
+ d->base_pre = base_pre;
+ d->pre = pre;
+ d->pre_stride = pre_stride;
+
+ return bestsme;
+}
+#endif
+
+static void vp8cx_temp_blur1_c
+(
+ VP8_COMP *cpi,
+ int frame_count,
+ int alt_ref_index,
+ int strength
+)
+{
+ int byte;
+ int frame;
+ int mb_col, mb_row;
+ unsigned int filter_weight[MAX_LAG_BUFFERS];
+ unsigned char *mm_ptr = cpi->fp_motion_map;
+ int cols = cpi->common.mb_cols;
+ int rows = cpi->common.mb_rows;
+ int MBs = cpi->common.MBs;
+ int mb_y_offset = 0;
+ int mb_uv_offset = 0;
+ unsigned int accumulator[384];
+ unsigned int count[384];
+ MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+ unsigned char *dst1, *dst2;
+ DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+
+ // Save input state
+ unsigned char *y_buffer = mbd->pre.y_buffer;
+ unsigned char *u_buffer = mbd->pre.u_buffer;
+ unsigned char *v_buffer = mbd->pre.v_buffer;
+
+ if (!cpi->use_weighted_temporal_filter)
+ {
+ // Temporal filtering is unweighted
+ for (frame = 0; frame < frame_count; frame++)
+ filter_weight[frame] = 1;
+ }
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+#if ALT_REF_MC_ENABLED
+ // Reduced search extent by 3 for 6-tap filter & smaller UMV border
+ cpi->mb.mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 19));
+ cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+ + (VP8BORDERINPIXELS - 19);
+#endif
+
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+ int i, j, k, w;
+ int weight_cap;
+ int stride;
+
+ vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
+ vpx_memset(count, 0, 384*sizeof(unsigned int));
+
+#if ALT_REF_MC_ENABLED
+ // Reduced search extent by 3 for 6-tap filter & smaller UMV border
+ cpi->mb.mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 19));
+ cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+ + (VP8BORDERINPIXELS - 19);
+#endif
+
+ // Read & process macroblock weights from motion map
+ if (cpi->use_weighted_temporal_filter)
+ {
+ weight_cap = 2;
+
+ for (frame = alt_ref_index-1; frame >= 0; frame--)
+ {
+ w = *(mm_ptr + (frame+1)*MBs);
+ filter_weight[frame] = w < weight_cap ? w : weight_cap;
+ weight_cap = w;
+ }
+
+ filter_weight[alt_ref_index] = 2;
+
+ weight_cap = 2;
+
+ for (frame = alt_ref_index+1; frame < frame_count; frame++)
+ {
+ w = *(mm_ptr + frame*MBs);
+ filter_weight[frame] = w < weight_cap ? w : weight_cap;
+ weight_cap = w;
+ }
+
+ }
+
+ for (frame = 0; frame < frame_count; frame++)
+ {
+ int err;
+
+ if (cpi->frames[frame] == NULL)
+ continue;
+
+ mbd->block[0].bmi.mv.as_mv.row = 0;
+ mbd->block[0].bmi.mv.as_mv.col = 0;
+
+#if ALT_REF_MC_ENABLED
+ //if (filter_weight[frame] == 0)
+ {
+#define THRESH_LOW 10000
+#define THRESH_HIGH 20000
+
+ // Correlation has been lost try MC
+ err = find_matching_mb ( cpi,
+ cpi->frames[alt_ref_index],
+ cpi->frames[frame],
+ mb_y_offset,
+ THRESH_LOW );
+
+ if (filter_weight[frame] < 2)
+ {
+ // Set weight depending on error
+ filter_weight[frame] = err<THRESH_LOW
+ ? 2 : err<THRESH_HIGH ? 1 : 0;
+ }
+ }
+#endif
+ if (filter_weight[frame] != 0)
+ {
+ // Construct the predictors
+ build_predictors_mb (
+ mbd,
+ cpi->frames[frame]->y_buffer + mb_y_offset,
+ cpi->frames[frame]->u_buffer + mb_uv_offset,
+ cpi->frames[frame]->v_buffer + mb_uv_offset,
+ cpi->frames[frame]->y_stride,
+ mbd->block[0].bmi.mv.as_mv.row,
+ mbd->block[0].bmi.mv.as_mv.col,
+ predictor );
+
+ // Apply the filter (YUV)
+ apply_temporal_filter ( f->y_buffer + mb_y_offset,
+ f->y_stride,
+ predictor,
+ 16,
+ strength,
+ filter_weight[frame],
+ accumulator,
+ count );
+
+ apply_temporal_filter ( f->u_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 256,
+ 8,
+ strength,
+ filter_weight[frame],
+ accumulator + 256,
+ count + 256 );
+
+ apply_temporal_filter ( f->v_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 320,
+ 8,
+ strength,
+ filter_weight[frame],
+ accumulator + 320,
+ count + 320 );
+ }
+ }
+
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.source_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.source_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0,k = 0; i < 16; i++)
+ {
+ for (j = 0; j < 16; j++, k++)
+ {
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= cpi->fixed_divide[count[k]];
+ pval >>= 19;
+
+ dst1[byte] = (unsigned char)pval;
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.source_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.source_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.source_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0,k = 256; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++, k++)
+ {
+ int m=k+64;
+
+ // U
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= cpi->fixed_divide[count[k]];
+ pval >>= 19;
+ dst1[byte] = (unsigned char)pval;
+
+ // V
+ pval = accumulator[m] + (count[m] >> 1);
+ pval *= cpi->fixed_divide[count[m]];
+ pval >>= 19;
+ dst2[byte] = (unsigned char)pval;
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - 8;
+ }
+
+ mm_ptr++;
+ mb_y_offset += 16;
+ mb_uv_offset += 8;
+ }
+
+ mb_y_offset += 16*f->y_stride-f->y_width;
+ mb_uv_offset += 8*f->uv_stride-f->uv_width;
+ }
+
+ // Restore input state
+ mbd->pre.y_buffer = y_buffer;
+ mbd->pre.u_buffer = u_buffer;
+ mbd->pre.v_buffer = v_buffer;
+}
+
+void vp8cx_temp_filter_c
+(
+ VP8_COMP *cpi
+)
+{
+ int frame = 0;
+
+ int num_frames_backward = 0;
+ int num_frames_forward = 0;
+ int frames_to_blur_backward = 0;
+ int frames_to_blur_forward = 0;
+ int frames_to_blur = 0;
+ int start_frame = 0;
+ unsigned int filtered = 0;
+
+ int strength = cpi->oxcf.arnr_strength;
+
+ int blur_type = cpi->oxcf.arnr_type;
+
+ int max_frames = cpi->active_arnr_frames;
+
+ num_frames_backward = cpi->last_alt_ref_sei - cpi->source_encode_index;
+
+ if (num_frames_backward < 0)
+ num_frames_backward += cpi->oxcf.lag_in_frames;
+
+ num_frames_forward = cpi->oxcf.lag_in_frames - (num_frames_backward + 1);
+
+ switch (blur_type)
+ {
+ case 1:
+ /////////////////////////////////////////
+ // Backward Blur
+
+ frames_to_blur_backward = num_frames_backward;
+
+ if (frames_to_blur_backward >= max_frames)
+ frames_to_blur_backward = max_frames - 1;
+
+ frames_to_blur = frames_to_blur_backward + 1;
+ break;
+
+ case 2:
+ /////////////////////////////////////////
+ // Forward Blur
+
+ frames_to_blur_forward = num_frames_forward;
+
+ if (frames_to_blur_forward >= max_frames)
+ frames_to_blur_forward = max_frames - 1;
+
+ frames_to_blur = frames_to_blur_forward + 1;
+ break;
+
+ case 3:
+ default:
+ /////////////////////////////////////////
+ // Center Blur
+ frames_to_blur_forward = num_frames_forward;
+ frames_to_blur_backward = num_frames_backward;
+
+ if (frames_to_blur_forward > frames_to_blur_backward)
+ frames_to_blur_forward = frames_to_blur_backward;
+
+ if (frames_to_blur_backward > frames_to_blur_forward)
+ frames_to_blur_backward = frames_to_blur_forward;
+
+ // When max_frames is even we have 1 more frame backward than forward
+ if (frames_to_blur_forward > (max_frames - 1) / 2)
+ frames_to_blur_forward = ((max_frames - 1) / 2);
+
+ if (frames_to_blur_backward > (max_frames / 2))
+ frames_to_blur_backward = (max_frames / 2);
+
+ frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+ break;
+ }
+
+ start_frame = (cpi->last_alt_ref_sei
+ + frames_to_blur_forward) % cpi->oxcf.lag_in_frames;
+
+#ifdef DEBUGFWG
+ // DEBUG FWG
+ printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
+ , max_frames
+ , num_frames_backward
+ , num_frames_forward
+ , frames_to_blur
+ , frames_to_blur_backward
+ , frames_to_blur_forward
+ , cpi->source_encode_index
+ , cpi->last_alt_ref_sei
+ , start_frame);
+#endif
+
+ // Setup frame pointers, NULL indicates frame not included in filter
+ vpx_memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
+ for (frame = 0; frame < frames_to_blur; frame++)
+ {
+ int which_buffer = start_frame - frame;
+
+ if (which_buffer < 0)
+ which_buffer += cpi->oxcf.lag_in_frames;
+
+ cpi->frames[frames_to_blur-1-frame]
+ = &cpi->src_buffer[which_buffer].source_buffer;
+ }
+
+ vp8cx_temp_blur1_c (
+ cpi,
+ frames_to_blur,
+ frames_to_blur_backward,
+ strength );
+}
+#endif
diff --git a/vp8/encoder/temporal_filter.h b/vp8/encoder/temporal_filter.h
new file mode 100644
index 000000000..f70e8c01e
--- /dev/null
+++ b/vp8/encoder/temporal_filter.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_TEMPORAL_FILTER_H
+#define __INC_VP8_TEMPORAL_FILTER_H
+
+#include "onyx_int.h"
+
+void vp8cx_temp_filter_c(VP8_COMP *cpi);
+
+#endif // __INC_VP8_TEMPORAL_FILTER_H
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index d9b8d36fd..e4da83379 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -26,8 +26,8 @@ _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef
void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
void vp8_fix_contexts(MACROBLOCKD *x);
-TOKENEXTRA vp8_dct_value_tokens[DCT_MAX_VALUE*2];
-const TOKENEXTRA *vp8_dct_value_tokens_ptr;
+TOKENVALUE vp8_dct_value_tokens[DCT_MAX_VALUE*2];
+const TOKENVALUE *vp8_dct_value_tokens_ptr;
int vp8_dct_value_cost[DCT_MAX_VALUE*2];
const int *vp8_dct_value_cost_ptr;
#if 0
@@ -37,7 +37,7 @@ int skip_false_count = 0;
static void fill_value_tokens()
{
- TOKENEXTRA *const t = vp8_dct_value_tokens + DCT_MAX_VALUE;
+ TOKENVALUE *const t = vp8_dct_value_tokens + DCT_MAX_VALUE;
vp8_extra_bit_struct *const e = vp8_extra_bits;
int i = -DCT_MAX_VALUE;
@@ -198,6 +198,28 @@ static void tokenize1st_order_b
}
+
+static int mb_is_skippable(MACROBLOCKD *x)
+{
+ int has_y2_block;
+ int skip = 1;
+ int i = 0;
+
+ has_y2_block = (x->mode_info_context->mbmi.mode != B_PRED
+ && x->mode_info_context->mbmi.mode != SPLITMV);
+ if (has_y2_block)
+ {
+ for (i = 0; i < 16; i++)
+ skip &= (x->block[i].eob < 2);
+ }
+
+ for (; i < 24 + has_y2_block; i++)
+ skip &= (!x->block[i].eob);
+
+ return skip;
+}
+
+
void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
{
ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
@@ -223,6 +245,7 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
#if 1
+ x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x);
if (x->mode_info_context->mbmi.mb_skip_coeff)
{
@@ -247,35 +270,6 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
cpi->skip_false_count++;
#endif
#if 0
-
- if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
- {
- int i, skip = 1;
-
- for (i = 0; i < 24; i++)
- skip &= (!x->block[i].eob);
-
- if (skip != x->mbmi.mb_skip_coeff)
- skip += 0;
-
- x->mbmi.mb_skip_coeff = skip;
- }
- else
- {
- int i, skip = 1;
-
- for (i = 0; i < 16; i++)
- skip &= (x->block[i].eob < 2);
-
- for (i = 16; i < 25; i++)
- skip &= (!x->block[i].eob);
-
- if (skip != x->mbmi.mb_skip_coeff)
- skip += 0;
-
- x->mbmi.mb_skip_coeff = skip;
- }
-
vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts));
#endif
diff --git a/vp8/encoder/tokenize.h b/vp8/encoder/tokenize.h
index 7b9fc9eaa..01e8ec6d7 100644
--- a/vp8/encoder/tokenize.h
+++ b/vp8/encoder/tokenize.h
@@ -19,6 +19,12 @@ void vp8_tokenize_initialize();
typedef struct
{
+ short Token;
+ short Extra;
+} TOKENVALUE;
+
+typedef struct
+{
int Token;
int Extra;
const vp8_prob *context_tree;
@@ -40,6 +46,6 @@ extern const int *vp8_dct_value_cost_ptr;
* improve cache locality, since it's needed for costing when the rest of the
* fields are not.
*/
-extern const TOKENEXTRA *vp8_dct_value_tokens_ptr;
+extern const TOKENVALUE *vp8_dct_value_tokens_ptr;
#endif /* tokenize_h */
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 0341fbd9f..5befd3b86 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -15,9 +15,9 @@
#define prototype_sad(sym)\
unsigned int (sym)\
(\
- unsigned char *src_ptr, \
+ const unsigned char *src_ptr, \
int source_stride, \
- unsigned char *ref_ptr, \
+ const unsigned char *ref_ptr, \
int ref_stride, \
int max_sad\
)
@@ -25,17 +25,27 @@
#define prototype_sad_multi_same_address(sym)\
void (sym)\
(\
- unsigned char *src_ptr, \
+ const unsigned char *src_ptr, \
int source_stride, \
- unsigned char *ref_ptr, \
+ const unsigned char *ref_ptr, \
int ref_stride, \
unsigned int *sad_array\
)
+#define prototype_sad_multi_same_address_1(sym)\
+ void (sym)\
+ (\
+ const unsigned char *src_ptr, \
+ int source_stride, \
+ const unsigned char *ref_ptr, \
+ int ref_stride, \
+ unsigned short *sad_array\
+ )
+
#define prototype_sad_multi_dif_address(sym)\
void (sym)\
(\
- unsigned char *src_ptr, \
+ const unsigned char *src_ptr, \
int source_stride, \
unsigned char *ref_ptr[4], \
int ref_stride, \
@@ -45,9 +55,9 @@
#define prototype_variance(sym) \
unsigned int (sym) \
(\
- unsigned char *src_ptr, \
+ const unsigned char *src_ptr, \
int source_stride, \
- unsigned char *ref_ptr, \
+ const unsigned char *ref_ptr, \
int ref_stride, \
unsigned int *sse\
)
@@ -55,9 +65,9 @@
#define prototype_variance2(sym) \
unsigned int (sym) \
(\
- unsigned char *src_ptr, \
+ const unsigned char *src_ptr, \
int source_stride, \
- unsigned char *ref_ptr, \
+ const unsigned char *ref_ptr, \
int ref_stride, \
unsigned int *sse,\
int *sum\
@@ -66,17 +76,17 @@
#define prototype_subpixvariance(sym) \
unsigned int (sym) \
( \
- unsigned char *src_ptr, \
+ const unsigned char *src_ptr, \
int source_stride, \
int xoffset, \
int yoffset, \
- unsigned char *ref_ptr, \
+ const unsigned char *ref_ptr, \
int Refstride, \
unsigned int *sse \
);
-#define prototype_getmbss(sym) unsigned int (sym)(short *)
+#define prototype_getmbss(sym) unsigned int (sym)(const short *)
#if ARCH_X86 || ARCH_X86_64
#include "x86/variance_x86.h"
@@ -138,6 +148,31 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
#endif
extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
+#ifndef vp8_variance_sad16x16x8
+#define vp8_variance_sad16x16x8 vp8_sad16x16x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad16x16x8);
+
+#ifndef vp8_variance_sad16x8x8
+#define vp8_variance_sad16x8x8 vp8_sad16x8x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad16x8x8);
+
+#ifndef vp8_variance_sad8x8x8
+#define vp8_variance_sad8x8x8 vp8_sad8x8x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad8x8x8);
+
+#ifndef vp8_variance_sad8x16x8
+#define vp8_variance_sad8x16x8 vp8_sad8x16x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad8x16x8);
+
+#ifndef vp8_variance_sad4x4x8
+#define vp8_variance_sad4x4x8 vp8_sad4x4x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8);
+
//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
#ifndef vp8_variance_sad16x16x4d
@@ -219,6 +254,21 @@ extern prototype_subpixvariance(vp8_variance_subpixvar16x8);
#endif
extern prototype_subpixvariance(vp8_variance_subpixvar16x16);
+#ifndef vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar16x16_h);
+
+#ifndef vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar16x16_v);
+
+#ifndef vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv);
+
#ifndef vp8_variance_subpixmse16x16
#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_c
#endif
@@ -259,6 +309,7 @@ extern prototype_sad(vp8_variance_get4x4sse_cs);
typedef prototype_sad(*vp8_sad_fn_t);
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
+typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t);
typedef prototype_variance(*vp8_variance_fn_t);
typedef prototype_variance2(*vp8_variance2_fn_t);
@@ -283,6 +334,9 @@ typedef struct
vp8_subpixvariance_fn_t subpixvar8x16;
vp8_subpixvariance_fn_t subpixvar16x8;
vp8_subpixvariance_fn_t subpixvar16x16;
+ vp8_variance_fn_t halfpixvar16x16_h;
+ vp8_variance_fn_t halfpixvar16x16_v;
+ vp8_variance_fn_t halfpixvar16x16_hv;
vp8_subpixvariance_fn_t subpixmse16x16;
vp8_getmbss_fn_t getmbss;
@@ -299,6 +353,12 @@ typedef struct
vp8_sad_multi_fn_t sad8x8x3;
vp8_sad_multi_fn_t sad4x4x3;
+ vp8_sad_multi1_fn_t sad16x16x8;
+ vp8_sad_multi1_fn_t sad16x8x8;
+ vp8_sad_multi1_fn_t sad8x16x8;
+ vp8_sad_multi1_fn_t sad8x8x8;
+ vp8_sad_multi1_fn_t sad4x4x8;
+
vp8_sad_multi_d_fn_t sad16x16x4d;
vp8_sad_multi_d_fn_t sad16x8x4d;
vp8_sad_multi_d_fn_t sad8x16x4d;
@@ -309,11 +369,15 @@ typedef struct
typedef struct
{
- vp8_sad_fn_t sdf;
- vp8_sad_multi_fn_t sdx3f;
- vp8_sad_multi_d_fn_t sdx4df;
- vp8_variance_fn_t vf;
+ vp8_sad_fn_t sdf;
+ vp8_variance_fn_t vf;
vp8_subpixvariance_fn_t svf;
+ vp8_variance_fn_t svf_halfpix_h;
+ vp8_variance_fn_t svf_halfpix_v;
+ vp8_variance_fn_t svf_halfpix_hv;
+ vp8_sad_multi_fn_t sdx3f;
+ vp8_sad_multi1_fn_t sdx8f;
+ vp8_sad_multi_d_fn_t sdx4df;
} vp8_variance_fn_ptr_t;
#if CONFIG_RUNTIME_CPU_DETECT
@@ -322,7 +386,4 @@ typedef struct
#define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn
#endif
-/* TODO: Determine if this USEBILINEAR flag is necessary. */
-#define USEBILINEAR
-
#endif
diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c
index 179cd0d8e..95ec96cec 100644
--- a/vp8/encoder/variance_c.c
+++ b/vp8/encoder/variance_c.c
@@ -24,7 +24,6 @@ const int vp8_six_tap[8][6] =
};
-#ifdef USEBILINEAR
const int VP8_FILTER_WEIGHT = 128;
const int VP8_FILTER_SHIFT = 7;
const int vp8_bilinear_taps[8][2] =
@@ -41,7 +40,7 @@ const int vp8_bilinear_taps[8][2] =
unsigned int vp8_get_mb_ss_c
(
- short *src_ptr
+ const short *src_ptr
)
{
unsigned int i = 0, sum = 0;
@@ -58,9 +57,9 @@ unsigned int vp8_get_mb_ss_c
void vp8_variance(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
int w,
int h,
@@ -90,9 +89,9 @@ void vp8_variance(
unsigned int
vp8_get8x8var_c
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
@@ -106,9 +105,9 @@ vp8_get8x8var_c
unsigned int
vp8_get16x16var_c
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
@@ -123,9 +122,9 @@ vp8_get16x16var_c
unsigned int vp8_variance16x16_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -139,9 +138,9 @@ unsigned int vp8_variance16x16_c(
}
unsigned int vp8_variance8x16_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -155,9 +154,9 @@ unsigned int vp8_variance8x16_c(
}
unsigned int vp8_variance16x8_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -172,9 +171,9 @@ unsigned int vp8_variance16x8_c(
unsigned int vp8_variance8x8_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -188,9 +187,9 @@ unsigned int vp8_variance8x8_c(
}
unsigned int vp8_variance4x4_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -205,9 +204,9 @@ unsigned int vp8_variance4x4_c(
unsigned int vp8_mse16x16_c(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -250,7 +249,7 @@ unsigned int vp8_mse16x16_c(
****************************************************************************/
void vp8e_filter_block2d_bil_first_pass
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
@@ -308,7 +307,7 @@ void vp8e_filter_block2d_bil_first_pass
****************************************************************************/
void vp8e_filter_block2d_bil_second_pass
(
- unsigned short *src_ptr,
+ const unsigned short *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
@@ -366,7 +365,7 @@ void vp8e_filter_block2d_bil_second_pass
****************************************************************************/
void vp8e_filter_block2d_bil
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
int *HFilter,
@@ -387,11 +386,11 @@ void vp8e_filter_block2d_bil
unsigned int vp8_sub_pixel_variance4x4_c
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -415,11 +414,11 @@ unsigned int vp8_sub_pixel_variance4x4_c
unsigned int vp8_sub_pixel_variance8x8_c
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -439,11 +438,11 @@ unsigned int vp8_sub_pixel_variance8x8_c
unsigned int vp8_sub_pixel_variance16x16_c
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -461,13 +460,50 @@ unsigned int vp8_sub_pixel_variance16x16_c
return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
+
+unsigned int vp8_variance_halfpixvar16x16_h_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
+ ref_ptr, recon_stride, sse);
+}
+
+
unsigned int vp8_sub_pixel_mse16x16_c
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -478,11 +514,11 @@ unsigned int vp8_sub_pixel_mse16x16_c
unsigned int vp8_sub_pixel_variance16x8_c
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -502,11 +538,11 @@ unsigned int vp8_sub_pixel_variance16x8_c
unsigned int vp8_sub_pixel_variance8x16_c
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -525,4 +561,3 @@ unsigned int vp8_sub_pixel_variance8x16_c
return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
}
-#endif
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
deleted file mode 100644
index 9fb67613d..000000000
--- a/vp8/encoder/x86/csystemdependent.c
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "variance.h"
-#include "onyx_int.h"
-
-SADFunction *vp8_sad16x16;
-SADFunction *vp8_sad16x8;
-SADFunction *vp8_sad8x16;
-SADFunction *vp8_sad8x8;
-SADFunction *vp8_sad4x4;
-
-variance_function *vp8_variance4x4;
-variance_function *vp8_variance8x8;
-variance_function *vp8_variance8x16;
-variance_function *vp8_variance16x8;
-variance_function *vp8_variance16x16;
-
-
-variance_function *vp8_mse16x16;
-
-sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
-
-int (*vp8_block_error)(short *, short *);
-int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
-void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-
-extern void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride);
-
-extern int vp8_block_error_c(short *, short *);
-extern int vp8_mbblock_error_c(MACROBLOCK *x, int dc);
-
-extern int vp8_block_error_mmx(short *, short *);
-extern int vp8_mbblock_error_mmx(MACROBLOCK *x, int dc);
-
-extern int vp8_block_error_xmm(short *, short *);
-extern int vp8_mbblock_error_xmm(MACROBLOCK *x, int dc);
-
-
-
-int (*vp8_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp8_get_mb_ss)(short *);
-void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-
-void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// c imports
-extern int vp8_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
-extern void vp8_fast_fdct8x4_c(short *input, short *output, int pitch);
-
-
-extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction vp8_sad16x16_c;
-extern SADFunction vp8_sad16x8_c;
-extern SADFunction vp8_sad8x16_c;
-extern SADFunction vp8_sad8x8_c;
-extern SADFunction vp8_sad4x4_c;
-
-extern SADFunction vp8_sad16x16_wmt;
-extern SADFunction vp8_sad16x8_wmt;
-extern SADFunction vp8_sad8x16_wmt;
-extern SADFunction vp8_sad8x8_wmt;
-extern SADFunction vp8_sad4x4_wmt;
-
-extern SADFunction vp8_sad16x16_mmx;
-extern SADFunction vp8_sad16x8_mmx;
-extern SADFunction vp8_sad8x16_mmx;
-extern SADFunction vp8_sad8x8_mmx;
-extern SADFunction vp8_sad4x4_mmx;
-
-extern variance_function vp8_variance16x16_c;
-extern variance_function vp8_variance8x16_c;
-extern variance_function vp8_variance16x8_c;
-extern variance_function vp8_variance8x8_c;
-extern variance_function vp8_variance4x4_c;
-extern variance_function vp8_mse16x16_c;
-
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_c;
-
-extern unsigned int vp8_get_mb_ss_c(short *);
-extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// mmx imports
-extern int vp8_mbuverror_mmx(MACROBLOCK *mb);
-extern void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d);
-extern void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch);
-extern void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
-extern void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch);
-extern void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch);
-extern variance_function vp8_variance4x4_mmx;
-extern variance_function vp8_variance8x8_mmx;
-extern variance_function vp8_variance8x16_mmx;
-extern variance_function vp8_variance16x8_mmx;
-extern variance_function vp8_variance16x16_mmx;
-
-extern variance_function vp8_mse16x16_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_mmx;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_mmx;
-
-extern unsigned int vp8_get16x16pred_error_mmx(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-extern unsigned int vp8_get_mb_ss_mmx(short *);
-extern unsigned int vp8_get8x8var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get4x4sse_cs_mmx(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-
-// wmt imports
-extern int vp8_mbuverror_xmm(MACROBLOCK *mb);
-extern void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d);
-extern void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch);
-extern variance_function vp8_variance4x4_wmt;
-extern variance_function vp8_variance8x8_wmt;
-extern variance_function vp8_variance8x16_wmt;
-extern variance_function vp8_variance16x8_wmt;
-extern variance_function vp8_variance16x16_wmt;
-
-extern variance_function vp8_mse16x16_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_wmt;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_wmt;
-extern unsigned int vp8_get16x16pred_error_sse2(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-extern unsigned int vp8_get_mb_ss_sse2(short *src_ptr);
-extern unsigned int vp8_get8x8var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_sse2(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-
-extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
-
-void vp8_cmachine_specific_config(void)
-{
- int mmx_enabled;
- int xmm_enabled;
- int wmt_enabled;
-
- vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
-
- if (wmt_enabled) // Willamette
- {
- // Willamette instruction set available:
- vp8_mbuverror = vp8_mbuverror_xmm;
- /* The sse quantizer has not been updated to match the new exact
- * quantizer introduced in commit e04e2935
- */
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
-#if 0 //new fdct
- vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
- vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx;
- vp8_fast_fdct8x4 = vp8_short_fdct8x4_wmt;
-#else
- vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
- vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
-#endif
- vp8_subtract_b = vp8_subtract_b_mmx;
- vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
- vp8_variance4x4 = vp8_variance4x4_mmx;
- vp8_variance8x8 = vp8_variance8x8_mmx;
- vp8_variance8x16 = vp8_variance8x16_wmt;
- vp8_variance16x8 = vp8_variance16x8_wmt;
- vp8_variance16x16 = vp8_variance16x16_wmt;
- vp8_mse16x16 = vp8_mse16x16_wmt;
- vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
- vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
- vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
- vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
- vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
- vp8_get_mb_ss = vp8_get_mb_ss_sse2;
- vp8_get16x16pred_error = vp8_get16x16pred_error_sse2;
- vp8_get8x8var = vp8_get8x8var_sse2;
- vp8_get16x16var = vp8_get16x16var_sse2;
- vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
- vp8_sad16x16 = vp8_sad16x16_wmt;
- vp8_sad16x8 = vp8_sad16x8_wmt;
- vp8_sad8x16 = vp8_sad8x16_wmt;
- vp8_sad8x8 = vp8_sad8x8_wmt;
- vp8_sad4x4 = vp8_sad4x4_wmt;
- vp8_block_error = vp8_block_error_xmm;
- vp8_mbblock_error = vp8_mbblock_error_xmm;
- vp8_subtract_mby = vp8_subtract_mby_mmx;
-
- }
- else if (mmx_enabled)
- {
- // MMX instruction set available:
- vp8_mbuverror = vp8_mbuverror_mmx;
- /* The mmx quantizer has not been updated to match the new exact
- * quantizer introduced in commit e04e2935
- */
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
-#if 0 // new fdct
- vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
- vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx;
- vp8_fast_fdct8x4 = vp8_short_fdct8x4_mmx;
-#else
- vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
- vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
-#endif
- vp8_subtract_b = vp8_subtract_b_mmx;
- vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
- vp8_variance4x4 = vp8_variance4x4_mmx;
- vp8_variance8x8 = vp8_variance8x8_mmx;
- vp8_variance8x16 = vp8_variance8x16_mmx;
- vp8_variance16x8 = vp8_variance16x8_mmx;
- vp8_variance16x16 = vp8_variance16x16_mmx;
- vp8_mse16x16 = vp8_mse16x16_mmx;
- vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
- vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
- vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
- vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
- vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
- vp8_get_mb_ss = vp8_get_mb_ss_mmx;
- vp8_get16x16pred_error = vp8_get16x16pred_error_mmx;
- vp8_get8x8var = vp8_get8x8var_mmx;
- vp8_get16x16var = vp8_get16x16var_mmx;
- vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
- vp8_sad16x16 = vp8_sad16x16_mmx;
- vp8_sad16x8 = vp8_sad16x8_mmx;
- vp8_sad8x16 = vp8_sad8x16_mmx;
- vp8_sad8x8 = vp8_sad8x8_mmx;
- vp8_sad4x4 = vp8_sad4x4_mmx;
- vp8_block_error = vp8_block_error_mmx;
- vp8_mbblock_error = vp8_mbblock_error_mmx;
- vp8_subtract_mby = vp8_subtract_mby_mmx;
-
- }
- else
- {
- // Pure C:
- vp8_mbuverror = vp8_mbuverror_c;
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
- vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
- vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
- vp8_subtract_b = vp8_subtract_b_c;
- vp8_subtract_mbuv = vp8_subtract_mbuv_c;
- vp8_variance4x4 = vp8_variance4x4_c;
- vp8_variance8x8 = vp8_variance8x8_c;
- vp8_variance8x16 = vp8_variance8x16_c;
- vp8_variance16x8 = vp8_variance16x8_c;
- vp8_variance16x16 = vp8_variance16x16_c;
- vp8_mse16x16 = vp8_mse16x16_c;
- vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
- vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
- vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
- vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
- vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
- vp8_get_mb_ss = vp8_get_mb_ss_c;
- vp8_get16x16pred_error = vp8_get16x16pred_error_c;
- vp8_get8x8var = vp8_get8x8var_c;
- vp8_get16x16var = vp8_get16x16var_c;
- vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
- vp8_sad16x16 = vp8_sad16x16_c;
- vp8_sad16x8 = vp8_sad16x8_c;
- vp8_sad8x16 = vp8_sad8x16_c;
- vp8_sad8x8 = vp8_sad8x8_c;
- vp8_sad4x4 = vp8_sad4x4_c;
- vp8_block_error = vp8_block_error_c;
- vp8_mbblock_error = vp8_mbblock_error_c;
- vp8_subtract_mby = vp8_subtract_mby_c;
- }
-
-}
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
index b6cfc5ce0..5acaca875 100644
--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -35,7 +35,7 @@ sym(vp8_short_fdct4x4_mmx):
mov rsi, arg(0) ;input
mov rdi, arg(1) ;output
- lea rdx, [dct_const_mmx GLOBAL]
+ lea rdx, [GLOBAL(dct_const_mmx)]
movsxd rax, dword ptr arg(2) ;pitch
lea rcx, [rsi + rax*2]
@@ -243,7 +243,7 @@ sym(vp8_short_fdct8x4_wmt):
mov rsi, arg(0) ;input
mov rdi, arg(1) ;output
- lea rdx, [dct_const_xmm GLOBAL]
+ lea rdx, [GLOBAL(dct_const_xmm)]
movsxd rax, dword ptr arg(2) ;pitch
lea rcx, [rsi + rax*2]
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index f7a18432d..723a78d76 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -52,14 +52,14 @@ sym(vp8_short_fdct4x4_sse2):
psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
movdqa xmm1, xmm0
- pmaddwd xmm0, XMMWORD PTR[_mult_add GLOBAL] ;a1 + b1
- pmaddwd xmm1, XMMWORD PTR[_mult_sub GLOBAL] ;a1 - b1
+ pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
movdqa xmm4, xmm3
- pmaddwd xmm3, XMMWORD PTR[_5352_2217 GLOBAL] ;c1*2217 + d1*5352
- pmaddwd xmm4, XMMWORD PTR[_2217_neg5352 GLOBAL] ;d1*2217 - c1*5352
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
- paddd xmm3, XMMWORD PTR[_14500 GLOBAL]
- paddd xmm4, XMMWORD PTR[_7500 GLOBAL]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_7500)]
psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
@@ -80,7 +80,7 @@ sym(vp8_short_fdct4x4_sse2):
punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
- movdqa xmm5, XMMWORD PTR[_7 GLOBAL]
+ movdqa xmm5, XMMWORD PTR[GLOBAL(_7)]
pshufd xmm2, xmm2, 04eh
movdqa xmm3, xmm0
paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
@@ -94,8 +94,8 @@ sym(vp8_short_fdct4x4_sse2):
pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
movdqa xmm1, xmm0
- pmaddwd xmm0, XMMWORD PTR[_mult_add GLOBAL] ;a1 + b1
- pmaddwd xmm1, XMMWORD PTR[_mult_sub GLOBAL] ;a1 - b1
+ pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
pxor xmm4, xmm4 ;zero out for compare
paddd xmm0, xmm5
@@ -103,14 +103,14 @@ sym(vp8_short_fdct4x4_sse2):
pcmpeqw xmm2, xmm4
psrad xmm0, 4 ;(a1 + b1 + 7)>>4
psrad xmm1, 4 ;(a1 - b1 + 7)>>4
- pandn xmm2, XMMWORD PTR[_cmp_mask GLOBAL] ;clear upper,
- ;and keep bit 0 of lower
+ pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
+ ;and keep bit 0 of lower
movdqa xmm4, xmm3
- pmaddwd xmm3, XMMWORD PTR[_5352_2217 GLOBAL] ;c1*2217 + d1*5352
- pmaddwd xmm4, XMMWORD PTR[_2217_neg5352 GLOBAL] ;d1*2217 - c1*5352
- paddd xmm3, XMMWORD PTR[_12000 GLOBAL]
- paddd xmm4, XMMWORD PTR[_51000 GLOBAL]
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
+ paddd xmm3, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_51000)]
packssdw xmm0, xmm1 ;op[8] op[0]
psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h
index d090b2d89..69b3edd66 100644
--- a/vp8/encoder/x86/encodemb_x86.h
+++ b/vp8/encoder/x86/encodemb_x86.h
@@ -55,7 +55,9 @@ extern prototype_submbuv(vp8_subtract_mbuv_mmx);
extern prototype_berr(vp8_block_error_xmm);
extern prototype_mberr(vp8_mbblock_error_xmm);
extern prototype_mbuverr(vp8_mbuverror_xmm);
-
+extern prototype_subb(vp8_subtract_b_sse2);
+extern prototype_submby(vp8_subtract_mby_sse2);
+extern prototype_submbuv(vp8_subtract_mbuv_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_encodemb_berr
@@ -67,6 +69,15 @@ extern prototype_mbuverr(vp8_mbuverror_xmm);
#undef vp8_encodemb_mbuverr
#define vp8_encodemb_mbuverr vp8_mbuverror_xmm
+#undef vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_sse2
+
+#undef vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_sse2
+
+#undef vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_sse2
+
#endif
#endif
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index 413d74d61..c0f06bbbb 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -50,7 +50,7 @@ sym(vp8_block_error_xmm):
psrldq xmm0, 8
paddd xmm0, xmm3
- movd rax, xmm0
+ movq rax, xmm0
pop rdi
pop rsi
@@ -115,7 +115,7 @@ sym(vp8_block_error_mmx):
psrlq mm1, 32
paddd mm0, mm1
- movd rax, mm0
+ movq rax, mm0
pop rdi
pop rsi
@@ -192,7 +192,7 @@ mberror_loop_mmx:
psrlq mm2, 32
paddd mm0, mm2
- movd rax, mm0
+ movq rax, mm0
pop rdi
pop rsi
@@ -260,7 +260,7 @@ mberror_loop:
psrldq xmm0, 8
paddd xmm0, xmm1
- movd rax, xmm0
+ movq rax, xmm0
pop rdi
pop rsi
@@ -317,7 +317,7 @@ mbuverror_loop_mmx:
psrlq mm7, 32
paddd mm0, mm7
- movd rax, mm0
+ movq rax, mm0
pop rdi
pop rsi
@@ -374,7 +374,7 @@ mbuverror_loop:
psrldq xmm1, 8
paddd xmm1, xmm2
- movd rax, xmm1
+ movq rax, xmm1
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
index 38812c8d1..39439f0d8 100644
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -17,102 +17,148 @@ sym(vp8_short_walsh4x4_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM
+ GET_GOT rbx
push rsi
push rdi
; end prolog
- mov rsi, arg(0)
- mov rdi, arg(1)
-
- movdqu xmm4, [rsi + 0] ;ip[4] ip[0]
- movdqu xmm0, [rsi + 16] ;ip[12] ip[8]
-
- pxor xmm7, xmm7
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm4 ;ip[4] ip[0]
-
- paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
+ mov rsi, arg(0) ; input
+ mov rdi, arg(1) ; output
+ movsxd rdx, dword ptr arg(2) ; pitch
+
+ ; first for loop
+ movq xmm0, MMWORD PTR [rsi] ; load input
+ movq xmm1, MMWORD PTR [rsi + rdx]
+ lea rsi, [rsi + rdx*2]
+ movq xmm2, MMWORD PTR [rsi]
+ movq xmm3, MMWORD PTR [rsi + rdx]
+
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ; ip[1] ip[0]
+ punpckhdq xmm1, xmm2 ; ip[3] ip[2]
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ psllw xmm0, 2 ; d1 a1
+ psllw xmm2, 2 ; c1 b1
+
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2 ; b1 a1
+ punpckhqdq xmm1, xmm2 ; c1 d1
+
+ pxor xmm6, xmm6
+ movq xmm6, xmm0
+ pxor xmm7, xmm7
+ pcmpeqw xmm7, xmm6
+ paddw xmm7, [GLOBAL(c1)]
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1 ; b1+c1 a1+d1
+ psubw xmm2, xmm1 ; b1-c1 a1-d1
+ paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0)
+
+ ; second for loop
+ ; input: 13 9 5 1 12 8 4 0 (xmm0)
+ ; 14 10 6 2 15 11 7 3 (xmm2)
+ ; after shuffle:
+ ; 13 5 9 1 12 4 8 0 (xmm0)
+ ; 14 6 10 2 15 7 11 3 (xmm1)
+ pshuflw xmm3, xmm0, 0xd8
+ pshufhw xmm0, xmm3, 0xd8
+ pshuflw xmm3, xmm2, 0xd8
+ pshufhw xmm1, xmm3, 0xd8
+
+ movdqa xmm2, xmm0
+ pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10
+ pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10
+ movdqa xmm3, xmm1
+ pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13
+ pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13
+
+ pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10
+ pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10
+ pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12
+ pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12
+
+ movdqa xmm0, xmm4
+ punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10
+ punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10
+ movdqa xmm1, xmm6
+ punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12
+ punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12
+
+ movdqa xmm2, xmm0
+ paddd xmm0, xmm4 ; b21 b20 a21 a20
+ psubd xmm2, xmm4 ; c21 c20 d21 d20
+ movdqa xmm3, xmm1
+ paddd xmm1, xmm6 ; b23 b22 a23 a22
+ psubd xmm3, xmm6 ; c23 c22 d23 d22
+
+ pxor xmm4, xmm4
movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm3 ;d1 a1
- punpckhqdq xmm5, xmm3 ;c1 b1
-
- movdqa xmm1, xmm5 ;c1 b1
- paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm5 ;ip[4] ip[0]
-
- paddw xmm5, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
- movdqa xmm6, xmm5
- punpcklqdq xmm5, xmm3 ;d1 a1
- punpckhqdq xmm6, xmm3 ;c1 b1
-
- movdqa xmm1, xmm6 ;c1 b1
- paddw xmm6, xmm5 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm5, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-
- movdqa xmm0, xmm6 ;aka b2 a2
- movdqa xmm1, xmm5 ;aka d2 c2
-
- pcmpgtw xmm0, xmm7
- pcmpgtw xmm1, xmm7
-
- psrlw xmm0, 15
- psrlw xmm1, 15
-
- paddw xmm6, xmm0
- paddw xmm5, xmm1
-
- psraw xmm6, 1
- psraw xmm5, 1
-
- ; a2 = a1 + b1;
- ; b2 = c1 + d1;
- ; c2 = a1 - b1;
- ; d2 = d1 - c1;
- ; a2 += (a2>0);
- ; b2 += (b2>0);
- ; c2 += (c2>0);
- ; d2 += (d2>0);
- ; op[0] = (a2)>>1;
- ; op[4] = (b2)>>1;
- ; op[8] = (c2)>>1;
- ; op[12]= (d2)>>1;
-
- movdqu [rdi + 0], xmm6
- movdqu [rdi + 16], xmm5
+ pcmpgtd xmm4, xmm0
+ pcmpgtd xmm5, xmm2
+ pand xmm4, [GLOBAL(cd1)]
+ pand xmm5, [GLOBAL(cd1)]
+
+ pxor xmm6, xmm6
+ movdqa xmm7, xmm6
+ pcmpgtd xmm6, xmm1
+ pcmpgtd xmm7, xmm3
+ pand xmm6, [GLOBAL(cd1)]
+ pand xmm7, [GLOBAL(cd1)]
+
+ paddd xmm0, xmm4
+ paddd xmm2, xmm5
+ paddd xmm0, [GLOBAL(cd3)]
+ paddd xmm2, [GLOBAL(cd3)]
+ paddd xmm1, xmm6
+ paddd xmm3, xmm7
+ paddd xmm1, [GLOBAL(cd3)]
+ paddd xmm3, [GLOBAL(cd3)]
+
+ psrad xmm0, 3
+ psrad xmm1, 3
+ psrad xmm2, 3
+ psrad xmm3, 3
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20
+ punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20
+ movdqa xmm5, xmm2
+ punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20
+ punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20
+
+ packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20
+ packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm2
; begin epilog
pop rdi
pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
+
+SECTION_RODATA
+align 16
+c1:
+ dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
+align 16
+cn1:
+ dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
+align 16
+cd1:
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+align 16
+cd3:
+ dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h
index e8d658b39..3b7b29c21 100644
--- a/vp8/encoder/x86/mcomp_x86.h
+++ b/vp8/encoder/x86/mcomp_x86.h
@@ -24,5 +24,14 @@
#endif
#endif
+#if HAVE_SSE4_1
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sadx8
+
+#endif
+#endif
+
#endif
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
index a867409b5..f29a54ecd 100644
--- a/vp8/encoder/x86/quantize_mmx.asm
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -249,7 +249,7 @@ sym(vp8_fast_quantize_b_impl_mmx):
paddd mm0, mm5
; eob adjustment begins here
- movd rcx, mm0
+ movq rcx, mm0
and rcx, 0xffff
xor rdx, rdx
@@ -262,7 +262,7 @@ sym(vp8_fast_quantize_b_impl_mmx):
and rax, rdx
; Substitute the sse assembly for the old mmx mixed assembly/C. The
; following is kept as reference
- ; movd rcx, mm0
+ ; movq rcx, mm0
; bsr rax, rcx
;
; mov eob, rax
@@ -284,156 +284,3 @@ sym(vp8_fast_quantize_b_impl_mmx):
UNSHADOW_ARGS
pop rbp
ret
-
-
-;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
-; short *qcoeff_ptr,short *dequant_ptr,
-; short *scan_mask, short *round_ptr,
-; short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_sse)
-sym(vp8_fast_quantize_b_impl_sse):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- movdqa xmm0, [rsi]
-
- mov rax, arg(1) ;zbin_ptr
- movdqa xmm1, [rax]
-
- movdqa xmm3, xmm0
- psraw xmm0, 15
-
- pxor xmm3, xmm0
- psubw xmm3, xmm0 ; abs
-
- movdqa xmm2, xmm3
- pcmpgtw xmm1, xmm2
-
- pandn xmm1, xmm2
- movdqa xmm3, xmm1
-
- mov rdx, arg(6) ; quant_ptr
- movdqa xmm1, [rdx]
-
- mov rcx, arg(5) ; round_ptr
- movdqa xmm2, [rcx]
-
- paddw xmm3, xmm2
- pmulhuw xmm3, xmm1
-
- pxor xmm3, xmm0
- psubw xmm3, xmm0 ;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
- movdqa xmm0, xmm3
-
- movdqa [rdi], xmm3
-
- mov rax, arg(3) ;dequant_ptr
- movdqa xmm2, [rax]
-
- pmullw xmm3, xmm2
- mov rax, arg(7) ;dqcoeff_ptr
-
- movdqa [rax], xmm3
-
- ; next 8
- movdqa xmm4, [rsi+16]
-
- mov rax, arg(1) ;zbin_ptr
- movdqa xmm5, [rax+16]
-
- movdqa xmm7, xmm4
- psraw xmm4, 15
-
- pxor xmm7, xmm4
- psubw xmm7, xmm4 ; abs
-
- movdqa xmm6, xmm7
- pcmpgtw xmm5, xmm6
-
- pandn xmm5, xmm6
- movdqa xmm7, xmm5
-
- movdqa xmm5, [rdx+16]
- movdqa xmm6, [rcx+16]
-
-
- paddw xmm7, xmm6
- pmulhuw xmm7, xmm5
-
- pxor xmm7, xmm4
- psubw xmm7, xmm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movdqa xmm1, xmm7
- movdqa [rdi+16], xmm7
-
- mov rax, arg(3) ;dequant_ptr
- movdqa xmm6, [rax+16]
-
- pmullw xmm7, xmm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movdqa [rax+16], xmm7
- mov rdi, arg(4) ;scan_mask
-
- pxor xmm7, xmm7
- movdqa xmm2, [rdi]
-
- movdqa xmm3, [rdi+16];
- pcmpeqw xmm0, xmm7
-
- pcmpeqw xmm1, xmm7
- pcmpeqw xmm6, xmm6
-
- pxor xmm0, xmm6
- pxor xmm1, xmm6
-
- psrlw xmm0, 15
- psrlw xmm1, 15
-
- pmaddwd xmm0, xmm2
- pmaddwd xmm1, xmm3
-
- movq xmm2, xmm0
- movq xmm3, xmm1
-
- psrldq xmm0, 8
- psrldq xmm1, 8
-
- paddd xmm0, xmm1
- paddd xmm2, xmm3
-
- paddd xmm0, xmm2
- movq xmm1, xmm0
-
- psrldq xmm0, 4
- paddd xmm1, xmm0
-
- movd rcx, xmm1
- and rcx, 0xffff
-
- xor rdx, rdx
- sub rdx, rcx
-
- bsr rax, rcx
- inc rax
-
- sar rdx, 31
- and rax, rdx
-
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index a1b1c40cb..1e0bd5c48 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -252,3 +252,137 @@ rq_zigzag_1c:
UNSHADOW_ARGS
pop rbp
ret
+
+
+;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_sse2)
+sym(vp8_fast_quantize_b_impl_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+
+ %define save_xmm6 0
+ %define save_xmm7 16
+
+ %define vp8_fastquantizeb_stack_size save_xmm7 + 16
+
+ sub rsp, vp8_fastquantizeb_stack_size
+
+ movdqa XMMWORD PTR[rsp + save_xmm6], xmm6
+ movdqa XMMWORD PTR[rsp + save_xmm7], xmm7
+
+ mov rdx, arg(0) ;coeff_ptr
+ mov rcx, arg(2) ;dequant_ptr
+ mov rax, arg(3) ;scan_mask
+ mov rdi, arg(4) ;round_ptr
+ mov rsi, arg(5) ;quant_ptr
+
+ movdqa xmm0, XMMWORD PTR[rdx]
+ movdqa xmm4, XMMWORD PTR[rdx + 16]
+
+ movdqa xmm6, XMMWORD PTR[rdi] ;round lo
+ movdqa xmm7, XMMWORD PTR[rdi + 16] ;round hi
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ psraw xmm0, 15 ;sign of z (aka sz)
+ psraw xmm4, 15 ;sign of z (aka sz)
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0 ;x = abs(z)
+ psubw xmm5, xmm4 ;x = abs(z)
+
+ paddw xmm1, xmm6
+ paddw xmm5, xmm7
+
+ pmulhw xmm1, XMMWORD PTR[rsi]
+ pmulhw xmm5, XMMWORD PTR[rsi + 16]
+
+ mov rdi, arg(1) ;qcoeff_ptr
+ mov rsi, arg(6) ;dqcoeff_ptr
+
+ movdqa xmm6, XMMWORD PTR[rcx]
+ movdqa xmm7, XMMWORD PTR[rcx + 16]
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa XMMWORD PTR[rdi], xmm1
+ movdqa XMMWORD PTR[rdi + 16], xmm5
+
+ pmullw xmm6, xmm1
+ pmullw xmm7, xmm5
+
+ movdqa xmm2, XMMWORD PTR[rax]
+ movdqa xmm3, XMMWORD PTR[rax+16];
+
+ pxor xmm4, xmm4 ;clear all bits
+ pcmpeqw xmm1, xmm4
+ pcmpeqw xmm5, xmm4
+
+ pcmpeqw xmm4, xmm4 ;set all bits
+ pxor xmm1, xmm4
+ pxor xmm5, xmm4
+
+ psrlw xmm1, 15
+ psrlw xmm5, 15
+
+ pmaddwd xmm1, xmm2
+ pmaddwd xmm5, xmm3
+
+ movq xmm2, xmm1
+ movq xmm3, xmm5
+
+ psrldq xmm1, 8
+ psrldq xmm5, 8
+
+ paddd xmm1, xmm5
+ paddd xmm2, xmm3
+
+ paddd xmm1, xmm2
+ movq xmm5, xmm1
+
+ psrldq xmm1, 4
+ paddd xmm5, xmm1
+
+ movq rcx, xmm5
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
+
+ movdqa XMMWORD PTR[rsi], xmm6 ;store dqcoeff
+ movdqa XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff
+
+ movdqa xmm6, XMMWORD PTR[rsp + save_xmm6]
+ movdqa xmm7, XMMWORD PTR[rsp + save_xmm7]
+
+ add rsp, vp8_fastquantizeb_stack_size
+ pop rsp
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
new file mode 100755
index 000000000..2f33199e5
--- /dev/null
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -0,0 +1,114 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+;
+global sym(vp8_fast_quantize_b_impl_ssse3)
+sym(vp8_fast_quantize_b_impl_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(0) ;coeff_ptr
+ mov rdi, arg(3) ;round_ptr
+ mov rsi, arg(4) ;quant_ptr
+
+ movdqa xmm0, [rdx]
+ movdqa xmm4, [rdx + 16]
+
+ movdqa xmm2, [rdi] ;round lo
+ movdqa xmm3, [rdi + 16] ;round hi
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ psraw xmm0, 15 ;sign of z (aka sz)
+ psraw xmm4, 15 ;sign of z (aka sz)
+
+ pabsw xmm1, xmm1
+ pabsw xmm5, xmm5
+
+ paddw xmm1, xmm2
+ paddw xmm5, xmm3
+
+ pmulhw xmm1, [rsi]
+ pmulhw xmm5, [rsi + 16]
+
+ mov rdi, arg(1) ;qcoeff_ptr
+ mov rcx, arg(2) ;dequant_ptr
+ mov rsi, arg(5) ;dqcoeff_ptr
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa [rdi], xmm1
+ movdqa [rdi + 16], xmm5
+
+ movdqa xmm2, [rcx]
+ movdqa xmm3, [rcx + 16]
+
+ pxor xmm4, xmm4
+ pmullw xmm2, xmm1
+ pmullw xmm3, xmm5
+
+ pcmpeqw xmm1, xmm4 ;non zero mask
+ pcmpeqw xmm5, xmm4 ;non zero mask
+ packsswb xmm1, xmm5
+ pshufb xmm1, [ GLOBAL(zz_shuf)]
+
+ pmovmskb edx, xmm1
+
+; xor ecx, ecx
+; mov eax, -1
+;find_eob_loop:
+; shr edx, 1
+; jc fq_skip
+; mov eax, ecx
+;fq_skip:
+; inc ecx
+; cmp ecx, 16
+; jne find_eob_loop
+ xor rdi, rdi
+ mov eax, -1
+ xor dx, ax ;flip the bits for bsr
+ bsr eax, edx
+
+ movdqa [rsi], xmm2 ;store dqcoeff
+ movdqa [rsi + 16], xmm3 ;store dqcoeff
+
+ sub edi, edx ;check for all zeros in bit mask
+ sar edi, 31 ;0 or -1
+ add eax, 1
+ and eax, edi ;if the bit mask was all zero,
+ ;then eob = 0
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+zz_shuf:
+ db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
index ad9658bf6..85cb023a4 100644
--- a/vp8/encoder/x86/sad_mmx.asm
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -17,8 +17,6 @@ global sym(vp8_sad8x8_mmx)
global sym(vp8_sad4x4_mmx)
global sym(vp8_sad16x8_mmx)
-%idefine QWORD
-
;unsigned int vp8_sad16x16_mmx(
; unsigned char *src_ptr,
; int src_stride,
@@ -100,7 +98,7 @@ x16x16sad_mmx_loop:
psrlq mm0, 32
paddw mm7, mm0
- movd rax, mm7
+ movq rax, mm7
pop rdi
pop rsi
@@ -172,7 +170,7 @@ x8x16sad_mmx_loop:
psrlq mm0, 32
paddw mm7, mm0
- movd rax, mm7
+ movq rax, mm7
pop rdi
pop rsi
@@ -242,7 +240,7 @@ x8x8sad_mmx_loop:
psrlq mm0, 32
paddw mm7, mm0
- movd rax, mm7
+ movq rax, mm7
pop rdi
pop rsi
@@ -272,11 +270,11 @@ sym(vp8_sad4x4_mmx):
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
- movd mm0, QWORD PTR [rsi]
- movd mm1, QWORD PTR [rdi]
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
- movd mm2, QWORD PTR [rsi+rax]
- movd mm3, QWORD PTR [rdi+rdx]
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
@@ -298,11 +296,11 @@ sym(vp8_sad4x4_mmx):
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
- movd mm4, QWORD PTR [rsi]
- movd mm5, QWORD PTR [rdi]
+ movd mm4, DWORD PTR [rsi]
+ movd mm5, DWORD PTR [rdi]
- movd mm6, QWORD PTR [rsi+rax]
- movd mm7, QWORD PTR [rdi+rdx]
+ movd mm6, DWORD PTR [rsi+rax]
+ movd mm7, DWORD PTR [rdi+rdx]
punpcklbw mm4, mm6
punpcklbw mm5, mm7
@@ -331,7 +329,7 @@ sym(vp8_sad4x4_mmx):
psrlq mm0, 32
paddw mm0, mm1
- movd rax, mm0
+ movq rax, mm0
pop rdi
pop rsi
@@ -418,7 +416,7 @@ x16x8sad_mmx_loop:
psrlq mm0, 32
paddw mm7, mm0
- movd rax, mm7
+ movq rax, mm7
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index 9f34a7ac4..39ed79604 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -11,8 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
-%idefine QWORD
-
;unsigned int vp8_sad16x16_wmt(
; unsigned char *src_ptr,
; int src_stride,
@@ -75,7 +73,7 @@ x16x16sad_wmt_loop:
psrldq xmm7, 8
paddw xmm0, xmm7
- movd rax, xmm0
+ movq rax, xmm0
; begin epilog
pop rdi
@@ -113,7 +111,7 @@ sym(vp8_sad8x16_wmt):
x8x16sad_wmt_loop:
- movd rax, mm7
+ movq rax, mm7
cmp rax, arg(4)
jg x8x16sad_wmt_early_exit
@@ -135,7 +133,7 @@ x8x16sad_wmt_loop:
cmp rsi, rcx
jne x8x16sad_wmt_loop
- movd rax, mm7
+ movq rax, mm7
x8x16sad_wmt_early_exit:
@@ -174,7 +172,7 @@ sym(vp8_sad8x8_wmt):
x8x8sad_wmt_loop:
- movd rax, mm7
+ movq rax, mm7
cmp rax, arg(4)
jg x8x8sad_wmt_early_exit
@@ -190,7 +188,7 @@ x8x8sad_wmt_loop:
cmp rsi, rcx
jne x8x8sad_wmt_loop
- movd rax, mm7
+ movq rax, mm7
x8x8sad_wmt_early_exit:
; begin epilog
@@ -221,11 +219,11 @@ sym(vp8_sad4x4_wmt):
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
- movd mm0, QWORD PTR [rsi]
- movd mm1, QWORD PTR [rdi]
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
- movd mm2, QWORD PTR [rsi+rax]
- movd mm3, QWORD PTR [rdi+rdx]
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
@@ -234,19 +232,19 @@ sym(vp8_sad4x4_wmt):
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
- movd mm4, QWORD PTR [rsi]
+ movd mm4, DWORD PTR [rsi]
- movd mm5, QWORD PTR [rdi]
- movd mm6, QWORD PTR [rsi+rax]
+ movd mm5, DWORD PTR [rdi]
+ movd mm6, DWORD PTR [rsi+rax]
- movd mm7, QWORD PTR [rdi+rdx]
+ movd mm7, DWORD PTR [rdi+rdx]
punpcklbw mm4, mm6
punpcklbw mm5, mm7
psadbw mm4, mm5
paddw mm0, mm4
- movd rax, mm0
+ movq rax, mm0
; begin epilog
pop rdi
@@ -283,7 +281,7 @@ sym(vp8_sad16x8_wmt):
x16x8sad_wmt_loop:
- movd rax, mm7
+ movq rax, mm7
cmp rax, arg(4)
jg x16x8sad_wmt_early_exit
@@ -317,7 +315,7 @@ x16x8sad_wmt_loop:
cmp rsi, rcx
jne x16x8sad_wmt_loop
- movd rax, mm7
+ movq rax, mm7
x16x8sad_wmt_early_exit:
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index c2a1ae70a..1b7293c20 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -11,23 +11,21 @@
%include "vpx_ports/x86_abi_support.asm"
-%idefine QWORD
-
%macro PROCESS_16X2X3 1
%if %1
- movdqa xmm0, [rsi]
- lddqu xmm5, [rdi]
- lddqu xmm6, [rdi+1]
- lddqu xmm7, [rdi+2]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm5, XMMWORD PTR [rdi]
+ lddqu xmm6, XMMWORD PTR [rdi+1]
+ lddqu xmm7, XMMWORD PTR [rdi+2]
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
- movdqa xmm0, [rsi]
- lddqu xmm1, [rdi]
- lddqu xmm2, [rdi+1]
- lddqu xmm3, [rdi+2]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rdi]
+ lddqu xmm2, XMMWORD PTR [rdi+1]
+ lddqu xmm3, XMMWORD PTR [rdi+2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
@@ -37,10 +35,10 @@
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
- movdqa xmm0, QWORD PTR [rsi+rax]
- lddqu xmm1, QWORD PTR [rdi+rdx]
- lddqu xmm2, QWORD PTR [rdi+rdx+1]
- lddqu xmm3, QWORD PTR [rdi+rdx+2]
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rdi+rdx]
+ lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
+ lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
@@ -56,19 +54,19 @@
%macro PROCESS_8X2X3 1
%if %1
- movq mm0, [rsi]
- movq mm5, [rdi]
- movq mm6, [rdi+1]
- movq mm7, [rdi+2]
+ movq mm0, QWORD PTR [rsi]
+ movq mm5, QWORD PTR [rdi]
+ movq mm6, QWORD PTR [rdi+1]
+ movq mm7, QWORD PTR [rdi+2]
psadbw mm5, mm0
psadbw mm6, mm0
psadbw mm7, mm0
%else
- movq mm0, [rsi]
- movq mm1, [rdi]
- movq mm2, [rdi+1]
- movq mm3, [rdi+2]
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+ movq mm2, QWORD PTR [rdi+1]
+ movq mm3, QWORD PTR [rdi+2]
psadbw mm1, mm0
psadbw mm2, mm0
@@ -105,45 +103,45 @@
%macro PROCESS_16X2X4 1
%if %1
- movdqa xmm0, [rsi]
- lddqu xmm4, [rcx]
- lddqu xmm5, [rdx]
- lddqu xmm6, [rbx]
- lddqu xmm7, [rdi]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm4, XMMWORD PTR [rcx]
+ lddqu xmm5, XMMWORD PTR [rdx]
+ lddqu xmm6, XMMWORD PTR [rbx]
+ lddqu xmm7, XMMWORD PTR [rdi]
psadbw xmm4, xmm0
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
- movdqa xmm0, [rsi]
- lddqu xmm1, [rcx]
- lddqu xmm2, [rdx]
- lddqu xmm3, [rbx]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rcx]
+ lddqu xmm2, XMMWORD PTR [rdx]
+ lddqu xmm3, XMMWORD PTR [rbx]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm4, xmm1
- lddqu xmm1, [rdi]
+ lddqu xmm1, XMMWORD PTR [rdi]
paddw xmm5, xmm2
paddw xmm6, xmm3
psadbw xmm1, xmm0
paddw xmm7, xmm1
%endif
- movdqa xmm0, QWORD PTR [rsi+rax]
- lddqu xmm1, QWORD PTR [rcx+rbp]
- lddqu xmm2, QWORD PTR [rdx+rbp]
- lddqu xmm3, QWORD PTR [rbx+rbp]
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rcx+rbp]
+ lddqu xmm2, XMMWORD PTR [rdx+rbp]
+ lddqu xmm3, XMMWORD PTR [rbx+rbp]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm4, xmm1
- lddqu xmm1, QWORD PTR [rdi+rbp]
+ lddqu xmm1, XMMWORD PTR [rdi+rbp]
paddw xmm5, xmm2
paddw xmm6, xmm3
@@ -162,28 +160,28 @@
%macro PROCESS_8X2X4 1
%if %1
- movq mm0, [rsi]
- movq mm4, [rcx]
- movq mm5, [rdx]
- movq mm6, [rbx]
- movq mm7, [rdi]
+ movq mm0, QWORD PTR [rsi]
+ movq mm4, QWORD PTR [rcx]
+ movq mm5, QWORD PTR [rdx]
+ movq mm6, QWORD PTR [rbx]
+ movq mm7, QWORD PTR [rdi]
psadbw mm4, mm0
psadbw mm5, mm0
psadbw mm6, mm0
psadbw mm7, mm0
%else
- movq mm0, [rsi]
- movq mm1, [rcx]
- movq mm2, [rdx]
- movq mm3, [rbx]
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rcx]
+ movq mm2, QWORD PTR [rdx]
+ movq mm3, QWORD PTR [rbx]
psadbw mm1, mm0
psadbw mm2, mm0
psadbw mm3, mm0
paddw mm4, mm1
- movq mm1, [rdi]
+ movq mm1, QWORD PTR [rdi]
paddw mm5, mm2
paddw mm6, mm3
@@ -430,20 +428,20 @@ sym(vp8_sad4x4x3_sse3):
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
- movd mm0, QWORD PTR [rsi]
- movd mm1, QWORD PTR [rdi]
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
- movd mm2, QWORD PTR [rsi+rax]
- movd mm3, QWORD PTR [rdi+rdx]
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
- movd mm4, QWORD PTR [rdi+1]
- movd mm5, QWORD PTR [rdi+2]
+ movd mm4, DWORD PTR [rdi+1]
+ movd mm5, DWORD PTR [rdi+2]
- movd mm2, QWORD PTR [rdi+rdx+1]
- movd mm3, QWORD PTR [rdi+rdx+2]
+ movd mm2, DWORD PTR [rdi+rdx+1]
+ movd mm3, DWORD PTR [rdi+rdx+2]
psadbw mm1, mm0
@@ -458,24 +456,24 @@ sym(vp8_sad4x4x3_sse3):
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
- movd mm0, QWORD PTR [rsi]
- movd mm2, QWORD PTR [rdi]
+ movd mm0, DWORD PTR [rsi]
+ movd mm2, DWORD PTR [rdi]
- movd mm3, QWORD PTR [rsi+rax]
- movd mm6, QWORD PTR [rdi+rdx]
+ movd mm3, DWORD PTR [rsi+rax]
+ movd mm6, DWORD PTR [rdi+rdx]
punpcklbw mm0, mm3
punpcklbw mm2, mm6
- movd mm3, QWORD PTR [rdi+1]
- movd mm7, QWORD PTR [rdi+2]
+ movd mm3, DWORD PTR [rdi+1]
+ movd mm7, DWORD PTR [rdi+2]
psadbw mm2, mm0
paddw mm1, mm2
- movd mm2, QWORD PTR [rdi+rdx+1]
- movd mm6, QWORD PTR [rdi+rdx+2]
+ movd mm2, DWORD PTR [rdi+rdx+1]
+ movd mm6, DWORD PTR [rdi+rdx+2]
punpcklbw mm3, mm2
punpcklbw mm7, mm6
@@ -530,7 +528,7 @@ sym(vp8_sad16x16_sse3):
vp8_sad16x16_sse3_loop:
- movd rax, mm7
+ movq rax, mm7
cmp rax, arg(4)
jg vp8_sad16x16_early_exit
@@ -564,7 +562,7 @@ vp8_sad16x16_sse3_loop:
cmp rsi, rcx
jne vp8_sad16x16_sse3_loop
- movd rax, mm7
+ movq rax, mm7
vp8_sad16x16_early_exit:
@@ -846,23 +844,23 @@ sym(vp8_sad4x4x4d_sse3):
xchg rbx, rax
- movd mm0, QWORD PTR [rsi]
- movd mm1, QWORD PTR [rcx]
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rcx]
- movd mm2, QWORD PTR [rsi+rax]
- movd mm3, QWORD PTR [rcx+rbp]
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rcx+rbp]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
- movd mm4, QWORD PTR [rdx]
- movd mm5, QWORD PTR [rbx]
+ movd mm4, DWORD PTR [rdx]
+ movd mm5, DWORD PTR [rbx]
- movd mm6, QWORD PTR [rdi]
- movd mm2, QWORD PTR [rdx+rbp]
+ movd mm6, DWORD PTR [rdi]
+ movd mm2, DWORD PTR [rdx+rbp]
- movd mm3, QWORD PTR [rbx+rbp]
- movd mm7, QWORD PTR [rdi+rbp]
+ movd mm3, DWORD PTR [rbx+rbp]
+ movd mm7, DWORD PTR [rdi+rbp]
psadbw mm1, mm0
@@ -885,17 +883,17 @@ sym(vp8_sad4x4x4d_sse3):
lea rdi, [rdi+rbp*2]
- movd mm0, QWORD PTR [rsi]
- movd mm2, QWORD PTR [rcx]
+ movd mm0, DWORD PTR [rsi]
+ movd mm2, DWORD PTR [rcx]
- movd mm3, QWORD PTR [rsi+rax]
- movd mm7, QWORD PTR [rcx+rbp]
+ movd mm3, DWORD PTR [rsi+rax]
+ movd mm7, DWORD PTR [rcx+rbp]
punpcklbw mm0, mm3
punpcklbw mm2, mm7
- movd mm3, QWORD PTR [rdx]
- movd mm7, QWORD PTR [rbx]
+ movd mm3, DWORD PTR [rdx]
+ movd mm7, DWORD PTR [rbx]
psadbw mm2, mm0
mov rax, rbp
@@ -906,8 +904,8 @@ sym(vp8_sad4x4x4d_sse3):
paddw mm1, mm2
movd [rsi], mm1
- movd mm2, QWORD PTR [rdx+rax]
- movd mm1, QWORD PTR [rbx+rax]
+ movd mm2, DWORD PTR [rdx+rax]
+ movd mm1, DWORD PTR [rbx+rax]
punpcklbw mm3, mm2
punpcklbw mm7, mm1
@@ -915,8 +913,8 @@ sym(vp8_sad4x4x4d_sse3):
psadbw mm3, mm0
psadbw mm7, mm0
- movd mm2, QWORD PTR [rdi]
- movd mm1, QWORD PTR [rdi+rax]
+ movd mm2, DWORD PTR [rdi]
+ movd mm1, DWORD PTR [rdi+rax]
paddw mm3, mm4
paddw mm7, mm5
diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm
new file mode 100644
index 000000000..21e2e5007
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse4.asm
@@ -0,0 +1,353 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X8 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm1, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ movq xmm2, MMWORD PTR [rdi+ rdx+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_8X2X8 1
+%if %1
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm1, xmm2
+%else
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endif
+ movq xmm0, MMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_4X2X8 1
+%if %1
+ movd xmm0, [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ mpsadbw xmm1, xmm0, 0x0
+%else
+ movd xmm0, [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endif
+ movd xmm0, [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endmacro
+
+
+;void vp8_sad16x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array);
+global sym(vp8_sad16x16x8_sse4)
+sym(vp8_sad16x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqu XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad16x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad16x8x8_sse4)
+sym(vp8_sad16x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqu XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad8x8x8_sse4)
+sym(vp8_sad8x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqu XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad8x16x8_sse4)
+sym(vp8_sad8x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ mov rdi, arg(4) ;Results
+ movdqu XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad4x4x8_c(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad4x4x8_sse4)
+sym(vp8_sad4x4x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_4X2X8 1
+ PROCESS_4X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqu XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 94bbfffbc..69c5eaedc 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -11,23 +11,21 @@
%include "vpx_ports/x86_abi_support.asm"
-%idefine QWORD
-
%macro PROCESS_16X2X3 1
%if %1
- movdqa xmm0, [rsi]
- lddqu xmm5, [rdi]
- lddqu xmm6, [rdi+1]
- lddqu xmm7, [rdi+2]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm5, XMMWORD PTR [rdi]
+ lddqu xmm6, XMMWORD PTR [rdi+1]
+ lddqu xmm7, XMMWORD PTR [rdi+2]
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
- movdqa xmm0, [rsi]
- lddqu xmm1, [rdi]
- lddqu xmm2, [rdi+1]
- lddqu xmm3, [rdi+2]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rdi]
+ lddqu xmm2, XMMWORD PTR [rdi+1]
+ lddqu xmm3, XMMWORD PTR [rdi+2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
@@ -37,10 +35,10 @@
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
- movdqa xmm0, QWORD PTR [rsi+rax]
- lddqu xmm1, QWORD PTR [rdi+rdx]
- lddqu xmm2, QWORD PTR [rdi+rdx+1]
- lddqu xmm3, QWORD PTR [rdi+rdx+2]
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rdi+rdx]
+ lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
+ lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
@@ -56,9 +54,9 @@
%macro PROCESS_16X2X3_OFFSET 2
%if %1
- movdqa xmm0, [rsi]
- movdqa xmm4, [rdi]
- movdqa xmm7, [rdi+16]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm7, XMMWORD PTR [rdi+16]
movdqa xmm5, xmm7
palignr xmm5, xmm4, %2
@@ -72,9 +70,9 @@
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
- movdqa xmm0, [rsi]
- movdqa xmm4, [rdi]
- movdqa xmm3, [rdi+16]
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm3, XMMWORD PTR [rdi+16]
movdqa xmm1, xmm3
palignr xmm1, xmm4, %2
@@ -92,9 +90,9 @@
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
- movdqa xmm0, QWORD PTR [rsi+rax]
- movdqa xmm4, QWORD PTR [rdi+rdx]
- movdqa xmm3, QWORD PTR [rdi+rdx+16]
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ movdqa xmm4, XMMWORD PTR [rdi+rdx]
+ movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
movdqa xmm1, xmm3
palignr xmm1, xmm4, %2
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index 8fe3ee174..a47e1f0d6 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -12,7 +12,7 @@
%include "vpx_ports/x86_abi_support.asm"
;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
-; unsigned short *diff, unsigned char *Predictor,
+; short *diff, unsigned char *Predictor,
; int pitch);
global sym(vp8_subtract_b_mmx_impl)
sym(vp8_subtract_b_mmx_impl):
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
new file mode 100644
index 000000000..3fb23d097
--- /dev/null
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -0,0 +1,356 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+; short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp8_subtract_b_sse2_impl)
+sym(vp8_subtract_b_sse2_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi], mm0
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*2], mm0
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp8_subtract_mby_sse2)
+sym(vp8_subtract_mby_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(1) ;src
+ mov rdi, arg(0) ;diff
+
+ mov rax, arg(2) ;pred
+ movsxd rdx, dword ptr arg(3) ;stride
+
+ mov rcx, 8 ; do two lines at one time
+
+submby_loop:
+ movdqa xmm0, XMMWORD PTR [rsi] ; src
+ movdqa xmm1, XMMWORD PTR [rax] ; pred
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi +16], xmm2
+
+ movdqa xmm4, XMMWORD PTR [rsi + rdx]
+ movdqa xmm5, XMMWORD PTR [rax + 16]
+
+ movdqa xmm6, xmm4
+ psubb xmm4, xmm5
+
+ pxor xmm5, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm6, [GLOBAL(t80)]
+ pcmpgtb xmm5, xmm6 ; obtain sign information
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ punpcklbw xmm4, xmm5 ; put sign back to subtraction
+ punpckhbw xmm6, xmm7 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi +32], xmm4
+ movdqa XMMWORD PTR [rdi +48], xmm6
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+ sub rcx, 1
+ jnz submby_loop
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp8_subtract_mbuv_sse2)
+sym(vp8_subtract_mbuv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+ mov rsi, arg(1) ;z = usrc
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ add rax, 256 ;Predictor = pred + 256
+ movsxd rdx, dword ptr arg(4) ;stride;
+ lea rcx, [rdx + rdx*2]
+
+ ;u
+ ;line 0 1
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi +16], xmm2
+
+ ;line 2 3
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+16] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 32], xmm0
+ movdqa XMMWORD PTR [rdi + 48], xmm2
+
+ ;line 4 5
+ lea rsi, [rsi + rdx*4]
+
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 64], xmm0
+ movdqa XMMWORD PTR [rdi + 80], xmm2
+
+ ;line 6 7
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 96], xmm0
+ movdqa XMMWORD PTR [rdi + 112], xmm2
+
+ ;v
+ mov rsi, arg(2) ;z = vsrc
+ add rdi, 64*2 ;diff = diff + 320 (shorts)
+ add rax, 64 ;Predictor = pred + 320
+
+ ;line 0 1
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi +16], xmm2
+
+ ;line 2 3
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+16] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 32], xmm0
+ movdqa XMMWORD PTR [rdi + 48], xmm2
+
+ ;line 4 5
+ lea rsi, [rsi + rdx*4]
+
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 64], xmm0
+ movdqa XMMWORD PTR [rdi + 80], xmm2
+
+ ;line 6 7
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 96], xmm0
+ movdqa XMMWORD PTR [rdi + 112], xmm2
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t80:
+ times 16 db 0x80
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
index 173238e24..67a9b4d3e 100644
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -498,7 +498,7 @@ sym(vp8_get4x4sse_cs_mmx):
psrlq mm7, 32
paddd mm0, mm7
- movd rax, mm0
+ movq rax, mm0
; begin epilog
@@ -556,7 +556,7 @@ sym(vp8_filter_block2d_bil4x4_var_mmx):
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm5, mm1
@@ -580,7 +580,7 @@ filter_block2d_bil4x4_var_mmx_loop:
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm3, mm5 ;
@@ -592,7 +592,7 @@ filter_block2d_bil4x4_var_mmx_loop:
paddw mm1, mm3 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movd mm3, [rdi] ;
@@ -710,10 +710,10 @@ sym(vp8_filter_block2d_bil_var_mmx):
paddw mm1, mm3 ;
paddw mm2, mm4 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
- paddw mm2, [mmx_bi_rd GLOBAL] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm5, mm1
@@ -749,10 +749,10 @@ filter_block2d_bil_var_mmx_loop:
paddw mm1, mm3 ;
paddw mm2, mm4 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
- paddw mm2, [mmx_bi_rd GLOBAL] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm3, mm5 ;
@@ -773,8 +773,8 @@ filter_block2d_bil_var_mmx_loop:
paddw mm1, mm3 ;
paddw mm2, mm4 ;
- paddw mm1, [mmx_bi_rd GLOBAL] ;
- paddw mm2, [mmx_bi_rd GLOBAL] ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
psraw mm2, mmx_filter_shift ;
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index f47d9ccdd..cefa0a956 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -58,7 +58,7 @@ NEXTROW:
movdqa xmm3,xmm4
psrldq xmm4,4
paddd xmm4,xmm3
- movd rax,xmm4
+ movq rax,xmm4
; begin epilog
@@ -471,7 +471,7 @@ sym(vp8_get8x8var_sse2):
mov rax, arg(5) ;[Sum]
mov rdi, arg(4) ;[SSE]
- movd rdx, xmm7
+ movq rdx, xmm7
movsx rcx, dx
mov dword ptr [rax], ecx
@@ -532,7 +532,7 @@ sym(vp8_filter_block2d_bil_var_sse2):
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
- paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
@@ -554,7 +554,7 @@ filter_block2d_bil_var_sse2_loop:
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
- paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm3, xmm5 ;
@@ -565,7 +565,7 @@ filter_block2d_bil_var_sse2_loop:
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
- paddw xmm1, [xmm_bi_rd GLOBAL] ;
+ paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
index 2600ce96b..2df73a635 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -15,7 +15,7 @@
extern void filter_block1d_h6_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
@@ -25,7 +25,7 @@ extern void filter_block1d_h6_mmx
);
extern void filter_block1d_v6_mmx
(
- short *src_ptr,
+ const short *src_ptr,
unsigned char *output_ptr,
unsigned int pixels_per_line,
unsigned int pixel_step,
@@ -37,34 +37,34 @@ extern void filter_block1d_v6_mmx
extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
extern unsigned int vp8_get8x8var_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern unsigned int vp8_get4x4var_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern unsigned int vp8_get4x4sse_cs_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride
);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
@@ -73,9 +73,9 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
);
extern void vp8_filter_block2d_bil_var_mmx
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const short *HFilter,
@@ -126,9 +126,9 @@ void vp8_test_get_mb_ss(void)
unsigned int vp8_get16x16var_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned *SSE,
unsigned *SUM
@@ -157,9 +157,9 @@ unsigned int vp8_get16x16var_mmx(
unsigned int vp8_variance4x4_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -173,9 +173,9 @@ unsigned int vp8_variance4x4_mmx(
}
unsigned int vp8_variance8x8_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -190,9 +190,9 @@ unsigned int vp8_variance8x8_mmx(
}
unsigned int vp8_mse16x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -212,9 +212,9 @@ unsigned int vp8_mse16x16_mmx(
unsigned int vp8_variance16x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
int *sse)
{
@@ -234,9 +234,9 @@ unsigned int vp8_variance16x16_mmx(
}
unsigned int vp8_variance16x8_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -255,9 +255,9 @@ unsigned int vp8_variance16x8_mmx(
unsigned int vp8_variance8x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -296,11 +296,11 @@ DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
unsigned int vp8_sub_pixel_variance4x4_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
@@ -320,11 +320,11 @@ unsigned int vp8_sub_pixel_variance4x4_mmx
unsigned int vp8_sub_pixel_variance8x8_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -344,11 +344,11 @@ unsigned int vp8_sub_pixel_variance8x8_mmx
unsigned int vp8_sub_pixel_variance16x16_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -383,11 +383,11 @@ unsigned int vp8_sub_pixel_variance16x16_mmx
}
unsigned int vp8_sub_pixel_mse16x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -398,11 +398,11 @@ unsigned int vp8_sub_pixel_mse16x16_mmx(
unsigned int vp8_sub_pixel_variance16x8_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -435,11 +435,11 @@ unsigned int vp8_sub_pixel_variance16x8_mmx
unsigned int vp8_sub_pixel_variance8x16_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
int *sse
)
@@ -457,9 +457,9 @@ unsigned int vp8_sub_pixel_variance8x16_mmx
}
unsigned int vp8_i_variance16x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -480,9 +480,9 @@ unsigned int vp8_i_variance16x16_mmx(
}
unsigned int vp8_i_variance8x16_mmx(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -501,11 +501,11 @@ unsigned int vp8_i_variance8x16_mmx(
unsigned int vp8_i_sub_pixel_variance16x16_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -560,11 +560,11 @@ unsigned int vp8_i_sub_pixel_variance16x16_mmx
unsigned int vp8_i_sub_pixel_variance8x16_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -595,3 +595,39 @@ unsigned int vp8_i_sub_pixel_variance8x16_mmx
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 7));
}
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
+ ref_ptr, recon_stride, sse);
+}
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index 5e750ba2f..006e0a24a 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -13,16 +13,16 @@
#include "pragmas.h"
#include "vpx_ports/mem.h"
-extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
@@ -32,9 +32,9 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
extern unsigned int vp8_get4x4var_mmx
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
@@ -42,38 +42,38 @@ extern unsigned int vp8_get4x4var_mmx
unsigned int vp8_get_mb_ss_sse2
(
- short *src_ptr
+ const short *src_ptr
);
unsigned int vp8_get16x16var_sse2
(
- unsigned char *src_ptr,
- int source_stride,
- unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
);
unsigned int vp8_get16x16pred_error_sse2
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_stride
);
unsigned int vp8_get8x8var_sse2
(
- unsigned char *src_ptr,
- int source_stride,
- unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
);
void vp8_filter_block2d_bil_var_sse2
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const short *HFilter,
@@ -83,9 +83,9 @@ void vp8_filter_block2d_bil_var_sse2
);
void vp8_half_horiz_vert_variance16x_h_sse2
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
@@ -93,9 +93,9 @@ void vp8_half_horiz_vert_variance16x_h_sse2
);
void vp8_half_horiz_variance16x_h_sse2
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
@@ -103,9 +103,9 @@ void vp8_half_horiz_variance16x_h_sse2
);
void vp8_half_vert_variance16x_h_sse2
(
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int ref_pixels_per_line,
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
@@ -115,9 +115,9 @@ void vp8_half_vert_variance16x_h_sse2
DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
unsigned int vp8_variance4x4_wmt(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride)
{
unsigned int var;
@@ -132,9 +132,9 @@ unsigned int vp8_variance4x4_wmt(
unsigned int vp8_variance8x8_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride)
{
unsigned int var;
@@ -149,9 +149,9 @@ unsigned int vp8_variance8x8_wmt
unsigned int vp8_variance16x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -164,9 +164,9 @@ unsigned int vp8_variance16x16_wmt
return (sse0 - ((sum0 * sum0) >> 8));
}
unsigned int vp8_mse16x16_wmt(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -182,9 +182,9 @@ unsigned int vp8_mse16x16_wmt(
unsigned int vp8_variance16x8_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -203,9 +203,9 @@ unsigned int vp8_variance16x8_wmt
unsigned int vp8_variance8x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -239,11 +239,11 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
};
unsigned int vp8_sub_pixel_variance4x4_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -263,11 +263,11 @@ unsigned int vp8_sub_pixel_variance4x4_wmt
unsigned int vp8_sub_pixel_variance8x8_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -288,11 +288,11 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
unsigned int vp8_sub_pixel_variance16x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -364,11 +364,11 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
}
unsigned int vp8_sub_pixel_mse16x16_wmt(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -379,11 +379,11 @@ unsigned int vp8_sub_pixel_mse16x16_wmt(
unsigned int vp8_sub_pixel_variance16x8_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
@@ -417,11 +417,11 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
unsigned int vp8_sub_pixel_variance8x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -440,9 +440,9 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
}
unsigned int vp8_i_variance16x16_wmt(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -464,9 +464,9 @@ unsigned int vp8_i_variance16x16_wmt(
}
unsigned int vp8_i_variance8x16_wmt(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int source_stride,
- unsigned char *ref_ptr,
+ const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
@@ -486,11 +486,11 @@ unsigned int vp8_i_variance8x16_wmt(
unsigned int vp8_i_sub_pixel_variance16x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -501,11 +501,11 @@ unsigned int vp8_i_sub_pixel_variance16x16_wmt
unsigned int vp8_i_sub_pixel_variance8x16_wmt
(
- unsigned char *src_ptr,
+ const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
- unsigned char *dst_ptr,
+ const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
@@ -513,3 +513,84 @@ unsigned int vp8_i_sub_pixel_variance8x16_wmt
return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
}
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 3c9f9c790..6bea15ebc 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -35,6 +35,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx);
extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_mmx);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_mmx);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_mmx);
extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);
extern prototype_getmbss(vp8_get_mb_ss_mmx);
extern prototype_variance(vp8_mse16x16_mmx);
@@ -89,6 +92,15 @@ extern prototype_sad(vp8_get4x4sse_cs_mmx);
#undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx
+#undef vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_mmx
+
+#undef vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_mmx
+
+#undef vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_mmx
+
#undef vp8_variance_subpixmse16x16
#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx
@@ -130,6 +142,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt);
extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_wmt);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_wmt);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_wmt);
extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);
extern prototype_getmbss(vp8_get_mb_ss_sse2);
extern prototype_variance(vp8_mse16x16_wmt);
@@ -183,6 +198,15 @@ extern prototype_variance2(vp8_get16x16var_sse2);
#undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt
+#undef vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt
+
+#undef vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt
+
+#undef vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt
+
#undef vp8_variance_subpixmse16x16
#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt
@@ -273,4 +297,31 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
#endif
#endif
+
+#if HAVE_SSE4_1
+extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4);
+extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_variance_sad16x16x8
+#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4
+
+#undef vp8_variance_sad16x8x8
+#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4
+
+#undef vp8_variance_sad8x16x8
+#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4
+
+#undef vp8_variance_sad8x8x8
+#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4
+
+#undef vp8_variance_sad4x4x8
+#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4
+
+#endif
+#endif
+
#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 18dc49cd4..fb1b37ccb 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -29,14 +29,14 @@ int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
- short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
- short *coeff_ptr = &b->coeff[0];
- short *zbin_ptr = &b->zbin[0][0];
- short *round_ptr = &b->round[0][0];
- short *quant_ptr = &b->quant[0][0];
- short *qcoeff_ptr = d->qcoeff;
+ short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = &d->dequant[0][0];
+ short *dequant_ptr = d->dequant;
d->eob = vp8_fast_quantize_b_impl_mmx(
coeff_ptr,
@@ -88,24 +88,22 @@ void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
-int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *scan_mask, short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
- short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
- short *coeff_ptr = &b->coeff[0];
- short *zbin_ptr = &b->zbin[0][0];
- short *round_ptr = &b->round[0][0];
- short *quant_ptr = &b->quant[0][0];
- short *qcoeff_ptr = d->qcoeff;
+ short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
+ short *coeff_ptr = b->coeff;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = &d->dequant[0][0];
+ short *dequant_ptr = d->dequant;
- d->eob = vp8_fast_quantize_b_impl_sse(
+ d->eob = vp8_fast_quantize_b_impl_sse2(
coeff_ptr,
- zbin_ptr,
qcoeff_ptr,
dequant_ptr,
scan_mask,
@@ -116,6 +114,7 @@ void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
);
}
+
int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr,short *dequant_ptr,
const int *default_zig_zag, short *round_ptr,
@@ -125,15 +124,15 @@ int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d)
{
- short *zbin_boost_ptr = &b->zrun_zbin_boost[0];
- short *coeff_ptr = &b->coeff[0];
- short *zbin_ptr = &b->zbin[0][0];
- short *round_ptr = &b->round[0][0];
- short *quant_ptr = &b->quant[0][0];
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = &d->dequant[0][0];
- short zbin_oq_value = b->zbin_extra;
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
d->eob = vp8_regular_quantize_b_impl_sse2(
coeff_ptr,
@@ -166,8 +165,39 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb)
return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}
+void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *z = *(be->base_src) + be->src;
+ unsigned int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+#if HAVE_SSSE3
+int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
+{
+ d->eob = vp8_fast_quantize_b_impl_ssse3(
+ b->coeff,
+ d->qcoeff,
+ d->dequant,
+ b->round,
+ b->quant,
+ d->dqcoeff
+ );
+}
#endif
+
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -177,6 +207,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
int wmt_enabled = flags & HAS_SSE2;
int SSE3Enabled = flags & HAS_SSE3;
int SSSE3Enabled = flags & HAS_SSSE3;
+ int SSE4_1Enabled = flags & HAS_SSE4_1;
/* Note:
*
@@ -187,7 +218,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/* Override default functions with fastest ones for this CPU. */
#if HAVE_MMX
-
if (mmx_enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
@@ -207,6 +237,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx;
cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx;
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx;
+ cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
+ cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
+ cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx;
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
@@ -240,10 +273,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/
}
-
#endif
-#if HAVE_SSE2
+#if HAVE_SSE2
if (wmt_enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
@@ -263,6 +295,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt;
cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt;
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt;
+ cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
+ cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
+ cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_wmt;
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
@@ -278,20 +313,21 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;
- cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c ;
+ cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2 ;
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
- /* cpi->rtcd.encodemb.sub* not implemented for wmt */
+ cpi->rtcd.encodemb.subb = vp8_subtract_b_sse2;
+ cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2;
+ cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;
- /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse;
- cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
+ /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
}
-
#endif
-#if HAVE_SSE3
+#if HAVE_SSE3
if (SSE3Enabled)
{
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
@@ -309,16 +345,30 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
}
-
#endif
-#if HAVE_SSSE3
+#if HAVE_SSSE3
if (SSSE3Enabled)
{
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
+
}
+#endif
+#if HAVE_SSE4_1
+ if (SSE4_1Enabled)
+ {
+ cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4;
+ cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4;
+ cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4;
+ cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4;
+ cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4;
+ cpi->rtcd.search.full_search = vp8_full_search_sadx8;
+ }
#endif
+
#endif
}
diff --git a/vp8/exports_dec b/vp8/exports_dec
index f9b985c86..100ac5c27 100644
--- a/vp8/exports_dec
+++ b/vp8/exports_dec
@@ -1 +1,2 @@
data vpx_codec_vp8_dx_algo
+text vpx_codec_vp8_dx
diff --git a/vp8/exports_enc b/vp8/exports_enc
index 996701113..29ff35ef7 100644
--- a/vp8/exports_enc
+++ b/vp8/exports_enc
@@ -1 +1,2 @@
data vpx_codec_vp8_cx_algo
+text vpx_codec_vp8_cx
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index ecca18a0a..bb3f8259c 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -112,22 +112,15 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
endif
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
+
# common (c)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.c
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/loopfilter_arm.c
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/recon_arm.c
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/reconintra4x4_arm.c
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/systemdependent.c
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/vpx_asm_offsets.c
-VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/filter_c.c
-VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/idctllm.c
-VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/recon.c
-VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/reconintra4x4.c
-VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/generic/systemdependent.c
-
# common (armv6)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM)
@@ -152,16 +145,10 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x8_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem16x16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dc_only_idct_add_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/iwalsh_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilter_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilterhorizontaledge_uv_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilterhorizontaledge_y_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilterverticaledge_uv_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilterverticaledge_y_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilterhorizontaledge_uv_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilterhorizontaledge_y_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilterverticaledge_uv_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilterverticaledge_y_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilter_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon2b_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon4b_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/reconb_neon$(ASM)
@@ -174,6 +161,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon16x16mb_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon_neon.c
#
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 8845368fb..8e50b7f1b 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -14,6 +14,7 @@
#include "vpx_version.h"
#include "onyx_int.h"
#include "vpx/vp8e.h"
+#include "vp8/encoder/firstpass.h"
#include "onyx.h"
#include <stdlib.h>
#include <string.h>
@@ -63,9 +64,9 @@ static const struct extraconfig_map extracfg_map[] =
0, /* Sharpness */
0, /* static_thresh */
VP8_ONE_TOKENPARTITION, /* token_partitions */
- 0, /* arnr_max_frames */
- 0, /* arnr_strength */
- 0, /* arnr_type*/
+ 0, /* arnr_max_frames */
+ 3, /* arnr_strength */
+ 3, /* arnr_type*/
}
}
};
@@ -113,6 +114,11 @@ update_error_state(vpx_codec_alg_priv_t *ctx,
ERROR(#memb " out of range ["#lo".."#hi"]");\
} while(0)
+#define RANGE_CHECK_HI(p,memb,hi) do {\
+ if(!((p)->memb <= (hi))) \
+ ERROR(#memb " out of range [.."#hi"]");\
+ } while(0)
+
#define RANGE_CHECK_LO(p,memb,lo) do {\
if(!((p)->memb >= (lo))) \
ERROR(#memb " out of range ["#lo"..]");\
@@ -130,24 +136,24 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK(cfg, g_h, 2, 16384);
RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
- RANGE_CHECK(cfg, g_profile, 0, 3);
- RANGE_CHECK(cfg, rc_min_quantizer, 0, 63);
- RANGE_CHECK(cfg, rc_max_quantizer, 0, 63);
- RANGE_CHECK(cfg, g_threads, 0, 64);
+ RANGE_CHECK_HI(cfg, g_profile, 3);
+ RANGE_CHECK_HI(cfg, rc_min_quantizer, 63);
+ RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
+ RANGE_CHECK_HI(cfg, g_threads, 64);
#if !(CONFIG_REALTIME_ONLY)
- RANGE_CHECK(cfg, g_lag_in_frames, 0, 25);
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, 25);
#else
- RANGE_CHECK(cfg, g_lag_in_frames, 0, 0);
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
#endif
RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CBR);
- RANGE_CHECK(cfg, rc_undershoot_pct, 0, 100);
- RANGE_CHECK(cfg, rc_2pass_vbr_bias_pct, 0, 100);
+ RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
//RANGE_CHECK_BOOL(cfg, g_delete_firstpassfile);
RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
- RANGE_CHECK(cfg, rc_dropframe_thresh, 0, 100);
- RANGE_CHECK(cfg, rc_resize_up_thresh, 0, 100);
- RANGE_CHECK(cfg, rc_resize_down_thresh, 0, 100);
+ RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
+ RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
+ RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
#if !(CONFIG_REALTIME_ONLY)
RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
#else
@@ -166,7 +172,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
#if !(CONFIG_REALTIME_ONLY)
RANGE_CHECK(vp8_cfg, encoding_mode, VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
RANGE_CHECK(vp8_cfg, cpu_used, -16, 16);
- RANGE_CHECK(vp8_cfg, noise_sensitivity, 0, 6);
+ RANGE_CHECK_HI(vp8_cfg, noise_sensitivity, 6);
#else
RANGE_CHECK(vp8_cfg, encoding_mode, VP8_REAL_TIME_ENCODING, VP8_REAL_TIME_ENCODING);
@@ -177,29 +183,32 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
#endif
RANGE_CHECK(vp8_cfg, token_partitions, VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
- RANGE_CHECK(vp8_cfg, Sharpness, 0, 7);
- RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
- RANGE_CHECK(vp8_cfg, arnr_strength, 0, 6);
- RANGE_CHECK(vp8_cfg, arnr_type, 0, 0xffffffff);
+ RANGE_CHECK_HI(vp8_cfg, Sharpness, 7);
+ RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
+ RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6);
+ RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
if (cfg->g_pass == VPX_RC_LAST_PASS)
{
- int n_doubles = cfg->rc_twopass_stats_in.sz / sizeof(double);
- int n_packets = cfg->rc_twopass_stats_in.sz / sizeof(FIRSTPASS_STATS);
- double frames;
+ int mb_r = (cfg->g_h + 15) / 16;
+ int mb_c = (cfg->g_w + 15) / 16;
+ size_t packet_sz = vp8_firstpass_stats_sz(mb_r * mb_c);
+ int n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
+ FIRSTPASS_STATS *stats;
if (!cfg->rc_twopass_stats_in.buf)
ERROR("rc_twopass_stats_in.buf not set.");
- if (cfg->rc_twopass_stats_in.sz % sizeof(FIRSTPASS_STATS))
+ if (cfg->rc_twopass_stats_in.sz % packet_sz)
ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
- if (cfg->rc_twopass_stats_in.sz < 2 * sizeof(FIRSTPASS_STATS))
+ if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
ERROR("rc_twopass_stats_in requires at least two packets.");
- frames = ((double *)cfg->rc_twopass_stats_in.buf)[n_doubles - 1];
+ stats = (void*)((char *)cfg->rc_twopass_stats_in.buf
+ + (n_packets - 1) * packet_sz);
- if ((int)(frames + 0.5) != n_packets - 1)
+ if ((int)(stats->count + 0.5) != n_packets - 1)
ERROR("rc_twopass_stats_in missing EOS stats packet");
}
@@ -754,12 +763,13 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
{
pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
- // TODO: ideally this timestamp should be as close as
- // possible to the prior PTS so that if a decoder uses
- // pts to schedule when to do this, we start right after
- // last frame was decoded. Maybe should be set to
- // last time stamp. Invisible frames have no duration..
- pkt.data.frame.pts --;
+ // This timestamp should be as close as possible to the
+ // prior PTS so that if a decoder uses pts to schedule when
+ // to do this, we start right after last frame was decoded.
+ // Invisible frames have no duration.
+ pkt.data.frame.pts = ((cpi->last_time_stamp_seen
+ * ctx->cfg.g_timebase.den + round)
+ / ctx->cfg.g_timebase.num / 10000000) + 1;
pkt.data.frame.duration = 0;
}
@@ -1074,7 +1084,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
#ifndef VERSION_STRING
#define VERSION_STRING
#endif
-vpx_codec_iface_t vpx_codec_vp8_cx_algo =
+CODEC_INTERFACE(vpx_codec_vp8_cx) =
{
"WebM Project VP8 Encoder" VERSION_STRING,
VPX_CODEC_INTERNAL_ABI_VERSION,
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index e7e535638..9964124d1 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -41,7 +41,7 @@ typedef enum
VP8_SEG_ALG_PRIV = 256,
VP8_SEG_MAX
} mem_seg_id_t;
-#define NELEMENTS(x) (sizeof(x)/sizeof(x[0]))
+#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
@@ -170,7 +170,7 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
}
}
-static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, int id)
+static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id)
{
int i;
@@ -253,8 +253,11 @@ static vpx_codec_err_t vp8_peek_si(const uint8_t *data,
unsigned int data_sz,
vpx_codec_stream_info_t *si)
{
-
vpx_codec_err_t res = VPX_CODEC_OK;
+
+ if(data + data_sz <= data)
+ res = VPX_CODEC_INVALID_PARAM;
+ else
{
/* Parse uncompresssed part of key frame header.
* 3 bytes:- including version, frame type and an offset
@@ -269,14 +272,14 @@ static vpx_codec_err_t vp8_peek_si(const uint8_t *data,
const uint8_t *c = data + 3;
si->is_kf = 1;
- // vet via sync code
+ /* vet via sync code */
if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
res = VPX_CODEC_UNSUP_BITSTREAM;
si->w = swap2(*(const unsigned short *)(c + 3)) & 0x3fff;
si->h = swap2(*(const unsigned short *)(c + 5)) & 0x3fff;
- //printf("w=%d, h=%d\n", si->w, si->h);
+ /*printf("w=%d, h=%d\n", si->w, si->h);*/
if (!(si->h | si->w))
res = VPX_CODEC_UNSUP_BITSTREAM;
}
@@ -331,7 +334,10 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
ctx->img_avail = 0;
- /* Determine the stream parameters */
+ /* Determine the stream parameters. Note that we rely on peek_si to
+ * validate that we have a buffer that does not wrap around the top
+ * of the heap.
+ */
if (!ctx->si.h)
res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
@@ -653,7 +659,7 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
#ifndef VERSION_STRING
#define VERSION_STRING
#endif
-vpx_codec_iface_t vpx_codec_vp8_dx_algo =
+CODEC_INTERFACE(vpx_codec_vp8_dx) =
{
"WebM Project VP8 Decoder" VERSION_STRING,
VPX_CODEC_INTERNAL_ABI_VERSION,
@@ -670,7 +676,14 @@ vpx_codec_iface_t vpx_codec_vp8_dx_algo =
vp8_decode, /* vpx_codec_decode_fn_t decode; */
vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
},
- {NOT_IMPLEMENTED} /* encoder functions */
+ { /* encoder functions */
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED
+ }
};
/*
@@ -693,5 +706,12 @@ vpx_codec_iface_t vpx_codec_vp8_algo =
vp8_decode, /* vpx_codec_decode_fn_t decode; */
vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
},
- {NOT_IMPLEMENTED} /* encoder functions */
+ { /* encoder functions */
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED
+ }
};
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 4ce18b6e7..683d785e6 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -82,6 +82,8 @@ VP8_CX_SRCS-yes += encoder/treewriter.c
VP8_CX_SRCS-yes += encoder/variance_c.c
VP8_CX_SRCS-$(CONFIG_PSNR) += common/postproc.h
VP8_CX_SRCS-$(CONFIG_PSNR) += common/postproc.c
+VP8_CX_SRCS-yes += encoder/temporal_filter.c
+VP8_CX_SRCS-yes += encoder/temporal_filter.h
ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
@@ -104,8 +106,11 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 1424bd15a..da27e0897 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -13,17 +13,21 @@
#File list for arm
# encoder
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/csystemdependent.c
+VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/encodemb_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/boolhuff_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/mcomp_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
-VP8_CX_SRCS_REMOVE-$(HAVE_ARMV6) += encoder/generic/csystemdependent.c
-VP8_CX_SRCS_REMOVE-$(HAVE_ARMV7) += encoder/boolhuff.c
-VP8_CX_SRCS_REMOVE-$(HAVE_ARMV7) += encoder/mcomp.c
+VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE) += encoder/boolhuff.c
+
+#File list for armv5te
+# encoder
+VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
#File list for armv6
# encoder
@@ -44,10 +48,6 @@ VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance8x8_neon$(ASM
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_packtokens_armv7$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_packtokens_mbrow_armv7$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_packtokens_partitions_armv7$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/boolhuff_armv7$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/vpx_vp8_enc_asm_offsets.c
diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk
index 941961708..1acd67453 100644
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -67,6 +67,8 @@ VP8_DX_SRCS-yes += decoder/treereader.h
VP8_DX_SRCS-yes += decoder/onyxd_if.c
VP8_DX_SRCS-yes += decoder/threading.c
VP8_DX_SRCS-yes += decoder/idct_blk.c
+VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h
+VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c
VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk
index 989232cd3..0803a9cb0 100644
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -11,11 +11,9 @@
#VP8_DX_SRCS list is modified according to different platforms.
+VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/arm_dsystemdependent.c
+
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dequantize_arm.c
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dsystemdependent.c
-VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/generic/dsystemdependent.c
-VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/dequantize.c
-VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/idct_blk.c
VP8_DX_SRCS-$(CONFIG_ARM_ASM_DETOK) += decoder/arm/detokenize$(ASM)
#File list for armv6
@@ -25,7 +23,10 @@ VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c
#File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index ab4cad10c..dcb451dca 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -389,6 +389,20 @@ struct vpx_codec_priv
#define RECAST(id, x) id##__convert(x)
+/* CODEC_INTERFACE convenience macro
+ *
+ * By convention, each codec interface is a struct with extern linkage, where
+ * the symbol is suffixed with _algo. A getter function is also defined to
+ * return a pointer to the struct, since in some cases it's easier to work
+ * with text symbols than data symbols (see issue #169). This function has
+ * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE
+ * macro is provided to define this getter function automatically.
+ */
+#define CODEC_INTERFACE(id)\
+vpx_codec_iface_t* id(void) { return &id##_algo; }\
+vpx_codec_iface_t id##_algo
+
+
/* Internal Utility Functions
*
* The following functions are indended to be used inside algorithms as
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index ddbd65484..10929590b 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -303,7 +303,7 @@ const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get(struct vpx_codec_pkt_list *list
pkt = (const void *) * iter;
- if (pkt - list->pkts < list->cnt)
+ if ((size_t)(pkt - list->pkts) < list->cnt)
*iter = pkt + 1;
else
pkt = NULL;
diff --git a/vpx/vp8.h b/vpx/vp8.h
index c7553ec22..d7ed8d8c1 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -53,7 +53,7 @@ enum vp8_postproc_level
VP8_NOFILTERING = 0,
VP8_DEBLOCK = 1,
VP8_DEMACROBLOCK = 2,
- VP8_ADDNOISE = 4,
+ VP8_ADDNOISE = 4
};
/*!\brief post process flags
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index e1c821144..efd79459a 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -29,7 +29,8 @@
* This interface provides the capability to encode raw VP8 streams, as would
* be found in AVI files.
*/
-extern vpx_codec_iface_t vpx_codec_vp8_cx_algo;
+extern vpx_codec_iface_t vpx_codec_vp8_cx_algo;
+extern vpx_codec_iface_t* vpx_codec_vp8_cx(void);
/*
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 4cad838ff..fccd407f3 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -29,7 +29,8 @@
* This interface provides the capability to decode raw VP8 streams, as would
* be found in AVI files and other non-Flash uses.
*/
-extern vpx_codec_iface_t vpx_codec_vp8_dx_algo;
+extern vpx_codec_iface_t vpx_codec_vp8_dx_algo;
+extern vpx_codec_iface_t* vpx_codec_vp8_dx(void);
/* Include controls common to both the encoder and decoder */
#include "vp8.h"
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index 371df00c3..899b27cca 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -62,7 +62,7 @@ extern "C" {
/*!\brief Decorator indicating a function is potentially unused */
#ifdef UNUSED
#elif __GNUC__
-#define UNUSED __attribute__ ((unused));
+#define UNUSED __attribute__ ((unused))
#else
#define UNUSED
#endif
@@ -128,7 +128,7 @@ extern "C" {
/*!\brief An iterator reached the end of list.
*
*/
- VPX_CODEC_LIST_END,
+ VPX_CODEC_LIST_END
}
vpx_codec_err_t;
diff --git a/vpx/vpx_decoder_compat.h b/vpx/vpx_decoder_compat.h
index 9e1e49222..ca6f61849 100644
--- a/vpx/vpx_decoder_compat.h
+++ b/vpx/vpx_decoder_compat.h
@@ -78,7 +78,7 @@ extern "C" {
/*!\brief An iterator reached the end of list.
*
*/
- VPX_DEC_LIST_END = VPX_CODEC_LIST_END,
+ VPX_DEC_LIST_END = VPX_CODEC_LIST_END
}
vpx_dec_err_t;
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index f894cd871..3acb19945 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -171,7 +171,7 @@ extern "C" {
{
VPX_RC_ONE_PASS, /**< Single pass mode */
VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */
- VPX_RC_LAST_PASS, /**< Final pass of multi-pass mode */
+ VPX_RC_LAST_PASS /**< Final pass of multi-pass mode */
};
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index 4506dd3b2..dcb8f31bc 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -55,7 +55,7 @@ extern "C" {
VPX_IMG_FMT_YV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2,
VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with vpx color space */
- VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, /** < planar 4:2:0 format with vpx color space */
+ VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4 /** < planar 4:2:0 format with vpx color space */
}
vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
diff --git a/vpx_mem/include/vpx_mem_intrnl.h b/vpx_mem/include/vpx_mem_intrnl.h
index 00f9c90e1..6e261ba7f 100644
--- a/vpx_mem/include/vpx_mem_intrnl.h
+++ b/vpx_mem/include/vpx_mem_intrnl.h
@@ -15,24 +15,24 @@
#ifndef CONFIG_MEM_MANAGER
# if defined(VXWORKS)
-# define CONFIG_MEM_MANAGER 1 //include heap manager functionality,
-//default: enabled on vxworks
+# define CONFIG_MEM_MANAGER 1 /*include heap manager functionality,*/
+/*default: enabled on vxworks*/
# else
-# define CONFIG_MEM_MANAGER 0 //include heap manager functionality
+# define CONFIG_MEM_MANAGER 0 /*include heap manager functionality*/
# endif
#endif /*CONFIG_MEM_MANAGER*/
#ifndef CONFIG_MEM_TRACKER
-# define CONFIG_MEM_TRACKER 1 //include xvpx_* calls in the lib
+# define CONFIG_MEM_TRACKER 1 /*include xvpx_* calls in the lib*/
#endif
#ifndef CONFIG_MEM_CHECKS
-# define CONFIG_MEM_CHECKS 0 //include some basic safety checks in
-//vpx_memcpy, _memset, and _memmove
+# define CONFIG_MEM_CHECKS 0 /*include some basic safety checks in
+vpx_memcpy, _memset, and _memmove*/
#endif
#ifndef USE_GLOBAL_FUNCTION_POINTERS
-# define USE_GLOBAL_FUNCTION_POINTERS 0 //use function pointers instead of compiled functions.
+# define USE_GLOBAL_FUNCTION_POINTERS 0 /*use function pointers instead of compiled functions.*/
#endif
#if CONFIG_MEM_TRACKER
@@ -46,9 +46,9 @@
#ifndef DEFAULT_ALIGNMENT
# if defined(VXWORKS)
-# define DEFAULT_ALIGNMENT 32 //default addr alignment to use in
-//calls to vpx_* functions other
-//than vpx_memalign
+# define DEFAULT_ALIGNMENT 32 /*default addr alignment to use in
+ calls to vpx_* functions other
+ than vpx_memalign*/
# else
# define DEFAULT_ALIGNMENT 1
# endif
@@ -59,24 +59,24 @@
#endif
#if CONFIG_MEM_TRACKER
-# define TRY_BOUNDS_CHECK 1 //when set to 1 pads each allocation,
-//integrity can be checked using
-//vpx_memory_tracker_check_integrity
-//or on free by defining
-//TRY_BOUNDS_CHECK_ON_FREE
+# define TRY_BOUNDS_CHECK 1 /*when set to 1 pads each allocation,
+ integrity can be checked using
+ vpx_memory_tracker_check_integrity
+ or on free by defining*/
+/*TRY_BOUNDS_CHECK_ON_FREE*/
#else
# define TRY_BOUNDS_CHECK 0
#endif /*CONFIG_MEM_TRACKER*/
#if TRY_BOUNDS_CHECK
-# define TRY_BOUNDS_CHECK_ON_FREE 0 //checks mem integrity on every
-//free, very expensive
-# define BOUNDS_CHECK_VALUE 0xdeadbeef //value stored before/after ea.
-//mem addr for bounds checking
-# define BOUNDS_CHECK_PAD_SIZE 32 //size of the padding before and
-//after ea allocation to be filled
-//with BOUNDS_CHECK_VALUE.
-//this should be a multiple of 4
+# define TRY_BOUNDS_CHECK_ON_FREE 0 /*checks mem integrity on every
+ free, very expensive*/
+# define BOUNDS_CHECK_VALUE 0xdeadbeef /*value stored before/after ea.
+ mem addr for bounds checking*/
+# define BOUNDS_CHECK_PAD_SIZE 32 /*size of the padding before and
+ after ea allocation to be filled
+ with BOUNDS_CHECK_VALUE.
+ this should be a multiple of 4*/
#else
# define BOUNDS_CHECK_VALUE 0
# define BOUNDS_CHECK_PAD_SIZE 0
diff --git a/vpx_mem/vpx_mem.c b/vpx_mem/vpx_mem.c
index 85b05ab9f..eade43222 100644
--- a/vpx_mem/vpx_mem.c
+++ b/vpx_mem/vpx_mem.c
@@ -31,7 +31,7 @@ static unsigned long g_alloc_count = 0;
# include "hmm_intrnl.h"
# define SHIFT_HMM_ADDR_ALIGN_UNIT 5
-# define TOTAL_MEMORY_TO_ALLOCATE 20971520 // 20 * 1024 * 1024
+# define TOTAL_MEMORY_TO_ALLOCATE 20971520 /* 20 * 1024 * 1024 */
# define MM_DYNAMIC_MEMORY 1
# if MM_DYNAMIC_MEMORY
@@ -48,7 +48,7 @@ static int g_mng_memory_allocated = 0;
static int vpx_mm_create_heap_memory();
static void *vpx_mm_realloc(void *memblk, size_t size);
-#endif //CONFIG_MEM_MANAGER
+#endif /*CONFIG_MEM_MANAGER*/
#if USE_GLOBAL_FUNCTION_POINTERS
struct GLOBAL_FUNC_POINTERS
@@ -75,7 +75,7 @@ struct GLOBAL_FUNC_POINTERS
# define VPX_MEMCPY_L memcpy
# define VPX_MEMSET_L memset
# define VPX_MEMMOVE_L memmove
-#endif // USE_GLOBAL_FUNCTION_POINTERS
+#endif /* USE_GLOBAL_FUNCTION_POINTERS */
unsigned int vpx_mem_get_version()
{
@@ -130,7 +130,7 @@ void *vpx_memalign(size_t align, size_t size)
addr = hmm_alloc(&hmm_d, number_aau);
#else
addr = VPX_MALLOC_L(size + align - 1 + ADDRESS_STORAGE_SIZE);
-#endif //CONFIG_MEM_MANAGER
+#endif /*CONFIG_MEM_MANAGER*/
if (addr)
{
@@ -269,7 +269,7 @@ void *xvpx_memalign(size_t align, size_t size, char *file, int line)
}
#else
x = vpx_memalign(align, size);
-#endif //TRY_BOUNDS_CHECK
+#endif /*TRY_BOUNDS_CHECK*/
g_alloc_count++;
@@ -332,9 +332,10 @@ void *xvpx_realloc(void *memblk, size_t size, char *file, int line)
vpx_memory_tracker_check_integrity(file, line);
#endif
- //have to do this regardless of success, because
- //the memory that does get realloc'd may change
- //the bounds values of this block
+ /* have to do this regardless of success, because
+ * the memory that does get realloc'd may change
+ * the bounds values of this block
+ */
vpx_memory_tracker_remove((size_t)memblk);
#if TRY_BOUNDS_CHECK
@@ -364,7 +365,7 @@ void *xvpx_realloc(void *memblk, size_t size, char *file, int line)
}
#else
x = vpx_realloc(memblk, size);
-#endif //TRY_BOUNDS_CHECK
+#endif /*TRY_BOUNDS_CHECK*/
if (!memblk) ++g_alloc_count;
@@ -380,7 +381,7 @@ void xvpx_free(void *p_address, char *file, int line)
{
#if TRY_BOUNDS_CHECK
unsigned char *p_bounds_address = (unsigned char *)p_address;
- //p_bounds_address -= BOUNDS_CHECK_PAD_SIZE;
+ /*p_bounds_address -= BOUNDS_CHECK_PAD_SIZE;*/
#endif
#if !TRY_BOUNDS_CHECK_ON_FREE
@@ -394,8 +395,9 @@ void xvpx_free(void *p_address, char *file, int line)
vpx_memory_tracker_check_integrity(file, line);
#endif
- //if the addr isn't found in the list, assume it was allocated via
- //vpx_ calls not xvpx_, therefore it does not contain any padding
+ /* if the addr isn't found in the list, assume it was allocated via
+ * vpx_ calls not xvpx_, therefore it does not contain any padding
+ */
if (vpx_memory_tracker_remove((size_t)p_address) == -2)
{
p_bounds_address = p_address;
@@ -421,7 +423,7 @@ void xvpx_free(void *p_address, char *file, int line)
#if CONFIG_MEM_CHECKS
#if defined(VXWORKS)
-#include <task_lib.h> //for task_delay()
+#include <task_lib.h> /*for task_delay()*/
/* This function is only used to get a stack trace of the player
object so we can se where we are having a problem. */
static int get_my_tt(int task)
@@ -627,7 +629,7 @@ static void *vpx_mm_realloc(void *memblk, size_t size)
return p_ret;
}
-#endif //CONFIG_MEM_MANAGER
+#endif /*CONFIG_MEM_MANAGER*/
#if USE_GLOBAL_FUNCTION_POINTERS
# if CONFIG_MEM_TRACKER
@@ -639,7 +641,7 @@ extern int vpx_memory_tracker_set_functions(g_malloc_func g_malloc_l
, g_memset_func g_memset_l
, g_memmove_func g_memmove_l);
# endif
-#endif //USE_GLOBAL_FUNCTION_POINTERS
+#endif /*USE_GLOBAL_FUNCTION_POINTERS*/
int vpx_mem_set_functions(g_malloc_func g_malloc_l
, g_calloc_func g_calloc_l
, g_realloc_func g_realloc_l
diff --git a/vpx_mem/vpx_mem.h b/vpx_mem/vpx_mem.h
index 31f8f9c60..749eaa42e 100644
--- a/vpx_mem/vpx_mem.h
+++ b/vpx_mem/vpx_mem.h
@@ -26,15 +26,15 @@
/* end - vpx_mem version info */
#ifndef VPX_TRACK_MEM_USAGE
-# define VPX_TRACK_MEM_USAGE 0 //enable memory tracking/integrity checks
+# define VPX_TRACK_MEM_USAGE 0 /* enable memory tracking/integrity checks */
#endif
#ifndef VPX_CHECK_MEM_FUNCTIONS
-# define VPX_CHECK_MEM_FUNCTIONS 0 //enable basic safety checks in _memcpy,
-//_memset, and _memmove
+# define VPX_CHECK_MEM_FUNCTIONS 0 /* enable basic safety checks in _memcpy,
+ _memset, and _memmove */
#endif
#ifndef REPLACE_BUILTIN_FUNCTIONS
-# define REPLACE_BUILTIN_FUNCTIONS 0 //replace builtin functions with their
-//vpx_ equivalents
+# define REPLACE_BUILTIN_FUNCTIONS 0 /* replace builtin functions with their
+ vpx_ equivalents */
#endif
#include <stdlib.h>
@@ -74,7 +74,7 @@ extern "C" {
void *vpx_memset(void *dest, int val, size_t length);
void *vpx_memmove(void *dest, const void *src, size_t count);
-// special memory functions
+ /* special memory functions */
void *vpx_mem_alloc(int id, size_t size, size_t align);
void vpx_mem_free(int id, void *mem, size_t size);
diff --git a/vpx_ports/arm.h b/vpx_ports/arm.h
new file mode 100644
index 000000000..81af1f11f
--- /dev/null
+++ b/vpx_ports/arm.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_PORTS_ARM_H
+#define VPX_PORTS_ARM_H
+#include <stdlib.h>
+#include "config.h"
+
+/*ARMv5TE "Enhanced DSP" instructions.*/
+#define HAS_EDSP 0x01
+/*ARMv6 "Parallel" or "Media" instructions.*/
+#define HAS_MEDIA 0x02
+/*ARMv7 optional NEON instructions.*/
+#define HAS_NEON 0x04
+
+int arm_cpu_caps(void);
+
+#endif
+
diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c
new file mode 100644
index 000000000..4109924cf
--- /dev/null
+++ b/vpx_ports/arm_cpudetect.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "arm.h"
+
+static int arm_cpu_env_flags(int *flags)
+{
+ char *env;
+ env = getenv("VPX_SIMD_CAPS");
+ if (env && *env)
+ {
+ *flags = (int)strtol(env, NULL, 0);
+ return 0;
+ }
+ *flags = 0;
+ return -1;
+}
+
+static int arm_cpu_env_mask(void)
+{
+ char *env;
+ env = getenv("VPX_SIMD_CAPS_MASK");
+ return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
+
+
+#if defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+#define WIN32_LEAN_AND_MEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+
+int arm_cpu_caps(void)
+{
+ int flags;
+ int mask;
+ if (!arm_cpu_env_flags(&flags))
+ {
+ return flags;
+ }
+ mask = arm_cpu_env_mask();
+ /* MSVC has no inline __asm support for ARM, but it does let you __emit
+ * instructions via their assembled hex code.
+ * All of these instructions should be essentially nops.
+ */
+#if defined(HAVE_ARMV5TE)
+ if (mask & HAS_EDSP)
+ {
+ __try
+ {
+ /*PLD [r13]*/
+ __emit(0xF5DDF000);
+ flags |= HAS_EDSP;
+ }
+ __except(GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION)
+ {
+ /*Ignore exception.*/
+ }
+ }
+#if defined(HAVE_ARMV6)
+    if (mask & HAS_MEDIA) {
+ __try
+ {
+ /*SHADD8 r3,r3,r3*/
+ __emit(0xE6333F93);
+ flags |= HAS_MEDIA;
+ }
+ __except(GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION)
+ {
+ /*Ignore exception.*/
+ }
+ }
+#if defined(HAVE_ARMV7)
+ if (mask & HAS_NEON)
+ {
+ __try
+ {
+ /*VORR q0,q0,q0*/
+ __emit(0xF2200150);
+ flags |= HAS_NEON;
+ }
+ __except(GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION)
+ {
+ /*Ignore exception.*/
+ }
+ }
+#endif
+#endif
+#endif
+ return flags & mask;
+}
+
+#elif defined(__linux__)
+#include <stdio.h>
+
+int arm_cpu_caps(void)
+{
+ FILE *fin;
+ int flags;
+ int mask;
+ if (!arm_cpu_env_flags(&flags))
+ {
+ return flags;
+ }
+ mask = arm_cpu_env_mask();
+ /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
+ * on Android.
+ * This also means that detection will fail in Scratchbox.
+ */
+ fin = fopen("/proc/cpuinfo","r");
+ if(fin != NULL)
+ {
+ /* 512 should be enough for anybody (it's even enough for all the flags
+ * that x86 has accumulated... so far).
+ */
+ char buf[512];
+ while (fgets(buf, 511, fin) != NULL)
+ {
+#if defined(HAVE_ARMV5TE) || defined(HAVE_ARMV7)
+ if (memcmp(buf, "Features", 8) == 0)
+ {
+ char *p;
+#if defined(HAVE_ARMV5TE)
+ p=strstr(buf, " edsp");
+ if (p != NULL && (p[5] == ' ' || p[5] == '\n'))
+ {
+ flags |= HAS_EDSP;
+ }
+#if defined(HAVE_ARMV7)
+ p = strstr(buf, " neon");
+ if (p != NULL && (p[5] == ' ' || p[5] == '\n'))
+ {
+ flags |= HAS_NEON;
+ }
+#endif
+#endif
+ }
+#endif
+#if defined(HAVE_ARMV6)
+ if (memcmp(buf, "CPU architecture:",17) == 0){
+ int version;
+ version = atoi(buf+17);
+ if (version >= 6)
+ {
+ flags |= HAS_MEDIA;
+ }
+ }
+#endif
+ }
+ fclose(fin);
+ }
+ return flags & mask;
+}
+
+#elif !CONFIG_RUNTIME_CPU_DETECT
+
+int arm_cpu_caps(void)
+{
+ int flags;
+ int mask;
+ if (!arm_cpu_env_flags(&flags))
+ {
+ return flags;
+ }
+ mask = arm_cpu_env_mask();
+#if defined(HAVE_ARMV5TE)
+ flags |= HAS_EDSP;
+#endif
+#if defined(HAVE_ARMV6)
+ flags |= HAS_MEDIA;
+#endif
+#if defined(HAVE_ARMV7)
+ flags |= HAS_NEON;
+#endif
+ return flags & mask;
+}
+
+#else
+#error "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+ "available for your platform. Reconfigure without --enable-runtime-cpu-detect."
+#endif
diff --git a/vpx_ports/emms.asm b/vpx_ports/emms.asm
index 87eece84e..306e235ce 100644
--- a/vpx_ports/emms.asm
+++ b/vpx_ports/emms.asm
@@ -9,7 +9,7 @@
;
-%include "x86_abi_support.asm"
+%include "vpx_ports/x86_abi_support.asm"
section .text
global sym(vpx_reset_mmx_state)
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index a8e4607cd..190c8643a 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -14,6 +14,26 @@
#include <stdlib.h>
#include "config.h"
+typedef enum
+{
+ VPX_CPU_UNKNOWN = -1,
+ VPX_CPU_AMD,
+ VPX_CPU_AMD_OLD,
+ VPX_CPU_CENTAUR,
+ VPX_CPU_CYRIX,
+ VPX_CPU_INTEL,
+ VPX_CPU_NEXGEN,
+ VPX_CPU_NSC,
+ VPX_CPU_RISE,
+ VPX_CPU_SIS,
+ VPX_CPU_TRANSMETA,
+ VPX_CPU_TRANSMETA_OLD,
+ VPX_CPU_UMC,
+ VPX_CPU_VIA,
+
+ VPX_CPU_LAST
+} vpx_cpu_t;
+
#if defined(__GNUC__) && __GNUC__
#if ARCH_X86_64
#define cpuid(func,ax,bx,cx,dx)\
@@ -24,12 +44,11 @@
#else
#define cpuid(func,ax,bx,cx,dx)\
__asm__ __volatile__ (\
- "pushl %%ebx \n\t" \
- "cpuid \n\t" \
- "movl %%ebx, %1 \n\t" \
- "popl %%ebx \n\t" \
- : "=a" (ax), "=r" (bx), "=c" (cx), "=d" (dx) \
- : "a" (func));
+ "mov %%ebx, %%edi \n\t" \
+ "cpuid \n\t" \
+ "xchg %%edi, %%ebx \n\t" \
+ : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
+ : "a" (func));
#endif
#else
#if ARCH_X86_64
@@ -55,6 +74,7 @@ void __cpuid(int CPUInfo[4], int info_type);
#define HAS_SSE2 0x04
#define HAS_SSE3 0x08
#define HAS_SSSE3 0x10
+#define HAS_SSE4_1 0x20
#ifndef BIT
#define BIT(n) (1<<n)
#endif
@@ -98,9 +118,12 @@ x86_simd_caps(void)
if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
+ if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
+
return flags & mask;
}
+vpx_cpu_t vpx_x86_vendor(void);
#if ARCH_X86_64 && defined(_MSC_VER)
unsigned __int64 __rdtsc(void);
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index dc9e2d92c..a872b280e 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -36,6 +36,43 @@
%define rsp esp
%define rbp ebp
%define movsxd mov
+%macro movq 2
+ %ifidn %1,eax
+ movd %1,%2
+ %elifidn %2,eax
+ movd %1,%2
+ %elifidn %1,ebx
+ movd %1,%2
+ %elifidn %2,ebx
+ movd %1,%2
+ %elifidn %1,ecx
+ movd %1,%2
+ %elifidn %2,ecx
+ movd %1,%2
+ %elifidn %1,edx
+ movd %1,%2
+ %elifidn %2,edx
+ movd %1,%2
+ %elifidn %1,esi
+ movd %1,%2
+ %elifidn %2,esi
+ movd %1,%2
+ %elifidn %1,edi
+ movd %1,%2
+ %elifidn %2,edi
+ movd %1,%2
+ %elifidn %1,esp
+ movd %1,%2
+ %elifidn %2,esp
+ movd %1,%2
+ %elifidn %1,ebp
+ movd %1,%2
+ %elifidn %2,ebp
+ movd %1,%2
+ %else
+ movq %1,%2
+ %endif
+%endmacro
%endif
@@ -123,7 +160,7 @@
ret
%%exitGG:
%undef GLOBAL
- %define GLOBAL + %1 wrt ..gotoff
+ %define GLOBAL(x) x + %1 wrt ..gotoff
%undef RESTORE_GOT
%define RESTORE_GOT pop %1
%endmacro
@@ -139,7 +176,7 @@
ret
%%exitGG:
%undef GLOBAL
- %define GLOBAL + %1 - fake_got
+ %define GLOBAL(x) x + %1 - fake_got
%undef RESTORE_GOT
%define RESTORE_GOT pop %1
%endmacro
@@ -149,7 +186,7 @@
%else
%macro GET_GOT 1
%endmacro
- %define GLOBAL wrt rip
+ %define GLOBAL(x) rel x
%ifidn __OUTPUT_FORMAT__,elf64
%define WRT_PLT wrt ..plt
%define HIDDEN_DATA(x) x:data hidden
@@ -160,7 +197,7 @@
%ifnmacro GET_GOT
%macro GET_GOT 1
%endmacro
- %define GLOBAL
+ %define GLOBAL(x) x
%endif
%ifndef RESTORE_GOT
%define RESTORE_GOT
diff --git a/vpx_ports/x86_cpuid.c b/vpx_ports/x86_cpuid.c
new file mode 100644
index 000000000..ce6403374
--- /dev/null
+++ b/vpx_ports/x86_cpuid.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "x86.h"
+
+struct cpuid_vendors
+{
+ char vendor_string[12];
+ vpx_cpu_t vendor_id;
+};
+
+static struct cpuid_vendors cpuid_vendor_list[VPX_CPU_LAST] =
+{
+ { "AuthenticAMD", VPX_CPU_AMD },
+ { "AMDisbetter!", VPX_CPU_AMD_OLD },
+ { "CentaurHauls", VPX_CPU_CENTAUR },
+ { "CyrixInstead", VPX_CPU_CYRIX },
+ { "GenuineIntel", VPX_CPU_INTEL },
+ { "NexGenDriven", VPX_CPU_NEXGEN },
+ { "Geode by NSC", VPX_CPU_NSC },
+ { "RiseRiseRise", VPX_CPU_RISE },
+ { "SiS SiS SiS ", VPX_CPU_SIS },
+ { "GenuineTMx86", VPX_CPU_TRANSMETA },
+ { "TransmetaCPU", VPX_CPU_TRANSMETA_OLD },
+ { "UMC UMC UMC ", VPX_CPU_UMC },
+ { "VIA VIA VIA ", VPX_CPU_VIA },
+};
+
+vpx_cpu_t vpx_x86_vendor(void)
+{
+ unsigned int reg_eax;
+ unsigned int vs[3];
+ int i;
+
+ /* Get the Vendor String from the CPU */
+ cpuid(0, reg_eax, vs[0], vs[2], vs[1]);
+
+ for (i = 0; i < VPX_CPU_LAST; i++)
+ {
+ if (strncmp ((const char *)vs, cpuid_vendor_list[i].vendor_string, 12) == 0)
+ return (cpuid_vendor_list[i].vendor_id);
+ }
+
+ return VPX_CPU_UNKNOWN;
+}
diff --git a/vpx_scale/arm/scalesystemdependant.c b/vpx_scale/arm/scalesystemdependant.c
index 1e8bcb89d..fee76fff7 100644
--- a/vpx_scale/arm/scalesystemdependant.c
+++ b/vpx_scale/arm/scalesystemdependant.c
@@ -10,6 +10,7 @@
#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
#include "vpx_scale/vpxscale.h"
@@ -47,6 +48,9 @@ extern void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CO
****************************************************************************/
void vp8_scale_machine_specific_config()
{
+#if HAVE_ARMV7 && CONFIG_RUNTIME_CPU_DETECT
+ int flags;
+#endif
/*
vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_armv4;
vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_armv4;
@@ -73,14 +77,20 @@ void vp8_scale_machine_specific_config()
vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c;
*/
-#if HAVE_ARMV7
- vp8_yv12_extend_frame_borders_ptr = vp8_yv12_extend_frame_borders_neon;
- vp8_yv12_copy_frame_yonly_ptr = vp8_yv12_copy_frame_yonly_neon;
- vp8_yv12_copy_frame_ptr = vp8_yv12_copy_frame_neon;
-#else
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
vp8_yv12_extend_frame_borders_ptr = vp8_yv12_extend_frame_borders;
vp8_yv12_copy_frame_yonly_ptr = vp8_yv12_copy_frame_yonly;
vp8_yv12_copy_frame_ptr = vp8_yv12_copy_frame;
#endif
-
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ flags = arm_cpu_caps();
+ if (flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_extend_frame_borders_ptr = vp8_yv12_extend_frame_borders_neon;
+ vp8_yv12_copy_frame_yonly_ptr = vp8_yv12_copy_frame_yonly_neon;
+ vp8_yv12_copy_frame_ptr = vp8_yv12_copy_frame_neon;
+ }
+#endif
}
diff --git a/vpx_scale/generic/gen_scalers.c b/vpx_scale/generic/gen_scalers.c
index b084e817b..b54e334cb 100644
--- a/vpx_scale/generic/gen_scalers.c
+++ b/vpx_scale/generic/gen_scalers.c
@@ -116,7 +116,7 @@ void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitc
des [dest_pitch*2] = (unsigned char)((b * 102 + c + 128) >> 8);
des [dest_pitch*3] = (unsigned char)((c + 102 * d + 128) >> 8);
- // First line in next band
+ /* First line in next band */
a = des [dest_pitch * 5];
des [dest_pitch * 4] = (unsigned char)((d * 205 + 51 * a + 128) >> 8);
@@ -163,7 +163,7 @@ void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest
des [dest_pitch*2] = (unsigned char)((b * 102 + c + 128) >> 8);
des [dest_pitch*3] = (unsigned char)((c + 102 * d + 128) >> 8);
- // No other line for interplation of this line, so ..
+    /* No other line for interpolation of this line, so .. */
des[dest_pitch*4] = (unsigned char) d;
des++;
@@ -401,7 +401,7 @@ void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitc
des [dest_pitch*2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
des [dest_pitch*3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
- // First line in next band...
+ /* First line in next band... */
a = des [dest_pitch * 5];
des [dest_pitch * 4] = (unsigned char)((c * 154 + a * 102 + 128) >> 8);
@@ -446,7 +446,7 @@ void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest
des [dest_pitch*2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
des [dest_pitch*3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
- // No other line for interplation of this line, so ..
+    /* No other line for interpolation of this line, so .. */
des [ dest_pitch * 4 ] = (unsigned char)(c) ;
des++;
@@ -549,7 +549,7 @@ void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitc
c = des[dest_pitch*2];
des [dest_pitch*2] = (unsigned char)((b + c + 1) >> 1);
- // First line in next band...
+ /* First line in next band... */
a = des [dest_pitch*4];
des [dest_pitch*3] = (unsigned char)((c * 192 + a * 64 + 128) >> 8);
@@ -593,7 +593,7 @@ void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest
c = des[dest_pitch*2];
des [dest_pitch*2] = (unsigned char)((b + c + 1) >> 1);
- // No other line for interplation of this line, so ..
+    /* No other line for interpolation of this line, so .. */
des [dest_pitch*3] = (unsigned char)(c);
des++;
diff --git a/vpx_scale/generic/vpxscale.c b/vpx_scale/generic/vpxscale.c
index e1c80281f..13c9122f0 100644
--- a/vpx_scale/generic/vpxscale.c
+++ b/vpx_scale/generic/vpxscale.c
@@ -279,9 +279,9 @@ void scale1d_c
(void) source_length;
- // These asserts are needed if there are boundary issues...
- //assert ( dest_scale > source_scale );
- //assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale );
+ /* These asserts are needed if there are boundary issues... */
+ /*assert ( dest_scale > source_scale );*/
+ /*assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale );*/
for (i = 0; i < dest_length * dest_step; i += dest_step)
{
@@ -334,7 +334,7 @@ void scale1d_c
static
void Scale2D
(
- //const
+ /*const*/
unsigned char *source,
int source_pitch,
unsigned int source_width,
@@ -352,7 +352,7 @@ void Scale2D
unsigned int interlaced
)
{
- //unsigned
+ /*unsigned*/
int i, j, k;
int bands;
int dest_band_height;
@@ -370,7 +370,7 @@ void Scale2D
int ratio_scalable = 1;
int interpolation = 0;
- unsigned char *source_base; // = (unsigned char *) ((source_pitch >= 0) ? source : (source + ((source_height-1) * source_pitch)));
+ unsigned char *source_base; /* = (unsigned char *) ((source_pitch >= 0) ? source : (source + ((source_height-1) * source_pitch))); */
unsigned char *line_src;
@@ -386,24 +386,24 @@ void Scale2D
source_base += offset;
}
- // find out the ratio for each direction
+ /* find out the ratio for each direction */
switch (hratio * 10 / hscale)
{
case 8:
- // 4-5 Scale in Width direction
+ /* 4-5 Scale in Width direction */
horiz_line_scale = vp8_horizontal_line_5_4_scale;
break;
case 6:
- // 3-5 Scale in Width direction
+ /* 3-5 Scale in Width direction */
horiz_line_scale = vp8_horizontal_line_5_3_scale;
break;
case 5:
- // 1-2 Scale in Width direction
+ /* 1-2 Scale in Width direction */
horiz_line_scale = vp8_horizontal_line_2_1_scale;
break;
default:
- // The ratio is not acceptable now
- // throw("The ratio is not acceptable for now!");
+ /* The ratio is not acceptable now */
+ /* throw("The ratio is not acceptable for now!"); */
ratio_scalable = 0;
break;
}
@@ -411,30 +411,30 @@ void Scale2D
switch (vratio * 10 / vscale)
{
case 8:
- // 4-5 Scale in vertical direction
+ /* 4-5 Scale in vertical direction */
vert_band_scale = vp8_vertical_band_5_4_scale;
source_band_height = 5;
dest_band_height = 4;
break;
case 6:
- // 3-5 Scale in vertical direction
+ /* 3-5 Scale in vertical direction */
vert_band_scale = vp8_vertical_band_5_3_scale;
source_band_height = 5;
dest_band_height = 3;
break;
case 5:
- // 1-2 Scale in vertical direction
+ /* 1-2 Scale in vertical direction */
if (interlaced)
{
- //if the content is interlaced, point sampling is used
+ /* if the content is interlaced, point sampling is used */
vert_band_scale = vp8_vertical_band_2_1_scale;
}
else
{
interpolation = 1;
- //if the content is progressive, interplo
+            /* if the content is progressive, interpolation is used */
vert_band_scale = vp8_vertical_band_2_1_scale_i;
}
@@ -443,8 +443,8 @@ void Scale2D
dest_band_height = 1;
break;
default:
- // The ratio is not acceptable now
- // throw("The ratio is not acceptable for now!");
+ /* The ratio is not acceptable now */
+ /* throw("The ratio is not acceptable for now!"); */
ratio_scalable = 0;
break;
}
@@ -453,7 +453,7 @@ void Scale2D
{
if (source_height == dest_height)
{
- // for each band of the image
+ /* for each band of the image */
for (k = 0; k < (int)dest_height; k++)
{
horiz_line_scale(source, source_width, dest, dest_width);
@@ -474,10 +474,10 @@ void Scale2D
for (k = 0; k < (int)(dest_height + dest_band_height - 1) / dest_band_height; k++)
{
- // scale one band horizontally
+ /* scale one band horizontally */
for (i = 0; i < source_band_height; i++)
{
- // Trap case where we could read off the base of the source buffer
+ /* Trap case where we could read off the base of the source buffer */
line_src = (unsigned char *)source + i * source_pitch;
@@ -488,13 +488,13 @@ void Scale2D
temp_area + (i + 1)*dest_pitch, dest_width);
}
- // Vertical scaling is in place
+ /* Vertical scaling is in place */
vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, dest_width);
if (interpolation)
vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
- // Next band...
+ /* Next band... */
source += (unsigned long) source_band_height * source_pitch;
dest += (unsigned long) dest_band_height * dest_pitch;
}
@@ -515,7 +515,7 @@ void Scale2D
if (source_height == dest_height)
{
- // for each band of the image
+ /* for each band of the image */
for (k = 0; k < (int)dest_height; k++)
{
Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width);
@@ -537,15 +537,15 @@ void Scale2D
dest_band_height = source_band_height * vratio / vscale;
}
- // first row needs to be done so that we can stay one row ahead for vertical zoom
+ /* first row needs to be done so that we can stay one row ahead for vertical zoom */
Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width);
- // for each band of the image
+ /* for each band of the image */
bands = (dest_height + dest_band_height - 1) / dest_band_height;
for (k = 0; k < bands; k++)
{
- // scale one band horizontally
+ /* scale one band horizontally */
for (i = 1; i < source_band_height + 1; i++)
{
if (k * source_band_height + i < (int) source_height)
@@ -553,24 +553,24 @@ void Scale2D
Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
temp_area + i * dest_pitch, 1, hratio, dest_width);
}
- else // Duplicate the last row
+ else /* Duplicate the last row */
{
- // copy temp_area row 0 over from last row in the past
+ /* copy temp_area row 0 over from last row in the past */
duck_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
}
}
- // scale one band vertically
+ /* scale one band vertically */
for (j = 0; j < (int)dest_width; j++)
{
Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
&dest[j], dest_pitch, vratio, dest_band_height);
}
- // copy temp_area row 0 over from last row in the past
+ /* copy temp_area row 0 over from last row in the past */
duck_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
- // move to the next band
+ /* move to the next band */
source += source_band_height * source_pitch;
dest += dest_band_height * dest_pitch;
}
@@ -617,7 +617,7 @@ void vp8_scale_frame
int dw = (hscale - 1 + src->y_width * hratio) / hscale;
int dh = (vscale - 1 + src->y_height * vratio) / vscale;
- // call our internal scaling routines!!
+ /* call our internal scaling routines!! */
Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height,
(unsigned char *) dst->y_buffer, dst->y_stride, dw, dh,
temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
@@ -696,13 +696,13 @@ int any_ratio_2d_scale
unsigned int src_band_height = 0;
unsigned int dest_band_height = 0;
- // suggested scale factors
+ /* suggested scale factors */
int hs = si->HScale;
int hr = si->HRatio;
int vs = si->VScale;
int vr = si->VRatio;
- // assume the ratios are scalable instead of should be centered
+ /* assume the ratios are scalable instead of should be centered */
int ratio_scalable = 1;
const unsigned char *source_base = ((source_pitch >= 0) ? source : (source + ((source_height - 1) * source_pitch)));
@@ -714,37 +714,37 @@ int any_ratio_2d_scale
(void) si;
- // find out the ratio for each direction
+ /* find out the ratio for each direction */
switch (hr * 30 / hs)
{
case 24:
- // 4-5 Scale in Width direction
+ /* 4-5 Scale in Width direction */
horiz_line_scale = vp8_horizontal_line_4_5_scale;
break;
case 22:
- // 3-4 Scale in Width direction
+ /* 3-4 Scale in Width direction */
horiz_line_scale = vp8_horizontal_line_3_4_scale;
break;
case 20:
- // 4-5 Scale in Width direction
+ /* 4-5 Scale in Width direction */
horiz_line_scale = vp8_horizontal_line_2_3_scale;
break;
case 18:
- // 3-5 Scale in Width direction
+ /* 3-5 Scale in Width direction */
horiz_line_scale = vp8_horizontal_line_3_5_scale;
break;
case 15:
- // 1-2 Scale in Width direction
+ /* 1-2 Scale in Width direction */
horiz_line_scale = vp8_horizontal_line_1_2_scale;
break;
case 30:
- // no scale in Width direction
+ /* no scale in Width direction */
horiz_line_scale = horizontal_line_copy;
break;
default:
- // The ratio is not acceptable now
- // throw("The ratio is not acceptable for now!");
+ /* The ratio is not acceptable now */
+ /* throw("The ratio is not acceptable for now!"); */
ratio_scalable = 0;
break;
}
@@ -752,50 +752,50 @@ int any_ratio_2d_scale
switch (vr * 30 / vs)
{
case 24:
- // 4-5 Scale in vertical direction
+ /* 4-5 Scale in vertical direction */
vert_band_scale = vp8_vertical_band_4_5_scale;
last_vert_band_scale = vp8_last_vertical_band_4_5_scale;
src_band_height = 4;
dest_band_height = 5;
break;
case 22:
- // 3-4 Scale in vertical direction
+ /* 3-4 Scale in vertical direction */
vert_band_scale = vp8_vertical_band_3_4_scale;
last_vert_band_scale = vp8_last_vertical_band_3_4_scale;
src_band_height = 3;
dest_band_height = 4;
break;
case 20:
- // 2-3 Scale in vertical direction
+ /* 2-3 Scale in vertical direction */
vert_band_scale = vp8_vertical_band_2_3_scale;
last_vert_band_scale = vp8_last_vertical_band_2_3_scale;
src_band_height = 2;
dest_band_height = 3;
break;
case 18:
- // 3-5 Scale in vertical direction
+ /* 3-5 Scale in vertical direction */
vert_band_scale = vp8_vertical_band_3_5_scale;
last_vert_band_scale = vp8_last_vertical_band_3_5_scale;
src_band_height = 3;
dest_band_height = 5;
break;
case 15:
- // 1-2 Scale in vertical direction
+ /* 1-2 Scale in vertical direction */
vert_band_scale = vp8_vertical_band_1_2_scale;
last_vert_band_scale = vp8_last_vertical_band_1_2_scale;
src_band_height = 1;
dest_band_height = 2;
break;
case 30:
- // no scale in Width direction
+ /* no scale in Width direction */
vert_band_scale = null_scale;
last_vert_band_scale = null_scale;
src_band_height = 4;
dest_band_height = 4;
break;
default:
- // The ratio is not acceptable now
- // throw("The ratio is not acceptable for now!");
+ /* The ratio is not acceptable now */
+ /* throw("The ratio is not acceptable for now!"); */
ratio_scalable = 0;
break;
}
@@ -805,13 +805,13 @@ int any_ratio_2d_scale
horiz_line_scale(source, source_width, dest, dest_width);
- // except last band
+ /* except last band */
for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height - 1; k++)
{
- // scale one band horizontally
+ /* scale one band horizontally */
for (i = 1; i < src_band_height; i++)
{
- // Trap case where we could read off the base of the source buffer
+ /* Trap case where we could read off the base of the source buffer */
line_src = source + i * source_pitch;
if (line_src < source_base)
@@ -821,8 +821,8 @@ int any_ratio_2d_scale
dest + i * dest_pitch, dest_width);
}
- // first line of next band
- // Trap case where we could read off the base of the source buffer
+ /* first line of next band */
+ /* Trap case where we could read off the base of the source buffer */
line_src = source + src_band_height * source_pitch;
if (line_src < source_base)
@@ -832,18 +832,18 @@ int any_ratio_2d_scale
dest + dest_band_height * dest_pitch,
dest_width);
- // Vertical scaling is in place
+ /* Vertical scaling is in place */
vert_band_scale(dest, dest_pitch, dest_width);
- // Next band...
+ /* Next band... */
source += src_band_height * source_pitch;
dest += dest_band_height * dest_pitch;
}
- // scale one band horizontally
+ /* scale one band horizontally */
for (i = 1; i < src_band_height; i++)
{
- // Trap case where we could read off the base of the source buffer
+ /* Trap case where we could read off the base of the source buffer */
line_src = source + i * source_pitch;
if (line_src < source_base)
@@ -854,7 +854,7 @@ int any_ratio_2d_scale
dest_width);
}
- // Vertical scaling is in place
+ /* Vertical scaling is in place */
last_vert_band_scale(dest, dest_pitch, dest_width);
return ratio_scalable;
@@ -885,7 +885,7 @@ int any_ratio_frame_scale(SCALE_VARS *scale_vars, int YOffset, int UVOffset)
int ew;
int eh;
- // suggested scale factors
+ /* suggested scale factors */
int hs = scale_vars->HScale;
int hr = scale_vars->HRatio;
int vs = scale_vars->VScale;
@@ -968,11 +968,11 @@ center_image(YV12_BUFFER_CONFIG *src_yuv_config, YV12_BUFFER_CONFIG *dst_yuv_con
unsigned char *src_data_pointer;
unsigned char *dst_data_pointer;
- // center values
+ /* center values */
row_offset = (dst_yuv_config->y_height - src_yuv_config->y_height) / 2;
col_offset = (dst_yuv_config->y_width - src_yuv_config->y_width) / 2;
- // Y's
+ /* Y's */
src_data_pointer = src_yuv_config->y_buffer;
dst_data_pointer = (unsigned char *)dst_yuv_config->y_buffer + (row_offset * dst_yuv_config->y_stride) + col_offset;
@@ -986,7 +986,7 @@ center_image(YV12_BUFFER_CONFIG *src_yuv_config, YV12_BUFFER_CONFIG *dst_yuv_con
row_offset /= 2;
col_offset /= 2;
- // U's
+ /* U's */
src_data_pointer = src_yuv_config->u_buffer;
dst_data_pointer = (unsigned char *)dst_yuv_config->u_buffer + (row_offset * dst_yuv_config->uv_stride) + col_offset;
@@ -997,7 +997,7 @@ center_image(YV12_BUFFER_CONFIG *src_yuv_config, YV12_BUFFER_CONFIG *dst_yuv_con
src_data_pointer += src_yuv_config->uv_stride;
}
- // V's
+ /* V's */
src_data_pointer = src_yuv_config->v_buffer;
dst_data_pointer = (unsigned char *)dst_yuv_config->v_buffer + (row_offset * dst_yuv_config->uv_stride) + col_offset;
@@ -1040,8 +1040,8 @@ vp8_yv12_scale_or_center
int VRatio
)
{
-// if ( ppi->post_processing_level )
- // update_umvborder ( ppi, frame_buffer );
+ /*if ( ppi->post_processing_level )
+ update_umvborder ( ppi, frame_buffer );*/
switch (scaling_mode)
@@ -1050,12 +1050,12 @@ vp8_yv12_scale_or_center
case MAINTAIN_ASPECT_RATIO:
{
SCALE_VARS scale_vars;
- // center values
+ /* center values */
#if 1
int row = (dst_yuv_config->y_height - expanded_frame_height) / 2;
int col = (dst_yuv_config->y_width - expanded_frame_width) / 2;
-// int YOffset = row * dst_yuv_config->y_width + col;
-// int UVOffset = (row>>1) * dst_yuv_config->uv_width + (col>>1);
+ /*int YOffset = row * dst_yuv_config->y_width + col;
+ int UVOffset = (row>>1) * dst_yuv_config->uv_width + (col>>1);*/
int YOffset = row * dst_yuv_config->y_stride + col;
int UVOffset = (row >> 1) * dst_yuv_config->uv_stride + (col >> 1);
#else
@@ -1074,7 +1074,7 @@ vp8_yv12_scale_or_center
scale_vars.expanded_frame_width = expanded_frame_width;
scale_vars.expanded_frame_height = expanded_frame_height;
- // perform center and scale
+ /* perform center and scale */
any_ratio_frame_scale(&scale_vars, YOffset, UVOffset);
break;
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index 3034cc3a7..d9d228551 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -45,7 +45,7 @@ vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf)
int
vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border)
{
-//NOTE:
+/*NOTE:*/
int yplane_size = (height + 2 * border) * (width + 2 * border);
int uvplane_size = ((1 + height) / 2 + border) * ((1 + width) / 2 + border);
@@ -65,9 +65,10 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int
ybf->border = border;
ybf->frame_size = yplane_size + 2 * uvplane_size;
- // Added 2 extra lines to framebuffer so that copy12x12 doesn't fail
- // when we have a large motion vector in V on the last v block.
- // Note : We never use these pixels anyway so this doesn't hurt.
+ /* Added 2 extra lines to framebuffer so that copy12x12 doesn't fail
+ * when we have a large motion vector in V on the last v block.
+ * Note : We never use these pixels anyway so this doesn't hurt.
+ */
ybf->buffer_alloc = (unsigned char *) duck_memalign(32, ybf->frame_size + (ybf->y_stride * 2) + 32, 0);
if (ybf->buffer_alloc == NULL)
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index 39d2fa854..e58aa1fb2 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -40,7 +40,7 @@ vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
plane_height = ybf->y_height;
plane_width = ybf->y_width;
- // copy the left and right most columns out
+ /* copy the left and right most columns out */
src_ptr1 = ybf->y_buffer;
src_ptr2 = src_ptr1 + plane_width - 1;
dest_ptr1 = src_ptr1 - Border;
@@ -56,7 +56,7 @@ vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
dest_ptr2 += plane_stride;
}
- // Now copy the top and bottom source lines into each line of the respective borders
+ /* Now copy the top and bottom source lines into each line of the respective borders */
src_ptr1 = ybf->y_buffer - Border;
src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
dest_ptr1 = src_ptr1 - (Border * plane_stride);
@@ -79,7 +79,7 @@ vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
plane_width = ybf->uv_width;
Border /= 2;
- // copy the left and right most columns out
+ /* copy the left and right most columns out */
src_ptr1 = ybf->u_buffer;
src_ptr2 = src_ptr1 + plane_width - 1;
dest_ptr1 = src_ptr1 - Border;
@@ -95,7 +95,7 @@ vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
dest_ptr2 += plane_stride;
}
- // Now copy the top and bottom source lines into each line of the respective borders
+ /* Now copy the top and bottom source lines into each line of the respective borders */
src_ptr1 = ybf->u_buffer - Border;
src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
dest_ptr1 = src_ptr1 - (Border * plane_stride);
@@ -113,7 +113,7 @@ vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
/* V Plane */
/***********/
- // copy the left and right most columns out
+ /* copy the left and right most columns out */
src_ptr1 = ybf->v_buffer;
src_ptr2 = src_ptr1 + plane_width - 1;
dest_ptr1 = src_ptr1 - Border;
@@ -129,7 +129,7 @@ vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
dest_ptr2 += plane_stride;
}
- // Now copy the top and bottom source lines into each line of the respective borders
+ /* Now copy the top and bottom source lines into each line of the respective borders */
src_ptr1 = ybf->v_buffer - Border;
src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
dest_ptr1 = src_ptr1 - (Border * plane_stride);
@@ -165,7 +165,7 @@ vp8_yv12_extend_frame_borders_yonly(YV12_BUFFER_CONFIG *ybf)
plane_height = ybf->y_height;
plane_width = ybf->y_width;
- // copy the left and right most columns out
+ /* copy the left and right most columns out */
src_ptr1 = ybf->y_buffer;
src_ptr2 = src_ptr1 + plane_width - 1;
dest_ptr1 = src_ptr1 - Border;
@@ -181,7 +181,7 @@ vp8_yv12_extend_frame_borders_yonly(YV12_BUFFER_CONFIG *ybf)
dest_ptr2 += plane_stride;
}
- // Now copy the top and bottom source lines into each line of the respective borders
+ /* Now copy the top and bottom source lines into each line of the respective borders */
src_ptr1 = ybf->y_buffer - Border;
src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
dest_ptr1 = src_ptr1 - (Border * plane_stride);
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index 50d6e3b3a..5dcee818a 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -32,8 +32,8 @@ extern "C"
************************************/
typedef enum
{
- REG_YUV = 0, // Regular yuv
- INT_YUV = 1 // The type of yuv that can be tranfer to and from RGB through integer transform
+ REG_YUV = 0, /* Regular yuv */
+ INT_YUV = 1 /* The type of yuv that can be tranfer to and from RGB through integer transform */
}
YUV_TYPE;
@@ -42,12 +42,12 @@ extern "C"
int y_width;
int y_height;
int y_stride;
-// int yinternal_width;
+/* int yinternal_width; */
int uv_width;
int uv_height;
int uv_stride;
-// int uvinternal_width;
+/* int uvinternal_width; */
unsigned char *y_buffer;
unsigned char *u_buffer;
@@ -68,4 +68,4 @@ extern "C"
#endif
-#endif //YV12_CONFIG_H
+#endif /*YV12_CONFIG_H*/
diff --git a/vpxdec.c b/vpxdec.c
new file mode 100644
index 000000000..9b565b022
--- /dev/null
+++ b/vpxdec.c
@@ -0,0 +1,1043 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* This is a simple program that reads ivf files and decodes them
+ * using the new interface. Decoded frames are output as YV12 raw.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <limits.h>
+#if defined(_WIN32)
+#include <io.h>
+#define snprintf _snprintf
+#define isatty _isatty
+#define fileno _fileno
+#else
+#include <unistd.h>
+#endif
+#define VPX_CODEC_DISABLE_COMPAT 1
+#include "vpx_config.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx_ports/vpx_timer.h"
+#if CONFIG_VP8_DECODER
+#include "vpx/vp8dx.h"
+#endif
+#if CONFIG_MD5
+#include "md5_utils.h"
+#endif
+#include "tools_common.h"
+#include "nestegg/include/nestegg/nestegg.h"
+
+#ifndef PATH_MAX
+#define PATH_MAX 256
+#endif
+
+static const char *exec_name;
+
+#define VP8_FOURCC (0x00385056)
+static const struct
+{
+ char const *name;
+ const vpx_codec_iface_t *iface;
+ unsigned int fourcc;
+ unsigned int fourcc_mask;
+} ifaces[] =
+{
+#if CONFIG_VP8_DECODER
+ {"vp8", &vpx_codec_vp8_dx_algo, VP8_FOURCC, 0x00FFFFFF},
+#endif
+};
+
+#include "args.h"
+static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1,
+ "Codec to use");
+static const arg_def_t use_yv12 = ARG_DEF(NULL, "yv12", 0,
+ "Output raw YV12 frames");
+static const arg_def_t use_i420 = ARG_DEF(NULL, "i420", 0,
+ "Output raw I420 frames");
+static const arg_def_t flipuvarg = ARG_DEF(NULL, "flipuv", 0,
+ "Flip the chroma planes in the output");
+static const arg_def_t noblitarg = ARG_DEF(NULL, "noblit", 0,
+ "Don't process the decoded frames");
+static const arg_def_t progressarg = ARG_DEF(NULL, "progress", 0,
+ "Show progress after each frame decodes");
+static const arg_def_t limitarg = ARG_DEF(NULL, "limit", 1,
+ "Stop decoding after n frames");
+static const arg_def_t postprocarg = ARG_DEF(NULL, "postproc", 0,
+ "Postprocess decoded frames");
+static const arg_def_t summaryarg = ARG_DEF(NULL, "summary", 0,
+ "Show timing summary");
+static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
+ "Output file name pattern (see below)");
+static const arg_def_t threadsarg = ARG_DEF("t", "threads", 1,
+ "Max threads to use");
+static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0,
+ "Show version string");
+
+#if CONFIG_MD5
+static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
+ "Compute the MD5 sum of the decoded frame");
+#endif
+static const arg_def_t *all_args[] =
+{
+ &codecarg, &use_yv12, &use_i420, &flipuvarg, &noblitarg,
+ &progressarg, &limitarg, &postprocarg, &summaryarg, &outputfile,
+ &threadsarg, &verbosearg,
+#if CONFIG_MD5
+ &md5arg,
+#endif
+ NULL
+};
+
+#if CONFIG_VP8_DECODER
+static const arg_def_t addnoise_level = ARG_DEF(NULL, "noise-level", 1,
+ "Enable VP8 postproc add noise");
+static const arg_def_t deblock = ARG_DEF(NULL, "deblock", 0,
+ "Enable VP8 deblocking");
+static const arg_def_t demacroblock_level = ARG_DEF(NULL, "demacroblock-level", 1,
+ "Enable VP8 demacroblocking, w/ level");
+static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1,
+ "Enable VP8 visible debug info");
+
+
+static const arg_def_t *vp8_pp_args[] =
+{
+ &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
+ NULL
+};
+#endif
+
+static void usage_exit()
+{
+ int i;
+
+ fprintf(stderr, "Usage: %s <options> filename\n\n"
+ "Options:\n", exec_name);
+ arg_show_usage(stderr, all_args);
+#if CONFIG_VP8_DECODER
+ fprintf(stderr, "\nVP8 Postprocessing Options:\n");
+ arg_show_usage(stderr, vp8_pp_args);
+#endif
+ fprintf(stderr,
+ "\nOutput File Patterns:\n\n"
+ " The -o argument specifies the name of the file(s) to "
+ "write to. If the\n argument does not include any escape "
+ "characters, the output will be\n written to a single file. "
+ "Otherwise, the filename will be calculated by\n expanding "
+ "the following escape characters:\n"
+ "\n\t%%w - Frame width"
+ "\n\t%%h - Frame height"
+ "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
+ "\n\n Pattern arguments are only supported in conjunction "
+ "with the --yv12 and\n --i420 options. If the -o option is "
+ "not specified, the output will be\n directed to stdout.\n"
+ );
+ fprintf(stderr, "\nIncluded decoders:\n\n");
+
+ for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
+ fprintf(stderr, " %-6s - %s\n",
+ ifaces[i].name,
+ vpx_codec_iface_name(ifaces[i].iface));
+
+ exit(EXIT_FAILURE);
+}
+
+void die(const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ fprintf(stderr, "\n");
+ usage_exit();
+}
+
+static unsigned int mem_get_le16(const void *vmem)
+{
+ unsigned int val;
+ const unsigned char *mem = (const unsigned char *)vmem;
+
+ val = mem[1] << 8;
+ val |= mem[0];
+ return val;
+}
+
+static unsigned int mem_get_le32(const void *vmem)
+{
+ unsigned int val;
+ const unsigned char *mem = (const unsigned char *)vmem;
+
+ val = mem[3] << 24;
+ val |= mem[2] << 16;
+ val |= mem[1] << 8;
+ val |= mem[0];
+ return val;
+}
+
+enum file_kind
+{
+ RAW_FILE,
+ IVF_FILE,
+ WEBM_FILE
+};
+
+struct input_ctx
+{
+ enum file_kind kind;
+ FILE *infile;
+ nestegg *nestegg_ctx;
+ nestegg_packet *pkt;
+ unsigned int chunk;
+ unsigned int chunks;
+ unsigned int video_track;
+};
+
+#define IVF_FRAME_HDR_SZ (sizeof(uint32_t) + sizeof(uint64_t))
+#define RAW_FRAME_HDR_SZ (sizeof(uint32_t))
+static int read_frame(struct input_ctx *input,
+ uint8_t **buf,
+ size_t *buf_sz,
+ size_t *buf_alloc_sz)
+{
+ char raw_hdr[IVF_FRAME_HDR_SZ];
+ size_t new_buf_sz;
+ FILE *infile = input->infile;
+ enum file_kind kind = input->kind;
+ if(kind == WEBM_FILE)
+ {
+ if(input->chunk >= input->chunks)
+ {
+ unsigned int track;
+
+ do
+ {
+ /* End of this packet, get another. */
+ if(input->pkt)
+ nestegg_free_packet(input->pkt);
+
+ if(nestegg_read_packet(input->nestegg_ctx, &input->pkt) <= 0
+ || nestegg_packet_track(input->pkt, &track))
+ return 1;
+
+ } while(track != input->video_track);
+
+ if(nestegg_packet_count(input->pkt, &input->chunks))
+ return 1;
+ input->chunk = 0;
+ }
+
+ if(nestegg_packet_data(input->pkt, input->chunk, buf, buf_sz))
+ return 1;
+ input->chunk++;
+
+ return 0;
+ }
+ /* For both the raw and ivf formats, the frame size is the first 4 bytes
+ * of the frame header. We just need to special case on the header
+ * size.
+ */
+ else if (fread(raw_hdr, kind==IVF_FILE
+ ? IVF_FRAME_HDR_SZ : RAW_FRAME_HDR_SZ, 1, infile) != 1)
+ {
+ if (!feof(infile))
+ fprintf(stderr, "Failed to read frame size\n");
+
+ new_buf_sz = 0;
+ }
+ else
+ {
+ new_buf_sz = mem_get_le32(raw_hdr);
+
+ if (new_buf_sz > 256 * 1024 * 1024)
+ {
+ fprintf(stderr, "Error: Read invalid frame size (%u)\n",
+ (unsigned int)new_buf_sz);
+ new_buf_sz = 0;
+ }
+
+ if (kind == RAW_FILE && new_buf_sz > 256 * 1024)
+ fprintf(stderr, "Warning: Read invalid frame size (%u)"
+ " - not a raw file?\n", (unsigned int)new_buf_sz);
+
+ if (new_buf_sz > *buf_alloc_sz)
+ {
+ uint8_t *new_buf = realloc(*buf, 2 * new_buf_sz);
+
+ if (new_buf)
+ {
+ *buf = new_buf;
+ *buf_alloc_sz = 2 * new_buf_sz;
+ }
+ else
+ {
+ fprintf(stderr, "Failed to allocate compressed data buffer\n");
+ new_buf_sz = 0;
+ }
+ }
+ }
+
+ *buf_sz = new_buf_sz;
+
+ if (*buf_sz)
+ {
+ if (fread(*buf, 1, *buf_sz, infile) != *buf_sz)
+ {
+ fprintf(stderr, "Failed to read full frame\n");
+ return 1;
+ }
+
+ return 0;
+ }
+
+ return 1;
+}
+
+void *out_open(const char *out_fn, int do_md5)
+{
+ void *out = NULL;
+
+ if (do_md5)
+ {
+#if CONFIG_MD5
+ MD5Context *md5_ctx = out = malloc(sizeof(MD5Context));
+ (void)out_fn;
+ MD5Init(md5_ctx);
+#endif
+ }
+ else
+ {
+ FILE *outfile = out = strcmp("-", out_fn) ? fopen(out_fn, "wb")
+ : set_binary_mode(stdout);
+
+ if (!outfile)
+ {
+ fprintf(stderr, "Failed to output file");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ return out;
+}
+
+void out_put(void *out, const uint8_t *buf, unsigned int len, int do_md5)
+{
+ if (do_md5)
+ {
+#if CONFIG_MD5
+ MD5Update(out, buf, len);
+#endif
+ }
+ else
+ {
+ if(fwrite(buf, 1, len, out));
+ }
+}
+
+void out_close(void *out, const char *out_fn, int do_md5)
+{
+ if (do_md5)
+ {
+#if CONFIG_MD5
+ uint8_t md5[16];
+ int i;
+
+ MD5Final(md5, out);
+ free(out);
+
+ for (i = 0; i < 16; i++)
+ printf("%02x", md5[i]);
+
+ printf(" %s\n", out_fn);
+#endif
+ }
+ else
+ {
+ fclose(out);
+ }
+}
+
+unsigned int file_is_ivf(FILE *infile,
+ unsigned int *fourcc,
+ unsigned int *width,
+ unsigned int *height,
+ unsigned int *fps_den,
+ unsigned int *fps_num)
+{
+ char raw_hdr[32];
+ int is_ivf = 0;
+
+ if (fread(raw_hdr, 1, 32, infile) == 32)
+ {
+ if (raw_hdr[0] == 'D' && raw_hdr[1] == 'K'
+ && raw_hdr[2] == 'I' && raw_hdr[3] == 'F')
+ {
+ is_ivf = 1;
+
+ if (mem_get_le16(raw_hdr + 4) != 0)
+ fprintf(stderr, "Error: Unrecognized IVF version! This file may not"
+ " decode properly.");
+
+ *fourcc = mem_get_le32(raw_hdr + 8);
+ *width = mem_get_le16(raw_hdr + 12);
+ *height = mem_get_le16(raw_hdr + 14);
+ *fps_num = mem_get_le32(raw_hdr + 16);
+ *fps_den = mem_get_le32(raw_hdr + 20);
+
+ /* Some versions of vpxenc used 1/(2*fps) for the timebase, so
+ * we can guess the framerate using only the timebase in this
+ * case. Other files would require reading ahead to guess the
+ * timebase, like we do for webm.
+ */
+ if(*fps_num < 1000)
+ {
+ /* Correct for the factor of 2 applied to the timebase in the
+ * encoder.
+ */
+ if(*fps_num&1)*fps_den<<=1;
+ else *fps_num>>=1;
+ }
+ else
+ {
+ /* Don't know FPS for sure, and don't have readahead code
+ * (yet?), so just default to 30fps.
+ */
+ *fps_num = 30;
+ *fps_den = 1;
+ }
+ }
+ }
+
+ if (!is_ivf)
+ rewind(infile);
+
+ return is_ivf;
+}
+
+
+unsigned int file_is_raw(FILE *infile,
+ unsigned int *fourcc,
+ unsigned int *width,
+ unsigned int *height,
+ unsigned int *fps_den,
+ unsigned int *fps_num)
+{
+ unsigned char buf[32];
+ int is_raw = 0;
+ vpx_codec_stream_info_t si;
+
+ if (fread(buf, 1, 32, infile) == 32)
+ {
+ int i;
+
+ if(mem_get_le32(buf) < 256 * 1024 * 1024)
+ for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
+ if(!vpx_codec_peek_stream_info(ifaces[i].iface,
+ buf + 4, 32 - 4, &si))
+ {
+ is_raw = 1;
+ *fourcc = ifaces[i].fourcc;
+ *width = si.w;
+ *height = si.h;
+ *fps_num = 30;
+ *fps_den = 1;
+ break;
+ }
+ }
+
+ rewind(infile);
+ return is_raw;
+}
+
+
+static int
+nestegg_read_cb(void *buffer, size_t length, void *userdata)
+{
+ FILE *f = userdata;
+
+ if(fread(buffer, 1, length, f) < length)
+ {
+ if (ferror(f))
+ return -1;
+ if (feof(f))
+ return 0;
+ }
+ return 1;
+}
+
+
+static int
+nestegg_seek_cb(int64_t offset, int whence, void * userdata)
+{
+ switch(whence) {
+ case NESTEGG_SEEK_SET: whence = SEEK_SET; break;
+ case NESTEGG_SEEK_CUR: whence = SEEK_CUR; break;
+ case NESTEGG_SEEK_END: whence = SEEK_END; break;
+ };
+ return fseek(userdata, offset, whence)? -1 : 0;
+}
+
+
+static int64_t
+nestegg_tell_cb(void * userdata)
+{
+ return ftell(userdata);
+}
+
+
+static void
+nestegg_log_cb(nestegg * context, unsigned int severity, char const * format,
+ ...)
+{
+ va_list ap;
+
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ fprintf(stderr, "\n");
+ va_end(ap);
+}
+
+
+static int
+webm_guess_framerate(struct input_ctx *input,
+ unsigned int *fps_den,
+ unsigned int *fps_num)
+{
+ unsigned int i;
+ uint64_t tstamp=0;
+
+ /* Guess the framerate. Read up to 1 second, or 50 video packets,
+ * whichever comes first.
+ */
+ for(i=0; tstamp < 1000000000 && i < 50;)
+ {
+ nestegg_packet * pkt;
+ unsigned int track;
+
+ if(nestegg_read_packet(input->nestegg_ctx, &pkt) <= 0)
+ break;
+
+ nestegg_packet_track(pkt, &track);
+ if(track == input->video_track)
+ {
+ nestegg_packet_tstamp(pkt, &tstamp);
+ i++;
+ }
+
+ nestegg_free_packet(pkt);
+ }
+
+ if(nestegg_track_seek(input->nestegg_ctx, input->video_track, 0))
+ goto fail;
+
+ *fps_num = (i - 1) * 1000000;
+ *fps_den = tstamp / 1000;
+ return 0;
+fail:
+ nestegg_destroy(input->nestegg_ctx);
+ input->nestegg_ctx = NULL;
+ rewind(input->infile);
+ return 1;
+}
+
+
+static int
+file_is_webm(struct input_ctx *input,
+ unsigned int *fourcc,
+ unsigned int *width,
+ unsigned int *height,
+ unsigned int *fps_den,
+ unsigned int *fps_num)
+{
+ unsigned int i, n;
+ int track_type = -1;
+ uint64_t tstamp=0;
+
+ nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb,
+ input->infile};
+ nestegg_video_params params;
+ nestegg_packet * pkt;
+
+ if(nestegg_init(&input->nestegg_ctx, io, NULL))
+ goto fail;
+
+ if(nestegg_track_count(input->nestegg_ctx, &n))
+ goto fail;
+
+ for(i=0; i<n; i++)
+ {
+ track_type = nestegg_track_type(input->nestegg_ctx, i);
+
+ if(track_type == NESTEGG_TRACK_VIDEO)
+ break;
+ else if(track_type < 0)
+ goto fail;
+ }
+
+ if(nestegg_track_codec_id(input->nestegg_ctx, i) != NESTEGG_CODEC_VP8)
+ {
+ fprintf(stderr, "Not VP8 video, quitting.\n");
+ exit(1);
+ }
+
+ input->video_track = i;
+
+ if(nestegg_track_video_params(input->nestegg_ctx, i, &params))
+ goto fail;
+
+ *fps_den = 0;
+ *fps_num = 0;
+ *fourcc = VP8_FOURCC;
+ *width = params.width;
+ *height = params.height;
+ return 1;
+fail:
+ input->nestegg_ctx = NULL;
+ rewind(input->infile);
+ return 0;
+}
+
+
+void show_progress(int frame_in, int frame_out, unsigned long dx_time)
+{
+ fprintf(stderr, "%d decoded frames/%d showed frames in %lu us (%.2f fps)\r",
+ frame_in, frame_out, dx_time,
+ (float)frame_out * 1000000.0 / (float)dx_time);
+}
+
+
+void generate_filename(const char *pattern, char *out, size_t q_len,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int frame_in)
+{
+ const char *p = pattern;
+ char *q = out;
+
+ do
+ {
+ char *next_pat = strchr(p, '%');
+
+ if(p == next_pat)
+ {
+ size_t pat_len;
+
+ // parse the pattern
+ q[q_len - 1] = '\0';
+ switch(p[1])
+ {
+ case 'w': snprintf(q, q_len - 1, "%d", d_w); break;
+ case 'h': snprintf(q, q_len - 1, "%d", d_h); break;
+ case '1': snprintf(q, q_len - 1, "%d", frame_in); break;
+ case '2': snprintf(q, q_len - 1, "%02d", frame_in); break;
+ case '3': snprintf(q, q_len - 1, "%03d", frame_in); break;
+ case '4': snprintf(q, q_len - 1, "%04d", frame_in); break;
+ case '5': snprintf(q, q_len - 1, "%05d", frame_in); break;
+ case '6': snprintf(q, q_len - 1, "%06d", frame_in); break;
+ case '7': snprintf(q, q_len - 1, "%07d", frame_in); break;
+ case '8': snprintf(q, q_len - 1, "%08d", frame_in); break;
+ case '9': snprintf(q, q_len - 1, "%09d", frame_in); break;
+ default:
+ die("Unrecognized pattern %%%c\n", p[1]);
+ }
+
+ pat_len = strlen(q);
+ if(pat_len >= q_len - 1)
+ die("Output filename too long.\n");
+ q += pat_len;
+ p += 2;
+ q_len -= pat_len;
+ }
+ else
+ {
+ size_t copy_len;
+
+ // copy the next segment
+ if(!next_pat)
+ copy_len = strlen(p);
+ else
+ copy_len = next_pat - p;
+
+ if(copy_len >= q_len - 1)
+ die("Output filename too long.\n");
+
+ memcpy(q, p, copy_len);
+ q[copy_len] = '\0';
+ q += copy_len;
+ p += copy_len;
+ q_len -= copy_len;
+ }
+ } while(*p);
+}
+
+
+int main(int argc, const char **argv_)
+{
+ vpx_codec_ctx_t decoder;
+ char *fn = NULL;
+ int i;
+ uint8_t *buf = NULL;
+ size_t buf_sz = 0, buf_alloc_sz = 0;
+ FILE *infile;
+ int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0, do_md5 = 0, progress = 0;
+ int stop_after = 0, postproc = 0, summary = 0, quiet = 1;
+ vpx_codec_iface_t *iface = NULL;
+ unsigned int fourcc;
+ unsigned long dx_time = 0;
+ struct arg arg;
+ char **argv, **argi, **argj;
+ const char *outfile_pattern = 0;
+ char outfile[PATH_MAX];
+ int single_file;
+ int use_y4m = 1;
+ unsigned int width;
+ unsigned int height;
+ unsigned int fps_den;
+ unsigned int fps_num;
+ void *out = NULL;
+ vpx_codec_dec_cfg_t cfg = {0};
+#if CONFIG_VP8_DECODER
+ vp8_postproc_cfg_t vp8_pp_cfg = {0};
+#endif
+ struct input_ctx input = {0};
+
+ /* Parse command line */
+ exec_name = argv_[0];
+ argv = argv_dup(argc - 1, argv_ + 1);
+
+ for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step)
+ {
+ memset(&arg, 0, sizeof(arg));
+ arg.argv_step = 1;
+
+ if (arg_match(&arg, &codecarg, argi))
+ {
+ int j, k = -1;
+
+ for (j = 0; j < sizeof(ifaces) / sizeof(ifaces[0]); j++)
+ if (!strcmp(ifaces[j].name, arg.val))
+ k = j;
+
+ if (k >= 0)
+ iface = ifaces[k].iface;
+ else
+ die("Error: Unrecognized argument (%s) to --codec\n",
+ arg.val);
+ }
+ else if (arg_match(&arg, &outputfile, argi))
+ outfile_pattern = arg.val;
+ else if (arg_match(&arg, &use_yv12, argi))
+ {
+ use_y4m = 0;
+ flipuv = 1;
+ }
+ else if (arg_match(&arg, &use_i420, argi))
+ {
+ use_y4m = 0;
+ flipuv = 0;
+ }
+ else if (arg_match(&arg, &flipuvarg, argi))
+ flipuv = 1;
+ else if (arg_match(&arg, &noblitarg, argi))
+ noblit = 1;
+ else if (arg_match(&arg, &progressarg, argi))
+ progress = 1;
+ else if (arg_match(&arg, &limitarg, argi))
+ stop_after = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &postprocarg, argi))
+ postproc = 1;
+ else if (arg_match(&arg, &md5arg, argi))
+ do_md5 = 1;
+ else if (arg_match(&arg, &summaryarg, argi))
+ summary = 1;
+ else if (arg_match(&arg, &threadsarg, argi))
+ cfg.threads = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &verbosearg, argi))
+ quiet = 0;
+
+#if CONFIG_VP8_DECODER
+ else if (arg_match(&arg, &addnoise_level, argi))
+ {
+ postproc = 1;
+ vp8_pp_cfg.post_proc_flag |= VP8_ADDNOISE;
+ vp8_pp_cfg.noise_level = arg_parse_uint(&arg);
+ }
+ else if (arg_match(&arg, &demacroblock_level, argi))
+ {
+ postproc = 1;
+ vp8_pp_cfg.post_proc_flag |= VP8_DEMACROBLOCK;
+ vp8_pp_cfg.deblocking_level = arg_parse_uint(&arg);
+ }
+ else if (arg_match(&arg, &deblock, argi))
+ {
+ postproc = 1;
+ vp8_pp_cfg.post_proc_flag |= VP8_DEBLOCK;
+ }
+ else if (arg_match(&arg, &pp_debug_info, argi))
+ {
+ unsigned int level = arg_parse_uint(&arg);
+
+ postproc = 1;
+ vp8_pp_cfg.post_proc_flag &= ~0x7;
+
+ if (level)
+ vp8_pp_cfg.post_proc_flag |= level;
+ }
+
+#endif
+ else
+ argj++;
+ }
+
+ /* Check for unrecognized options */
+ for (argi = argv; *argi; argi++)
+ if (argi[0][0] == '-' && strlen(argi[0]) > 1)
+ die("Error: Unrecognized option %s\n", *argi);
+
+ /* Handle non-option arguments */
+ fn = argv[0];
+
+ if (!fn)
+ usage_exit();
+
+ /* Open file */
+ infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
+
+ if (!infile)
+ {
+ fprintf(stderr, "Failed to open file '%s'",
+ strcmp(fn, "-") ? fn : "stdin");
+ return EXIT_FAILURE;
+ }
+
+ /* Make sure we don't dump to the terminal, unless forced to with -o - */
+ if(!outfile_pattern && isatty(fileno(stdout)) && !do_md5 && !noblit)
+ {
+ fprintf(stderr,
+ "Not dumping raw video to your terminal. Use '-o -' to "
+ "override.\n");
+ return EXIT_FAILURE;
+ }
+
+ input.infile = infile;
+ if(file_is_ivf(infile, &fourcc, &width, &height, &fps_den,
+ &fps_num))
+ input.kind = IVF_FILE;
+ else if(file_is_webm(&input, &fourcc, &width, &height, &fps_den, &fps_num))
+ input.kind = WEBM_FILE;
+ else if(file_is_raw(infile, &fourcc, &width, &height, &fps_den, &fps_num))
+ input.kind = RAW_FILE;
+ else
+ {
+ fprintf(stderr, "Unrecognized input file type.\n");
+ return EXIT_FAILURE;
+ }
+
+ /* If the output file is not set or doesn't have a sequence number in
+ * it, then we only open it once.
+ */
+ outfile_pattern = outfile_pattern ? outfile_pattern : "-";
+ single_file = 1;
+ {
+ const char *p = outfile_pattern;
+ do
+ {
+ p = strchr(p, '%');
+ if(p && p[1] >= '1' && p[1] <= '9')
+ {
+ // pattern contains sequence number, so it's not unique.
+ single_file = 0;
+ break;
+ }
+ if(p)
+ p++;
+ } while(p);
+ }
+
+ if(single_file && !noblit)
+ {
+ generate_filename(outfile_pattern, outfile, sizeof(outfile)-1,
+ width, height, 0);
+ out = out_open(outfile, do_md5);
+ }
+
+ if (use_y4m && !noblit)
+ {
+ char buffer[128];
+ if (!single_file)
+ {
+ fprintf(stderr, "YUV4MPEG2 not supported with output patterns,"
+ " try --i420 or --yv12.\n");
+ return EXIT_FAILURE;
+ }
+
+ if(input.kind == WEBM_FILE)
+ if(webm_guess_framerate(&input, &fps_den, &fps_num))
+ {
+ fprintf(stderr, "Failed to guess framerate -- error parsing "
+ "webm file?\n");
+ return EXIT_FAILURE;
+ }
+
+
+ /*Note: We can't output an aspect ratio here because IVF doesn't
+ store one, and neither does VP8.
+ That will have to wait until these tools support WebM natively.*/
+ sprintf(buffer, "YUV4MPEG2 C%s W%u H%u F%u:%u I%c\n",
+ "420jpeg", width, height, fps_num, fps_den, 'p');
+ out_put(out, (unsigned char *)buffer, strlen(buffer), do_md5);
+ }
+
+ /* Try to determine the codec from the fourcc. */
+ for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
+ if ((fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc)
+ {
+ vpx_codec_iface_t *ivf_iface = ifaces[i].iface;
+
+ if (iface && iface != ivf_iface)
+ fprintf(stderr, "Notice -- IVF header indicates codec: %s\n",
+ ifaces[i].name);
+ else
+ iface = ivf_iface;
+
+ break;
+ }
+
+ if (vpx_codec_dec_init(&decoder, iface ? iface : ifaces[0].iface, &cfg,
+ postproc ? VPX_CODEC_USE_POSTPROC : 0))
+ {
+ fprintf(stderr, "Failed to initialize decoder: %s\n", vpx_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
+
+ if (!quiet)
+ fprintf(stderr, "%s\n", decoder.name);
+
+#if CONFIG_VP8_DECODER
+
+ if (vp8_pp_cfg.post_proc_flag
+ && vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg))
+ {
+ fprintf(stderr, "Failed to configure postproc: %s\n", vpx_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
+
+#endif
+
+ /* Decode file */
+ while (!read_frame(&input, &buf, &buf_sz, &buf_alloc_sz))
+ {
+ vpx_codec_iter_t iter = NULL;
+ vpx_image_t *img;
+ struct vpx_usec_timer timer;
+
+ vpx_usec_timer_start(&timer);
+
+ if (vpx_codec_decode(&decoder, buf, buf_sz, NULL, 0))
+ {
+ const char *detail = vpx_codec_error_detail(&decoder);
+ fprintf(stderr, "Failed to decode frame: %s\n", vpx_codec_error(&decoder));
+
+ if (detail)
+ fprintf(stderr, " Additional information: %s\n", detail);
+
+ goto fail;
+ }
+
+ vpx_usec_timer_mark(&timer);
+ dx_time += vpx_usec_timer_elapsed(&timer);
+
+ ++frame_in;
+
+ if ((img = vpx_codec_get_frame(&decoder, &iter)))
+ ++frame_out;
+
+ if (progress)
+ show_progress(frame_in, frame_out, dx_time);
+
+ if (!noblit)
+ {
+ if (img)
+ {
+ unsigned int y;
+ char out_fn[PATH_MAX];
+ uint8_t *buf;
+
+ if (!single_file)
+ {
+ size_t len = sizeof(out_fn)-1;
+
+ out_fn[len] = '\0';
+ generate_filename(outfile_pattern, out_fn, len-1,
+ img->d_w, img->d_h, frame_in);
+ out = out_open(out_fn, do_md5);
+ }
+ else if(use_y4m)
+ out_put(out, (unsigned char *)"FRAME\n", 6, do_md5);
+
+ buf = img->planes[VPX_PLANE_Y];
+
+ for (y = 0; y < img->d_h; y++)
+ {
+ out_put(out, buf, img->d_w, do_md5);
+ buf += img->stride[VPX_PLANE_Y];
+ }
+
+ buf = img->planes[flipuv?VPX_PLANE_V:VPX_PLANE_U];
+
+ for (y = 0; y < (1 + img->d_h) / 2; y++)
+ {
+ out_put(out, buf, (1 + img->d_w) / 2, do_md5);
+ buf += img->stride[VPX_PLANE_U];
+ }
+
+ buf = img->planes[flipuv?VPX_PLANE_U:VPX_PLANE_V];
+
+ for (y = 0; y < (1 + img->d_h) / 2; y++)
+ {
+ out_put(out, buf, (1 + img->d_w) / 2, do_md5);
+ buf += img->stride[VPX_PLANE_V];
+ }
+
+ if (!single_file)
+ out_close(out, out_fn, do_md5);
+ }
+ }
+
+ if (stop_after && frame_in >= stop_after)
+ break;
+ }
+
+ if (summary || progress)
+ {
+ show_progress(frame_in, frame_out, dx_time);
+ fprintf(stderr, "\n");
+ }
+
+fail:
+
+ if (vpx_codec_destroy(&decoder))
+ {
+ fprintf(stderr, "Failed to destroy decoder: %s\n", vpx_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
+
+ if (single_file && !noblit)
+ out_close(out, outfile, do_md5);
+
+ if(input.nestegg_ctx)
+ nestegg_destroy(input.nestegg_ctx);
+ if(input.kind != WEBM_FILE)
+ free(buf);
+ fclose(infile);
+ free(argv);
+
+ return EXIT_SUCCESS;
+}
diff --git a/ivfenc.c b/vpxenc.c
index ad172927b..032e2e8d8 100644
--- a/ivfenc.c
+++ b/vpxenc.c
@@ -22,6 +22,7 @@
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
+#include <limits.h>
#include "vpx/vpx_encoder.h"
#if USE_POSIX_MMAP
#include <sys/types.h>
@@ -30,10 +31,32 @@
#include <fcntl.h>
#include <unistd.h>
#endif
+#include "vpx_version.h"
#include "vpx/vp8cx.h"
#include "vpx_ports/mem_ops.h"
#include "vpx_ports/vpx_timer.h"
+#include "tools_common.h"
#include "y4minput.h"
+#include "libmkv/EbmlWriter.h"
+#include "libmkv/EbmlIDs.h"
+
+/* Need special handling of these functions on Windows */
+#if defined(_MSC_VER)
+/* MSVS doesn't define off_t, and uses _f{seek,tell}i64 */
+typedef __int64 off_t;
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#elif defined(_WIN32)
+/* MinGW defines off_t, and uses f{seek,tell}o64 */
+#define fseeko fseeko64
+#define ftello ftello64
+#endif
+
+#if defined(_MSC_VER)
+#define LITERALU64(n) n
+#else
+#define LITERALU64(n) n##LLU
+#endif
static const char *exec_name;
@@ -191,7 +214,7 @@ void stats_write(stats_io_t *stats, const void *pkt, size_t len)
{
if (stats->file)
{
- fwrite(pkt, 1, len, stats->file);
+ if(fwrite(pkt, 1, len, stats->file));
}
else
{
@@ -237,10 +260,11 @@ static int read_frame(FILE *f, vpx_image_t *img, unsigned int file_type,
y4m_input *y4m, struct detect_buffer *detect)
{
int plane = 0;
+ int shortread = 0;
if (file_type == FILE_TYPE_Y4M)
{
- if (y4m_input_fetch_frame(y4m, f, img) < 0)
+ if (y4m_input_fetch_frame(y4m, f, img) < 1)
return 0;
}
else
@@ -253,7 +277,7 @@ static int read_frame(FILE *f, vpx_image_t *img, unsigned int file_type,
* write_ivf_frame_header() for documentation on the frame header
* layout.
*/
- fread(junk, 1, IVF_FRAME_HDR_SZ, f);
+ if(fread(junk, 1, IVF_FRAME_HDR_SZ, f));
}
for (plane = 0; plane < 3; plane++)
@@ -284,18 +308,18 @@ static int read_frame(FILE *f, vpx_image_t *img, unsigned int file_type,
if (detect->valid)
{
memcpy(ptr, detect->buf, 4);
- fread(ptr+4, 1, w-4, f);
+ shortread |= fread(ptr+4, 1, w-4, f) < w-4;
detect->valid = 0;
}
else
- fread(ptr, 1, w, f);
+ shortread |= fread(ptr, 1, w, f) < w;
ptr += img->stride[plane];
}
}
}
- return !feof(f);
+ return !shortread;
}
@@ -374,7 +398,7 @@ static void write_ivf_file_header(FILE *outfile,
mem_put_le32(header + 24, frame_cnt); /* length */
mem_put_le32(header + 28, 0); /* unused */
- fwrite(header, 1, 32, outfile);
+ if(fwrite(header, 1, 32, outfile));
}
@@ -392,11 +416,426 @@ static void write_ivf_frame_header(FILE *outfile,
mem_put_le32(header + 4, pts & 0xFFFFFFFF);
mem_put_le32(header + 8, pts >> 32);
- fwrite(header, 1, 12, outfile);
+ if(fwrite(header, 1, 12, outfile));
+}
+
+
/* Stream positions are tracked as plain file offsets. */
typedef off_t EbmlLoc;


/* One WebM cue point: maps a cluster timecode to its byte position so a
 * player can seek directly to a keyframe cluster.
 */
struct cue_entry
{
    unsigned int time;  /* cluster timecode, in milliseconds */
    uint64_t loc;       /* absolute file offset of the cluster */
};
+
+
/* All state needed to mux a WebM (Matroska subset) file. Element offsets are
 * recorded while writing so that size fields, the seek index, and the track
 * UID can be patched in place afterwards — this requires a seekable output
 * stream (pipes are rejected by the caller).
 */
struct EbmlGlobal
{
    int debug;  /* nonzero: omit non-deterministic fields (version, track UID) */

    FILE *stream;
    uint64_t last_pts_ms;       /* PTS of the most recently written block, ms */
    vpx_rational_t framerate;

    /* These pointers are to the start of an element */
    off_t position_reference;   /* start of Segment payload; SeekPosition/CueClusterPosition are relative to this */
    off_t seek_info_pos;        /* where the SeekHead lives, rewritten by the footer */
    off_t segment_info_pos;
    off_t track_pos;
    off_t cue_pos;
    off_t cluster_pos;

    /* This pointer is to a specific element to be serialized */
    off_t track_id_pos;         /* TrackUID field, patched with the frame hash in the footer */

    /* These pointers are to the size field of the element */
    EbmlLoc startSegment;
    EbmlLoc startCluster;

    uint32_t cluster_timecode;  /* timecode of the currently open cluster, ms */
    int cluster_open;           /* nonzero while a Cluster element is unterminated */

    struct cue_entry *cue_list; /* growable array, one entry per keyframe cluster */
    unsigned int cues;          /* number of valid entries in cue_list */

};
+
+
+void Ebml_Write(EbmlGlobal *glob, const void *buffer_in, unsigned long len)
+{
+ if(fwrite(buffer_in, 1, len, glob->stream));
+}
+
+
+void Ebml_Serialize(EbmlGlobal *glob, const void *buffer_in, unsigned long len)
+{
+ const unsigned char *q = (const unsigned char *)buffer_in + len - 1;
+
+ for(; len; len--)
+ Ebml_Write(glob, q--, 1);
+}
+
+
+/* Need a fixed size serializer for the track ID. libmkv provdes a 64 bit
+ * one, but not a 32 bit one.
+ */
+static void Ebml_SerializeUnsigned32(EbmlGlobal *glob, unsigned long class_id, uint64_t ui)
+{
+ unsigned char sizeSerialized = 4 | 0x80;
+ Ebml_WriteID(glob, class_id);
+ Ebml_Serialize(glob, &sizeSerialized, 1);
+ Ebml_Serialize(glob, &ui, 4);
+}
+
+
/* Begin a master element whose final size is not yet known: write the
 * element ID followed by an 8-byte "length unknown" size field, and record
 * the size field's position in *ebmlLoc so Ebml_EndSubElement() can patch
 * it once the element's contents are complete.
 */
static void
Ebml_StartSubElement(EbmlGlobal *glob, EbmlLoc *ebmlLoc,
                     unsigned long class_id)
{
    //todo this is always taking 8 bytes, this may need later optimization
    //this is a key that says length unknown (8-byte descriptor, all data bits set)
    unsigned long long unknownLen = LITERALU64(0x01FFFFFFFFFFFFFF);

    Ebml_WriteID(glob, class_id);
    *ebmlLoc = ftello(glob->stream);
    Ebml_Serialize(glob, &unknownLen, 8);
}
+
/* Close a master element opened by Ebml_StartSubElement(): compute the
 * payload size, seek back to the recorded size field, overwrite it with the
 * real length, and restore the stream position.
 */
static void
Ebml_EndSubElement(EbmlGlobal *glob, EbmlLoc *ebmlLoc)
{
    off_t pos;
    uint64_t size;

    /* Save the current stream pointer */
    pos = ftello(glob->stream);

    /* Calculate the size of this element (minus the 8-byte size field itself) */
    size = pos - *ebmlLoc - 8;
    /* Set the descriptor bit marking this as an 8-byte EBML length */
    size |= LITERALU64(0x0100000000000000);

    /* Seek back to the beginning of the element and write the new size */
    fseeko(glob->stream, *ebmlLoc, SEEK_SET);
    Ebml_Serialize(glob, &size, 8);

    /* Reset the stream pointer */
    fseeko(glob->stream, pos, SEEK_SET);
}
+
+static void
+write_webm_seek_element(EbmlGlobal *ebml, unsigned long id, off_t pos)
+{
+ uint64_t offset = pos - ebml->position_reference;
+ EbmlLoc start;
+ Ebml_StartSubElement(ebml, &start, Seek);
+ Ebml_SerializeBinary(ebml, SeekID, id);
+ Ebml_SerializeUnsigned64(ebml, SeekPosition, offset);
+ Ebml_EndSubElement(ebml, &start);
+}
+
+
/* Write (or rewrite) the SeekHead index and the segment Info element.
 *
 * Called twice: once while writing the header, when the Tracks/Cues/Info
 * positions are still zero, and again from the footer to patch in the final
 * values. If seek_info_pos is already set we seek back and overwrite the
 * previous copy in place; otherwise we record the current position as the
 * spot to patch later. Both SeekHead variants must serialize to the same
 * size for the overwrite to line up.
 */
static void
write_webm_seek_info(EbmlGlobal *ebml)
{

    off_t pos;

    /* Save the current stream pointer */
    pos = ftello(ebml->stream);

    if(ebml->seek_info_pos)
        fseeko(ebml->stream, ebml->seek_info_pos, SEEK_SET);
    else
        ebml->seek_info_pos = pos;

    {
        EbmlLoc start;

        Ebml_StartSubElement(ebml, &start, SeekHead);
        write_webm_seek_element(ebml, Tracks, ebml->track_pos);
        write_webm_seek_element(ebml, Cues, ebml->cue_pos);
        write_webm_seek_element(ebml, Info, ebml->segment_info_pos);
        Ebml_EndSubElement(ebml, &start);
    }
    {
        //segment info
        EbmlLoc startInfo;
        uint64_t frame_time;

        /* Duration of one frame in ms; added to the last PTS so the
         * Segment_Duration covers the final frame too. */
        frame_time = (uint64_t)1000 * ebml->framerate.den
                     / ebml->framerate.num;
        ebml->segment_info_pos = ftello(ebml->stream);
        Ebml_StartSubElement(ebml, &startInfo, Info);
        Ebml_SerializeUnsigned(ebml, TimecodeScale, 1000000);
        Ebml_SerializeFloat(ebml, Segment_Duration,
                            ebml->last_pts_ms + frame_time);
        /* 0x4D80 = MuxingApp, 0x5741 = WritingApp. The version string is
         * omitted in debug mode to keep the output deterministic. */
        Ebml_SerializeString(ebml, 0x4D80,
                             ebml->debug ? "vpxenc" : "vpxenc" VERSION_STRING);
        Ebml_SerializeString(ebml, 0x5741,
                             ebml->debug ? "vpxenc" : "vpxenc" VERSION_STRING);
        Ebml_EndSubElement(ebml, &startInfo);
    }
}
+
+
/* Write the EBML header and open the Segment: seek index placeholder,
 * segment Info, and the single video TrackEntry.
 *
 * NOTE: the Segment master element is deliberately left open (its EbmlLoc
 * is kept in glob->startSegment); write_webm_file_footer() closes it and
 * patches the placeholders written here.
 */
static void
write_webm_file_header(EbmlGlobal *glob,
                       const vpx_codec_enc_cfg_t *cfg,
                       const struct vpx_rational *fps)
{
    {
        EbmlLoc start;
        Ebml_StartSubElement(glob, &start, EBML);
        Ebml_SerializeUnsigned(glob, EBMLVersion, 1);
        Ebml_SerializeUnsigned(glob, EBMLReadVersion, 1); //EBML Read Version
        Ebml_SerializeUnsigned(glob, EBMLMaxIDLength, 4); //EBML Max ID Length
        Ebml_SerializeUnsigned(glob, EBMLMaxSizeLength, 8); //EBML Max Size Length
        Ebml_SerializeString(glob, DocType, "webm"); //Doc Type
        Ebml_SerializeUnsigned(glob, DocTypeVersion, 2); //Doc Type Version
        Ebml_SerializeUnsigned(glob, DocTypeReadVersion, 2); //Doc Type Read Version
        Ebml_EndSubElement(glob, &start);
    }
    {
        Ebml_StartSubElement(glob, &glob->startSegment, Segment); //segment
        /* All seek/cue offsets are relative to this point */
        glob->position_reference = ftello(glob->stream);
        glob->framerate = *fps;
        /* First pass over the seek info: positions are placeholders,
         * rewritten in place by the footer. */
        write_webm_seek_info(glob);

        {
            EbmlLoc trackStart;
            glob->track_pos = ftello(glob->stream);
            Ebml_StartSubElement(glob, &trackStart, Tracks);
            {
                unsigned int trackNumber = 1;
                /* Placeholder; the footer patches in the frame-data hash. */
                uint64_t trackID = 0;

                EbmlLoc start;
                Ebml_StartSubElement(glob, &start, TrackEntry);
                Ebml_SerializeUnsigned(glob, TrackNumber, trackNumber);
                glob->track_id_pos = ftello(glob->stream);
                Ebml_SerializeUnsigned32(glob, TrackUID, trackID);
                Ebml_SerializeUnsigned(glob, TrackType, 1); //video is always 1
                Ebml_SerializeString(glob, CodecID, "V_VP8");
                {
                    unsigned int pixelWidth = cfg->g_w;
                    unsigned int pixelHeight = cfg->g_h;
                    float frameRate = (float)fps->num/(float)fps->den;

                    EbmlLoc videoStart;
                    Ebml_StartSubElement(glob, &videoStart, Video);
                    Ebml_SerializeUnsigned(glob, PixelWidth, pixelWidth);
                    Ebml_SerializeUnsigned(glob, PixelHeight, pixelHeight);
                    Ebml_SerializeFloat(glob, FrameRate, frameRate);
                    Ebml_EndSubElement(glob, &videoStart); //Video
                }
                Ebml_EndSubElement(glob, &start); //Track Entry
            }
            Ebml_EndSubElement(glob, &trackStart);
        }
        // segment element is open
    }
}
+
+
+static void
+write_webm_block(EbmlGlobal *glob,
+ const vpx_codec_enc_cfg_t *cfg,
+ const vpx_codec_cx_pkt_t *pkt)
+{
+ unsigned long block_length;
+ unsigned char track_number;
+ unsigned short block_timecode = 0;
+ unsigned char flags;
+ uint64_t pts_ms;
+ int start_cluster = 0, is_keyframe;
+
+ /* Calculate the PTS of this frame in milliseconds */
+ pts_ms = pkt->data.frame.pts * 1000
+ * (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den;
+ if(pts_ms <= glob->last_pts_ms)
+ pts_ms = glob->last_pts_ms + 1;
+ glob->last_pts_ms = pts_ms;
+
+ /* Calculate the relative time of this block */
+ if(pts_ms - glob->cluster_timecode > SHRT_MAX)
+ start_cluster = 1;
+ else
+ block_timecode = pts_ms - glob->cluster_timecode;
+
+ is_keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY);
+ if(start_cluster || is_keyframe)
+ {
+ if(glob->cluster_open)
+ Ebml_EndSubElement(glob, &glob->startCluster);
+
+ /* Open the new cluster */
+ block_timecode = 0;
+ glob->cluster_open = 1;
+ glob->cluster_timecode = pts_ms;
+ glob->cluster_pos = ftello(glob->stream);
+ Ebml_StartSubElement(glob, &glob->startCluster, Cluster); //cluster
+ Ebml_SerializeUnsigned(glob, Timecode, glob->cluster_timecode);
+
+ /* Save a cue point if this is a keyframe. */
+ if(is_keyframe)
+ {
+ struct cue_entry *cue;
+
+ glob->cue_list = realloc(glob->cue_list,
+ (glob->cues+1) * sizeof(struct cue_entry));
+ cue = &glob->cue_list[glob->cues];
+ cue->time = glob->cluster_timecode;
+ cue->loc = glob->cluster_pos;
+ glob->cues++;
+ }
+ }
+
+ /* Write the Simple Block */
+ Ebml_WriteID(glob, SimpleBlock);
+
+ block_length = pkt->data.frame.sz + 4;
+ block_length |= 0x10000000;
+ Ebml_Serialize(glob, &block_length, 4);
+
+ track_number = 1;
+ track_number |= 0x80;
+ Ebml_Write(glob, &track_number, 1);
+
+ Ebml_Serialize(glob, &block_timecode, 2);
+
+ flags = 0;
+ if(is_keyframe)
+ flags |= 0x80;
+ if(pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE)
+ flags |= 0x08;
+ Ebml_Write(glob, &flags, 1);
+
+ Ebml_Write(glob, pkt->data.frame.buf, pkt->data.frame.sz);
+}
+
+
+static void
+write_webm_file_footer(EbmlGlobal *glob, long hash)
+{
+
+ if(glob->cluster_open)
+ Ebml_EndSubElement(glob, &glob->startCluster);
+
+ {
+ EbmlLoc start;
+ int i;
+
+ glob->cue_pos = ftello(glob->stream);
+ Ebml_StartSubElement(glob, &start, Cues);
+ for(i=0; i<glob->cues; i++)
+ {
+ struct cue_entry *cue = &glob->cue_list[i];
+ EbmlLoc start;
+
+ Ebml_StartSubElement(glob, &start, CuePoint);
+ {
+ EbmlLoc start;
+
+ Ebml_SerializeUnsigned(glob, CueTime, cue->time);
+
+ Ebml_StartSubElement(glob, &start, CueTrackPositions);
+ Ebml_SerializeUnsigned(glob, CueTrack, 1);
+ Ebml_SerializeUnsigned64(glob, CueClusterPosition,
+ cue->loc - glob->position_reference);
+ //Ebml_SerializeUnsigned(glob, CueBlockNumber, cue->blockNumber);
+ Ebml_EndSubElement(glob, &start);
+ }
+ Ebml_EndSubElement(glob, &start);
+ }
+ Ebml_EndSubElement(glob, &start);
+ }
+
+ Ebml_EndSubElement(glob, &glob->startSegment);
+
+ /* Patch up the seek info block */
+ write_webm_seek_info(glob);
+
+ /* Patch up the track id */
+ fseeko(glob->stream, glob->track_id_pos, SEEK_SET);
+ Ebml_SerializeUnsigned32(glob, TrackUID, glob->debug ? 0xDEADBEEF : hash);
+
+ fseeko(glob->stream, 0, SEEK_END);
+}
+
+
+/* Murmur hash derived from public domain reference implementation at
+ * http://sites.google.com/site/murmurhash/
+ */
/* Murmur hash derived from public domain reference implementation at
 * http://sites.google.com/site/murmurhash/
 * (MurmurHash2: mixes the input four bytes at a time, then folds in the
 * 0-3 byte tail and applies a final avalanche.)
 */
static unsigned int murmur ( const void * key, int len, unsigned int seed )
{
    const unsigned int mult = 0x5bd1e995;
    const int shift = 24;
    const unsigned char *p = (const unsigned char *)key;
    unsigned int hash = seed ^ len;

    /* Body: consume aligned 4-byte chunks, assembled little-endian. */
    while (len >= 4)
    {
        unsigned int chunk = (unsigned int)p[0]
                             | ((unsigned int)p[1] << 8)
                             | ((unsigned int)p[2] << 16)
                             | ((unsigned int)p[3] << 24);

        chunk *= mult;
        chunk ^= chunk >> shift;
        chunk *= mult;

        hash *= mult;
        hash ^= chunk;

        p += 4;
        len -= 4;
    }

    /* Tail: fold in the remaining 0-3 bytes (the final multiply only
     * happens when at least one tail byte exists, matching the reference
     * switch-with-fallthrough). */
    if (len > 2)
        hash ^= (unsigned int)p[2] << 16;
    if (len > 1)
        hash ^= (unsigned int)p[1] << 8;
    if (len > 0)
    {
        hash ^= p[0];
        hash *= mult;
    }

    /* Final avalanche */
    hash ^= hash >> 13;
    hash *= mult;
    hash ^= hash >> 15;

    return hash;
}
+
+#include "math.h"
+
+static double vp8_mse2psnr(double Samples, double Peak, double Mse)
+{
+ double psnr;
+
+ if ((double)Mse > 0.0)
+ psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
+ else
+ psnr = 60; // Limit to prevent / 0
+
+ if (psnr > 60)
+ psnr = 60;
+
+ return psnr;
+}
+
+
#include "args.h"
+static const arg_def_t debugmode = ARG_DEF("D", "debug", 0,
+ "Debug mode (makes output deterministic)");
+static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
+ "Output filename");
static const arg_def_t use_yv12 = ARG_DEF(NULL, "yv12", 0,
"Input file is YV12 ");
static const arg_def_t use_i420 = ARG_DEF(NULL, "i420", 0,
@@ -423,10 +862,16 @@ static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0,
"Show encoder parameters");
static const arg_def_t psnrarg = ARG_DEF(NULL, "psnr", 0,
"Show PSNR in status line");
+static const arg_def_t framerate = ARG_DEF(NULL, "fps", 1,
+ "Stream frame rate (rate/scale)");
+static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0,
+ "Output IVF (default is WebM)");
static const arg_def_t *main_args[] =
{
- &codecarg, &passes, &pass_arg, &fpf_name, &limit, &deadline, &best_dl, &good_dl, &rt_dl,
- &verbosearg, &psnrarg,
+ &debugmode,
+ &outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &deadline,
+ &best_dl, &good_dl, &rt_dl,
+ &verbosearg, &psnrarg, &use_ivf, &framerate,
NULL
};
@@ -450,7 +895,7 @@ static const arg_def_t lag_in_frames = ARG_DEF(NULL, "lag-in-frames", 1,
static const arg_def_t *global_args[] =
{
&use_yv12, &use_i420, &usage, &threads, &profile,
- &width, &height, &timebase, &error_resilient,
+ &width, &height, &timebase, &framerate, &error_resilient,
&lag_in_frames, NULL
};
@@ -534,11 +979,11 @@ static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,
static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
"Enable automatic alt reference frames");
static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
- "alt_ref Max Frames");
+ "AltRef Max Frames");
static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1,
- "alt_ref Strength");
+ "AltRef Strength");
static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1,
- "alt_ref Type");
+ "AltRef Type");
static const arg_def_t *vp8_args[] =
{
@@ -560,20 +1005,21 @@ static void usage_exit()
{
int i;
- fprintf(stderr, "Usage: %s <options> src_filename dst_filename\n", exec_name);
+ fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n",
+ exec_name);
- fprintf(stderr, "\n_options:\n");
+ fprintf(stderr, "\nOptions:\n");
arg_show_usage(stdout, main_args);
- fprintf(stderr, "\n_encoder Global Options:\n");
+ fprintf(stderr, "\nEncoder Global Options:\n");
arg_show_usage(stdout, global_args);
- fprintf(stderr, "\n_rate Control Options:\n");
+ fprintf(stderr, "\nRate Control Options:\n");
arg_show_usage(stdout, rc_args);
- fprintf(stderr, "\n_twopass Rate Control Options:\n");
+ fprintf(stderr, "\nTwopass Rate Control Options:\n");
arg_show_usage(stdout, rc_twopass_args);
- fprintf(stderr, "\n_keyframe Placement Options:\n");
+ fprintf(stderr, "\nKeyframe Placement Options:\n");
arg_show_usage(stdout, kf_args);
#if CONFIG_VP8_ENCODER
- fprintf(stderr, "\n_vp8 Specific Options:\n");
+ fprintf(stderr, "\nVP8 Specific Options:\n");
arg_show_usage(stdout, vp8_args);
#endif
fprintf(stderr, "\n"
@@ -614,10 +1060,18 @@ int main(int argc, const char **argv_)
static const int *ctrl_args_map = NULL;
int verbose = 0, show_psnr = 0;
int arg_use_i420 = 1;
- int arg_have_timebase = 0;
unsigned long cx_time = 0;
unsigned int file_type, fourcc;
y4m_input y4m;
+ struct vpx_rational arg_framerate = {30, 1};
+ int arg_have_framerate = 0;
+ int write_webm = 1;
+ EbmlGlobal ebml = {0};
+ uint32_t hash = 0;
+ uint64_t psnr_sse_total = 0;
+ uint64_t psnr_samples_total = 0;
+ double psnr_totals[4] = {0, 0, 0, 0};
+ int psnr_count = 0;
exec_name = argv_[0];
@@ -689,6 +1143,17 @@ int main(int argc, const char **argv_)
arg_limit = arg_parse_uint(&arg);
else if (arg_match(&arg, &psnrarg, argi))
show_psnr = 1;
+ else if (arg_match(&arg, &framerate, argi))
+ {
+ arg_framerate = arg_parse_rational(&arg);
+ arg_have_framerate = 1;
+ }
+ else if (arg_match(&arg, &use_ivf, argi))
+ write_webm = 0;
+ else if (arg_match(&arg, &outputfile, argi))
+ out_fn = arg.val;
+ else if (arg_match(&arg, &debugmode, argi))
+ ebml.debug = 1;
else
argj++;
}
@@ -720,6 +1185,17 @@ int main(int argc, const char **argv_)
return EXIT_FAILURE;
}
+ /* Change the default timebase to a high enough value so that the encoder
+ * will always create strictly increasing timestamps.
+ */
+ cfg.g_timebase.den = 100000;
+
+ /* Never use the library's default resolution, require it be parsed
+ * from the file or set on the command line.
+ */
+ cfg.g_w = 0;
+ cfg.g_h = 0;
+
/* Now parse the remainder of the parameters. */
for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step)
{
@@ -735,10 +1211,7 @@ int main(int argc, const char **argv_)
else if (arg_match(&arg, &height, argi))
cfg.g_h = arg_parse_uint(&arg);
else if (arg_match(&arg, &timebase, argi))
- {
cfg.g_timebase = arg_parse_rational(&arg);
- arg_have_timebase = 1;
- }
else if (arg_match(&arg, &error_resilient, argi))
cfg.g_error_resilient = arg_parse_uint(&arg);
else if (arg_match(&arg, &lag_in_frames, argi))
@@ -851,21 +1324,25 @@ int main(int argc, const char **argv_)
/* Handle non-option arguments */
in_fn = argv[0];
- out_fn = argv[1];
- if (!in_fn || !out_fn)
+ if (!in_fn)
usage_exit();
+ if(!out_fn)
+ die("Error: Output file is required (specify with -o)\n");
+
memset(&stats, 0, sizeof(stats));
for (pass = one_pass_only ? one_pass_only - 1 : 0; pass < arg_passes; pass++)
{
int frames_in = 0, frames_out = 0;
unsigned long nbytes = 0;
+ size_t detect_bytes;
struct detect_buffer detect;
/* Parse certain options from the input file, if possible */
- infile = strcmp(in_fn, "-") ? fopen(in_fn, "rb") : stdin;
+ infile = strcmp(in_fn, "-") ? fopen(in_fn, "rb")
+ : set_binary_mode(stdin);
if (!infile)
{
@@ -873,26 +1350,31 @@ int main(int argc, const char **argv_)
return EXIT_FAILURE;
}
- fread(detect.buf, 1, 4, infile);
+ /* For RAW input sources, these bytes will applied on the first frame
+ * in read_frame().
+ * We can always read 4 bytes because the minimum supported frame size
+ * is 2x2.
+ */
+ detect_bytes = fread(detect.buf, 1, 4, infile);
detect.valid = 0;
- if (file_is_y4m(infile, &y4m, detect.buf))
+ if (detect_bytes == 4 && file_is_y4m(infile, &y4m, detect.buf))
{
if (y4m_input_open(&y4m, infile, detect.buf, 4) >= 0)
{
file_type = FILE_TYPE_Y4M;
cfg.g_w = y4m.pic_w;
cfg.g_h = y4m.pic_h;
+
/* Use the frame rate from the file only if none was specified
* on the command-line.
*/
- if (!arg_have_timebase)
+ if (!arg_have_framerate)
{
- cfg.g_timebase.num = y4m.fps_d;
- cfg.g_timebase.den = y4m.fps_n;
- /* And don't reset it in the second pass.*/
- arg_have_timebase = 1;
+ arg_framerate.num = y4m.fps_n;
+ arg_framerate.den = y4m.fps_d;
}
+
arg_use_i420 = 0;
}
else
@@ -901,7 +1383,8 @@ int main(int argc, const char **argv_)
return EXIT_FAILURE;
}
}
- else if (file_is_ivf(infile, &fourcc, &cfg.g_w, &cfg.g_h, detect.buf))
+ else if (detect_bytes == 4 &&
+ file_is_ivf(infile, &fourcc, &cfg.g_w, &cfg.g_h, detect.buf))
{
file_type = FILE_TYPE_IVF;
switch (fourcc)
@@ -922,6 +1405,14 @@ int main(int argc, const char **argv_)
file_type = FILE_TYPE_RAW;
detect.valid = 1;
}
+
+ if(!cfg.g_w || !cfg.g_h)
+ {
+ fprintf(stderr, "Specify stream dimensions with --width (-w) "
+ " and --height (-h).\n");
+ return EXIT_FAILURE;
+ }
+
#define SHOW(field) fprintf(stderr, " %-28s = %d\n", #field, cfg.field)
if (verbose && pass == 0)
@@ -972,16 +1463,10 @@ int main(int argc, const char **argv_)
else
vpx_img_alloc(&raw, arg_use_i420 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_YV12,
cfg.g_w, cfg.g_h, 1);
-
- // This was added so that ivfenc will create monotically increasing
- // timestamps. Since we create new timestamps for alt-reference frames
- // we need to make room in the series of timestamps. Since there can
- // only be 1 alt-ref frame ( current bitstream) multiplying by 2
- // gives us enough room.
- cfg.g_timebase.den *= 2;
}
- outfile = strcmp(out_fn, "-") ? fopen(out_fn, "wb") : stdout;
+ outfile = strcmp(out_fn, "-") ? fopen(out_fn, "wb")
+ : set_binary_mode(stdout);
if (!outfile)
{
@@ -989,6 +1474,12 @@ int main(int argc, const char **argv_)
return EXIT_FAILURE;
}
+ if(write_webm && fseek(outfile, 0, SEEK_CUR))
+ {
+ fprintf(stderr, "WebM output to pipes not supported.\n");
+ return EXIT_FAILURE;
+ }
+
if (stats_fn)
{
if (!stats_open_file(&stats, stats_fn, pass))
@@ -1018,7 +1509,13 @@ int main(int argc, const char **argv_)
#endif
- write_ivf_file_header(outfile, &cfg, codec->fourcc, 0);
+ if(write_webm)
+ {
+ ebml.stream = outfile;
+ write_webm_file_header(&ebml, &cfg, &arg_framerate);
+ }
+ else
+ write_ivf_file_header(outfile, &cfg, codec->fourcc, 0);
/* Construct Encoder Context */
@@ -1047,6 +1544,7 @@ int main(int argc, const char **argv_)
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt;
struct vpx_usec_timer timer;
+ int64_t frame_start;
if (!arg_limit || frames_in < arg_limit)
{
@@ -1065,10 +1563,12 @@ int main(int argc, const char **argv_)
vpx_usec_timer_start(&timer);
- // since we halved our timebase we need to double the timestamps
- // and duration we pass in.
- vpx_codec_encode(&encoder, frame_avail ? &raw : NULL, (frames_in - 1) * 2,
- 2, 0, arg_deadline);
+ frame_start = (cfg.g_timebase.den * (int64_t)(frames_in - 1)
+ * arg_framerate.den) / cfg.g_timebase.num / arg_framerate.num;
+ vpx_codec_encode(&encoder, frame_avail ? &raw : NULL, frame_start,
+ cfg.g_timebase.den * arg_framerate.den
+ / cfg.g_timebase.num / arg_framerate.num,
+ 0, arg_deadline);
vpx_usec_timer_mark(&timer);
cx_time += vpx_usec_timer_elapsed(&timer);
ctx_exit_on_error(&encoder, "Failed to encode frame");
@@ -1084,8 +1584,22 @@ int main(int argc, const char **argv_)
frames_out++;
fprintf(stderr, " %6luF",
(unsigned long)pkt->data.frame.sz);
- write_ivf_frame_header(outfile, pkt);
- fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile);
+
+ if(write_webm)
+ {
+ /* Update the hash */
+ if(!ebml.debug)
+ hash = murmur(pkt->data.frame.buf,
+ pkt->data.frame.sz, hash);
+
+ write_webm_block(&ebml, &cfg, pkt);
+ }
+ else
+ {
+ write_ivf_frame_header(outfile, pkt);
+ if(fwrite(pkt->data.frame.buf, 1,
+ pkt->data.frame.sz, outfile));
+ }
nbytes += pkt->data.raw.sz;
break;
case VPX_CODEC_STATS_PKT:
@@ -1103,8 +1617,14 @@ int main(int argc, const char **argv_)
{
int i;
+ psnr_sse_total += pkt->data.psnr.sse[0];
+ psnr_samples_total += pkt->data.psnr.samples[0];
for (i = 0; i < 4; i++)
+ {
fprintf(stderr, "%.3lf ", pkt->data.psnr.psnr[i]);
+ psnr_totals[i] += pkt->data.psnr.psnr[i];
+ }
+ psnr_count++;
}
break;
@@ -1116,24 +1636,43 @@ int main(int argc, const char **argv_)
fflush(stdout);
}
- /* this bitrate calc is simplified and relies on the fact that this
- * application uses 1/timebase for framerate.
- */
fprintf(stderr,
"\rPass %d/%d frame %4d/%-4d %7ldB %7ldb/f %7"PRId64"b/s"
" %7lu %s (%.2f fps)\033[K", pass + 1,
arg_passes, frames_in, frames_out, nbytes, nbytes * 8 / frames_in,
- nbytes * 8 *(int64_t)cfg.g_timebase.den/2/ cfg.g_timebase.num / frames_in,
+ nbytes * 8 *(int64_t)arg_framerate.num / arg_framerate.den / frames_in,
cx_time > 9999999 ? cx_time / 1000 : cx_time,
cx_time > 9999999 ? "ms" : "us",
(float)frames_in * 1000000.0 / (float)cx_time);
+ if ( (show_psnr) && (psnr_count>0) )
+ {
+ int i;
+ double ovpsnr = vp8_mse2psnr(psnr_samples_total, 255.0,
+ psnr_sse_total);
+
+ fprintf(stderr, "\nPSNR (Overall/Avg/Y/U/V)");
+
+ fprintf(stderr, " %.3lf", ovpsnr);
+ for (i = 0; i < 4; i++)
+ {
+ fprintf(stderr, " %.3lf", psnr_totals[i]/psnr_count);
+ }
+ }
+
vpx_codec_destroy(&encoder);
fclose(infile);
- if (!fseek(outfile, 0, SEEK_SET))
- write_ivf_file_header(outfile, &cfg, codec->fourcc, frames_out);
+ if(write_webm)
+ {
+ write_webm_file_footer(&ebml, hash);
+ }
+ else
+ {
+ if (!fseek(outfile, 0, SEEK_SET))
+ write_ivf_file_header(outfile, &cfg, codec->fourcc, frames_out);
+ }
fclose(outfile);
stats_close(&stats);
diff --git a/y4minput.c b/y4minput.c
index 3eaec4ed3..449afe858 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -877,5 +877,5 @@ int y4m_input_fetch_frame(y4m_input *_y4m,FILE *_fin,vpx_image_t *_img){
_img->planes[PLANE_Y]=_y4m->dst_buf;
_img->planes[PLANE_U]=_y4m->dst_buf+pic_sz;
_img->planes[PLANE_V]=_y4m->dst_buf+pic_sz+c_sz;
- return 0;
+ return 1;
}