/*
 *  HalfpelmotionBP.c
 *
 *  This module contains procedures for bandpass motion compensation with
 *  halfpel accuracy. Halfpel-compensation is accomplished by horizontal
 *  or vertical interpolation respectively. There are 2 procedures for
 *  16x16 and 8x8 sized blocks, each with 4 cases for all possible
 *  combinations of hor./vert. interpolation.
 *
 *  For more elaborate comments on VIS implementation see Halfpelmotion.c,
 *  which uses very much the same methods.
 *
 *  $Id: HalfPelMotionBP.c,v 1.1.1.1 1997/12/16 13:14:41 jnweiger Exp $
 */

#include <string.h>

#include "Crop.h"
#include "config.h"
#include "vis.h"


#ifndef USE_VIS

/*
 *  Integer arithmetic can't be accelerated by using parallel adds (like
 *  in Halfpelmotion.c) due to the necessity of final clipping. However, it
 *  would be possible to speed things up by collecting 4 sums into 32-bit
 *  values. Then some shifts can be spared by using 4 different clipping
 *  tables.
 */

/*
 *  CompensateBPHalfPelMotion16
 *
 *  Bandpass motion compensation of a 16x16 block with halfpel accuracy.
 *  Every sample of dec is replaced by CLIP(dec + pred - mot), where pred
 *  is read from ref and interpolated as selected by xh/yh:
 *
 *    xh yh   pred for column x of a row
 *    0  0    ref[x]
 *    0  1    (ref[x] + ref[x+rb] + 1) >> 1                      (vertical)
 *    1  0    (ref[x] + ref[x+1] + 1) >> 1                       (horizontal)
 *    1  1    (ref[x] + ref[x+1] + ref[x+rb] + ref[x+rb+1] + 2) >> 2
 *
 *  dec  block that is updated in place
 *  ref  block the prediction is read from; with xh==1 a 17th column is
 *       read, with yh==1 a 17th row
 *  mot  block that is subtracted from the sum
 *  xh   1 selects horizontal interpolation, 0 none
 *  yh   1 selects vertical interpolation, 0 none
 *  rb   offset from one row to the next, applied to dec, ref and mot alike
 *
 *  Any other xh/yh combination leaves dec untouched.
 *
 *  All sums are formed in signed int. The value handed to CLIP may be
 *  negative (dec + pred < mot); computing it in unsigned arithmetic would
 *  wrap such a value to a huge positive number before it is clipped.
 *  NOTE(review): this assumes Byte is an unsigned 8-bit type, so that
 *  samples promote to non-negative ints -- Byte and CLIP are declared
 *  outside this file, please confirm.
 */
void CompensateBPHalfPelMotion16(Byte *dec, Byte *ref, Byte *mot, int xh, int yh, int rb) {
  int row, col;
  int pred;          /* (interpolated) prediction sample */
  Byte *below;       /* reference row directly underneath the current one */

  switch ((xh<<1)|yh) {
  case 0:
    /* fullpel position: no interpolation */
    for (row = 0; row < 16; row++) {
      for (col = 0; col < 16; col++) {
        dec[col] = CLIP(dec[col] + ref[col] - mot[col]);
      }
      ref += rb;
      dec += rb;
      mot += rb;
    }
    break;
  case 1:
    /* vertical halfpel: rounded mean of two rows */
    for (row = 0; row < 16; row++) {
      below = ref + rb;
      for (col = 0; col < 16; col++) {
        pred = (ref[col] + below[col] + 1) >> 1;
        dec[col] = CLIP(dec[col] + pred - mot[col]);
      }
      ref += rb;
      dec += rb;
      mot += rb;
    }
    break;
  case 2:
    /* horizontal halfpel: rounded mean of two neighbouring columns */
    for (row = 0; row < 16; row++) {
      for (col = 0; col < 16; col++) {
        pred = (ref[col] + ref[col+1] + 1) >> 1;
        dec[col] = CLIP(dec[col] + pred - mot[col]);
      }
      ref += rb;
      dec += rb;
      mot += rb;
    }
    break;
  case 3:
    /* diagonal halfpel: rounded mean of the four surrounding samples */
    for (row = 0; row < 16; row++) {
      below = ref + rb;
      for (col = 0; col < 16; col++) {
        pred = (ref[col] + ref[col+1] + below[col] + below[col+1] + 2) >> 2;
        dec[col] = CLIP(dec[col] + pred - mot[col]);
      }
      ref += rb;
      dec += rb;
      mot += rb;
    }
    break;
  default:
    /* xh/yh outside {0,1}: nothing to do */
    break;
  }
}

/*
 *  CompensateBPHalfPelMotion8
 *
 *  Bandpass motion compensation of an 8x8 block with halfpel accuracy.
 *  Every sample of dec is replaced by CLIP(dec + pred - mot), where pred
 *  is read from ref and interpolated as selected by xh/yh:
 *
 *    xh yh   pred for column x of a row
 *    0  0    ref[x]
 *    0  1    (ref[x] + ref[x+rb] + 1) >> 1                      (vertical)
 *    1  0    (ref[x] + ref[x+1] + 1) >> 1                       (horizontal)
 *    1  1    (ref[x] + ref[x+1] + ref[x+rb] + ref[x+rb+1] + 2) >> 2
 *
 *  dec  block that is updated in place
 *  ref  block the prediction is read from; with xh==1 a 9th column is
 *       read, with yh==1 a 9th row
 *  mot  block that is subtracted from the sum
 *  xh   1 selects horizontal interpolation, 0 none
 *  yh   1 selects vertical interpolation, 0 none
 *  rb   offset from one row to the next, applied to dec, ref and mot alike
 *
 *  Any other xh/yh combination leaves dec untouched.
 *
 *  All sums are formed in signed int. The value handed to CLIP may be
 *  negative (dec + pred < mot); computing it in unsigned arithmetic would
 *  wrap such a value to a huge positive number before it is clipped.
 *  NOTE(review): this assumes Byte is an unsigned 8-bit type, so that
 *  samples promote to non-negative ints -- Byte and CLIP are declared
 *  outside this file, please confirm.
 */
void CompensateBPHalfPelMotion8(Byte *dec, Byte *ref, Byte *mot, int xh, int yh, int rb) {
  int row, col;
  int pred;          /* (interpolated) prediction sample */
  Byte *below;       /* reference row directly underneath the current one */

  switch ((xh<<1)|yh) {
  case 0:
    /* fullpel position: no interpolation */
    for (row = 0; row < 8; row++) {
      for (col = 0; col < 8; col++) {
        dec[col] = CLIP(dec[col] + ref[col] - mot[col]);
      }
      ref += rb;
      dec += rb;
      mot += rb;
    }
    break;
  case 1:
    /* vertical halfpel: rounded mean of two rows */
    for (row = 0; row < 8; row++) {
      below = ref + rb;
      for (col = 0; col < 8; col++) {
        pred = (ref[col] + below[col] + 1) >> 1;
        dec[col] = CLIP(dec[col] + pred - mot[col]);
      }
      ref += rb;
      dec += rb;
      mot += rb;
    }
    break;
  case 2:
    /* horizontal halfpel: rounded mean of two neighbouring columns */
    for (row = 0; row < 8; row++) {
      for (col = 0; col < 8; col++) {
        pred = (ref[col] + ref[col+1] + 1) >> 1;
        dec[col] = CLIP(dec[col] + pred - mot[col]);
      }
      ref += rb;
      dec += rb;
      mot += rb;
    }
    break;
  case 3:
    /* diagonal halfpel: rounded mean of the four surrounding samples */
    for (row = 0; row < 8; row++) {
      below = ref + rb;
      for (col = 0; col < 8; col++) {
        pred = (ref[col] + ref[col+1] + below[col] + below[col+1] + 2) >> 2;
        dec[col] = CLIP(dec[col] + pred - mot[col]);
      }
      ref += rb;
      dec += rb;
      mot += rb;
    }
    break;
  default:
    /* xh/yh outside {0,1}: nothing to do */
    break;
  }
}

#else

/* ========================================================================= *
 * =====                      VIS  IMPLEMENTATION                       ==== *
 * ========================================================================= */
/*
 *  Please see Halfpelmotion.c for a description of the most important
 *  implementation issues. Only one additional aspect arises with bandpass
 *  motion compensation:
 *  - the division by 2 after the interpolation isn't the last step in
 *    the computation, so pack normally can't be used to perform it.
 *    However, if the other two addends (mot and dst in the code below)
 *    are scaled by two relative to the operands of the interpolation,
 *    no divide is needed in between the interpolation and the bandpass
 *    additions. fmul8x16 is used to expand these addends by 5 bits.
 */

void CompensateBPHalfPelMotion16(dst,src,mot,xh,yh,rb)
Byte *src,*dst,*mot;
int xh,yh,rb;
{
  Byte *src_a;               /* aligned source and destination */
  int i;                     /* loop counter */
  vis_d64 round,             /* addend for rounding */
          scale;             /* for shifting via fmul16x16 */
    /*     vis_d64     null = vis_fzero(); */
    int off = (unsigned long)src & 7;  /* misalignment */

  switch ((xh<<1)|yh) {
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 0:

    /* ================  COPY 16x16 ================== */


    vis_write_gsr((7-4)<<3);       /* GSR-scale must only compensate fexpand */
    scale = vis_to_double_quad(1<<(4+8));
    src_a = vis_alignaddr(src,0);
 
    if(!off) {
      vis_d64 s1,s2,d1,d2,m1,m2,s11,s12,s21,s22,d11,d12,d21,d22,
              m11,m12,m21,m22,r11,r12,r21,r22;

 
      for(i=16;i;i--) {
        s1 = vis_ld64(src_a);
        s2 = vis_ld64(src_a+8); src_a += rb;
        d1 = vis_ld64(dst);
        d2 = vis_ld64(dst+8);
        m1 = vis_ld64(mot);
        m2 = vis_ld64(mot+8); mot += rb;
	/*
	 *  This is a little trick: using fmul8x16 instead of fexpand here,
	 *  because it can be executed in parallel to other fexpands or
	 *  add/sub instructions ("mul" uses FGM, "add" FGA pipeline)
	 */
        s11 = vis_fmul8x16_hi(s1,scale); s12 = vis_fexpand_lo(s1);
        s21 = vis_fmul8x16_hi(s2,scale); s22 = vis_fmul8x16_lo(s2,scale);
        d11 = vis_fmul8x16_hi(d1,scale); d12 = vis_fexpand_lo(d1);
        d21 = vis_fmul8x16_hi(d2,scale); d22 = vis_fmul8x16_lo(d2,scale);
        m11 = vis_fmul8x16_hi(m1,scale); m12 = vis_fexpand_lo(m1);
        m21 = vis_fmul8x16_hi(m2,scale); m22 = vis_fmul8x16_lo(m2,scale);
        r11 = vis_fpsub16(vis_fpadd16(s11,d11),m11);
        r12 = vis_fpsub16(vis_fpadd16(s12,d12),m12);
        r21 = vis_fpsub16(vis_fpadd16(s21,d21),m21);
        r22 = vis_fpsub16(vis_fpadd16(s22,d22),m22);
        vis_st64_pack2(r11,r12, dst);
        vis_st64_pack2(r21,r22, dst+8); dst += rb;
      }
    }
    else {
      vis_d64 s1,s2,s1a,s1b,s1c,d1,d2,m1,m2,s11,s12,s21,s22,d11,d12,d21,d22,
              m11,m12,m21,m22,r11,r12,r21,r22;
 
      for(i=16;i;i--) {
        s1a = vis_ld64(src_a);
        s1b = vis_ld64(src_a+8);
        s1c = vis_ld64(src_a+16); src_a += rb;
        d1 = vis_ld64(dst);
        d2 = vis_ld64(dst+8);
        s1 = vis_faligndata(s1a,s1b);
        s2 = vis_faligndata(s1b,s1c);
        m1 = vis_ld64(mot);
        m2 = vis_ld64(mot+8); mot += rb;
        s11 = vis_fmul8x16_hi(s1,scale); s12 = vis_fexpand_lo(s1);
        s21 = vis_fmul8x16_hi(s2,scale); s22 = vis_fexpand_lo(s2);
        d11 = vis_fmul8x16_hi(d1,scale); d12 = vis_fexpand_lo(d1);
        d21 = vis_fmul8x16_hi(d2,scale); d22 = vis_fexpand_lo(d2);
        m11 = vis_fmul8x16_hi(m1,scale); m12 = vis_fexpand_lo(m1);
        m21 = vis_fmul8x16_hi(m2,scale); m22 = vis_fexpand_lo(m2);
        r11 = vis_fpsub16(vis_fpadd16(s11,d11),m11);
        r12 = vis_fpsub16(vis_fpadd16(s12,d12),m12);
        r21 = vis_fpsub16(vis_fpadd16(s21,d21),m21);
        r22 = vis_fpsub16(vis_fpadd16(s22,d22),m22);
        vis_st64_pack2(r11,r12, dst);
        vis_st64_pack2(r21,r22, dst+8); dst += rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 1:

    /* ================  INTERPOLATE 16x16 VERTICAL ================== */


    round = vis_to_double_quad(1<<4);
    scale = vis_to_double_quad(2<<(4+8));
    vis_write_gsr((7-4-1)<<3);     /* divide by 2 and reverse expand */
    src_a = vis_alignaddr(src,0);
 
    if(!off) {
      vis_d64 s11,s12,s11h,s11l,s12h,s12l,s21,m11,d11,d12,d21,d22,s21h,s21l,
              s22h,s22,s22l,m11h,m11l,m12h,m12,m12l,m21h,m21,m21l,m22h,m22,m22l,
	      d11h,d11l,d12h,d12l,d21h,d21l,d22h,d22l,r11h,r11l,r12h,r12l,
	      r21h,r21l,r22h,r22l;
 
      s11 = vis_ld64(src);
      s12 = vis_ld64(src+8); src += rb;
      s11h = vis_fexpand_hi(s11); s11l = vis_fexpand_lo(s11);
      s12h = vis_fexpand_hi(s12); s12l = vis_fexpand_lo(s12);
      s11h = vis_fpadd16(round,s11h); s11l = vis_fpadd16(round,s11l);
      s12h = vis_fpadd16(round,s12h); s12l = vis_fpadd16(round,s12l);
 
      for(i=8;i;i--) {
        s21 = vis_ld64(src);
        s22 = vis_ld64(src+8); src += rb;
        m11 = vis_ld64(mot);
        m12 = vis_ld64(mot+8); mot += rb;
        d11 = vis_ld64(dst);
        d12 = vis_ld64(dst+8);
        s11 = vis_ld64(src);
        s12 = vis_ld64(src+8); src += rb;
        m21 = vis_ld64(mot);
        m22 = vis_ld64(mot+8); mot += rb;
        d21 = vis_ld64(dst+rb);
        d22 = vis_ld64(dst+rb+8);
        s21h = vis_fexpand_hi(s21); s21l = vis_fexpand_lo(s21);
        s22h = vis_fexpand_hi(s22); s22l = vis_fexpand_lo(s22);
        m11h = vis_fmul8x16_hi(m11,scale); m11l = vis_fmul8x16_lo(m11,scale);
        m12h = vis_fmul8x16_hi(m12,scale); m12l = vis_fmul8x16_lo(m12,scale);
        d11h = vis_fmul8x16_hi(d11,scale); d11l = vis_fmul8x16_lo(d11,scale);
        d12h = vis_fmul8x16_hi(d12,scale); d12l = vis_fmul8x16_lo(d12,scale);
        m21h = vis_fmul8x16_hi(m21,scale); m21l = vis_fmul8x16_lo(m21,scale);
        m22h = vis_fmul8x16_hi(m22,scale); m22l = vis_fmul8x16_lo(m22,scale);
        d21h = vis_fmul8x16_hi(d21,scale); d21l = vis_fmul8x16_lo(d21,scale);
        d22h = vis_fmul8x16_hi(d22,scale); d22l = vis_fmul8x16_lo(d22,scale);
        r11h = vis_fpsub16(vis_fpadd16(d11h,vis_fpadd16(s11h,s21h)),m11h);
        r11l = vis_fpsub16(vis_fpadd16(d11l,vis_fpadd16(s11l,s21l)),m11l);
        r12h = vis_fpsub16(vis_fpadd16(d12h,vis_fpadd16(s12h,s22h)),m12h);
        r12l = vis_fpsub16(vis_fpadd16(d12l,vis_fpadd16(s12l,s22l)),m12l);
        s11h = vis_fexpand_hi(s11); s11l = vis_fexpand_lo(s11);
        s12h = vis_fexpand_hi(s12); s12l = vis_fexpand_lo(s12);
	s11h = vis_fpadd16(round,s11h); s11l = vis_fpadd16(round,s11l);
	s12h = vis_fpadd16(round,s12h); s12l = vis_fpadd16(round,s12l);
        r21h = vis_fpsub16(vis_fpadd16(d21h,vis_fpadd16(s11h,s21h)),m21h);
        r21l = vis_fpsub16(vis_fpadd16(d21l,vis_fpadd16(s11l,s21l)),m21l);
        r22h = vis_fpsub16(vis_fpadd16(d22h,vis_fpadd16(s12h,s22h)),m22h);
        r22l = vis_fpsub16(vis_fpadd16(d22l,vis_fpadd16(s12l,s22l)),m22l);
        vis_st64_pack2(r11h, r11l, dst);
        vis_st64_pack2(r12h, r12l, dst+8);
        vis_st64_pack2(r21h, r21l, dst+rb);
        vis_st64_pack2(r22h, r22l, dst+rb+8); dst += 2*rb;
      }
    }
    else {
      /*
       *  same as above, only with additional misalignment handling
       */
      vis_d64 s1a,s1b,s1c,s2a,s2b,s2c,s11,s12,s11h,s11l,s12h,s12l,s21,m11,
              d11,d12,d21,d22,s21h,s21l,s22h,s22,s22l,m11h,m11l,m12h,m12,m12l,
	      m21h,m21,m21l,m22h,m22,m22l,d11h,d11l,d12h,d12l,
	      d21h,d21l,d22h,d22l,r11h,r11l,r12h,r12l,r21h,r21l,r22h,r22l;
 
      s1a = vis_ld64(src_a);
      s1b = vis_ld64(src_a+8);
      s1c = vis_ld64(src_a+16); src_a += rb;
      s11 = vis_faligndata(s1a,s1b);
      s12 = vis_faligndata(s1b,s1c);
      s11h = vis_fexpand_hi(s11); s11l = vis_fexpand_lo(s11);
      s12h = vis_fexpand_hi(s12); s12l = vis_fexpand_lo(s12);
      s11h = vis_fpadd16(round,s11h); s11l = vis_fpadd16(round,s11l);
      s12h = vis_fpadd16(round,s12h); s12l = vis_fpadd16(round,s12l);
 
      for(i=8;i;i--) {
	s2a = vis_ld64(src_a);
	s2b = vis_ld64(src_a+8);
	s2c = vis_ld64(src_a+16); src_a += rb;
        m11 = vis_ld64(mot);
        m12 = vis_ld64(mot+8); mot += rb;
        d11 = vis_ld64(dst);
        d12 = vis_ld64(dst+8);
	s1a = vis_ld64(src_a);
	s1b = vis_ld64(src_a+8);
	s1c = vis_ld64(src_a+16); src_a += rb;
	s21 = vis_faligndata(s2a,s2b);
	s22 = vis_faligndata(s2b,s2c);
        m21 = vis_ld64(mot);
        m22 = vis_ld64(mot+8); mot += rb;
        d21 = vis_ld64(dst+rb);
        d22 = vis_ld64(dst+rb+8);
	s11 = vis_faligndata(s1a,s1b);
	s12 = vis_faligndata(s1b,s1c);
	/* same as case [off==0] from here on */
        s21h = vis_fexpand_hi(s21); s21l = vis_fexpand_lo(s21);
        s22h = vis_fexpand_hi(s22); s22l = vis_fexpand_lo(s22);
        m11h = vis_fmul8x16_hi(m11,scale); m11l = vis_fmul8x16_lo(m11,scale);
        m12h = vis_fmul8x16_hi(m12,scale); m12l = vis_fmul8x16_lo(m12,scale);
        d11h = vis_fmul8x16_hi(d11,scale); d11l = vis_fmul8x16_lo(d11,scale);
        d12h = vis_fmul8x16_hi(d12,scale); d12l = vis_fmul8x16_lo(d12,scale);
        m21h = vis_fmul8x16_hi(m21,scale); m21l = vis_fmul8x16_lo(m21,scale);
        m22h = vis_fmul8x16_hi(m22,scale); m22l = vis_fmul8x16_lo(m22,scale);
        d21h = vis_fmul8x16_hi(d21,scale); d21l = vis_fmul8x16_lo(d21,scale);
        d22h = vis_fmul8x16_hi(d22,scale); d22l = vis_fmul8x16_lo(d22,scale);
        r11h = vis_fpsub16(vis_fpadd16(d11h,vis_fpadd16(s11h,s21h)),m11h);
        r11l = vis_fpsub16(vis_fpadd16(d11l,vis_fpadd16(s11l,s21l)),m11l);
        r12h = vis_fpsub16(vis_fpadd16(d12h,vis_fpadd16(s12h,s22h)),m12h);
        r12l = vis_fpsub16(vis_fpadd16(d12l,vis_fpadd16(s12l,s22l)),m12l);
        s11h = vis_fexpand_hi(s11); s11l = vis_fexpand_lo(s11);
        s12h = vis_fexpand_hi(s12); s12l = vis_fexpand_lo(s12);
	s11h = vis_fpadd16(round,s11h); s11l = vis_fpadd16(round,s11l);
	s12h = vis_fpadd16(round,s12h); s12l = vis_fpadd16(round,s12l);
        r21h = vis_fpsub16(vis_fpadd16(d21h,vis_fpadd16(s11h,s21h)),m21h);
        r21l = vis_fpsub16(vis_fpadd16(d21l,vis_fpadd16(s11l,s21l)),m21l);
        r22h = vis_fpsub16(vis_fpadd16(d22h,vis_fpadd16(s12h,s22h)),m22h);
        r22l = vis_fpsub16(vis_fpadd16(d22l,vis_fpadd16(s12l,s22l)),m22l);
        vis_st64_pack2(r11h, r11l, dst);
        vis_st64_pack2(r12h, r12l, dst+8);
        vis_st64_pack2(r21h, r21l, dst+rb);
        vis_st64_pack2(r22h, r22l, dst+rb+8); dst += 2*rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 2:

    /* ================  INTERPOLATE 16x16 HORIZONTAL ================== */


    round = vis_to_double_quad(1<<4);      /* round for division by 2 */
    vis_write_gsr((7-4-1)<<3);             /* pack and divide by 2 */
    scale = vis_to_double_quad(2<<(4+8));  /* expand and multiply by 2 */
    src_a = vis_alignaddr(src,0);          /* 8-aligned source pointer */
 
    if(!off) {
      vis_d64 s11,s12,s1n,s13,s14,d11,d12,m11,m12,m11h,m11l,m12h,m12l,
	      d11h,d11l,d12h,d12l,s11h,s11l,s12h,s12l,s13h,s13l,s14h,s14l,
	      r11h,r11l,r12h,r12l;
 
      vis_alignaddr_const(1);
 
      for(i=16;i;i--) {
        s11 = vis_ld64(src_a);
        s12 = vis_ld64(src_a+8);
        s1n = vis_ld64(src_a+16); src_a += rb;
        m11 = vis_ld64(mot);
        m12 = vis_ld64(mot+8); mot += rb;
        d11 = vis_ld64(dst);
        d12 = vis_ld64(dst+8);
        s13 = vis_faligndata(s11,s12);
        s14 = vis_faligndata(s12,s1n);
        m11h = vis_fmul8x16_hi(m11,scale); m11l = vis_fmul8x16_lo(m11,scale);
        m12h = vis_fmul8x16_hi(m12,scale); m12l = vis_fmul8x16_lo(m12,scale);
        d11h = vis_fmul8x16_hi(d11,scale); d11l = vis_fmul8x16_lo(d11,scale);
        d12h = vis_fmul8x16_hi(d12,scale); d12l = vis_fmul8x16_lo(d12,scale);
        s11h = vis_fpadd16(round,vis_fexpand_hi(s11));
        s11l = vis_fpadd16(round,vis_fexpand_lo(s11));
        s12h = vis_fpadd16(round,vis_fexpand_hi(s12));
        s12l = vis_fpadd16(round,vis_fexpand_lo(s12));
        s13h = vis_fexpand_hi(s13);
        s13l = vis_fexpand_lo(s13);
        s14h = vis_fexpand_hi(s14);
        s14l = vis_fexpand_lo(s14);
        r11h = vis_fpsub16(vis_fpadd16(d11h,vis_fpadd16(s11h,s13h)),m11h);
        r11l = vis_fpsub16(vis_fpadd16(d11l,vis_fpadd16(s11l,s13l)),m11l);
        r12h = vis_fpsub16(vis_fpadd16(d12h,vis_fpadd16(s12h,s14h)),m12h);
        r12l = vis_fpsub16(vis_fpadd16(d12l,vis_fpadd16(s12l,s14l)),m12l);
        vis_st64_pack2(r11h,r11l, dst);
        vis_st64_pack2(r12h,r12l, dst+8); dst += rb;
      }
    }
    else if(off == 7) {
      vis_d64 s11,s12,s1p,s13,s14,d11,d12,m11,m12,m11h,m11l,m12h,m12l,
	      d11h,d11l,d12h,d12l,s11h,s11l,s12h,s12l,s13h,s13l,s14h,s14l,
	      r11h,r11l,r12h,r12l;
 
      for(i=16;i;i--) {
        s1p = vis_ld64(src_a);
        s13 = vis_ld64(src_a+8);
        s14 = vis_ld64(src_a+16); src_a += rb;
        m11 = vis_ld64(mot);
        m12 = vis_ld64(mot+8); mot += rb;
        d11 = vis_ld64(dst);
        d12 = vis_ld64(dst+8);
        s11 = vis_faligndata(s1p,s13);
        s12 = vis_faligndata(s13,s14);
        /* same as above from here on */
        m11h = vis_fmul8x16_hi(m11,scale); m11l = vis_fmul8x16_lo(m11,scale);
        m12h = vis_fmul8x16_hi(m12,scale); m12l = vis_fmul8x16_lo(m12,scale);
        d11h = vis_fmul8x16_hi(d11,scale); d11l = vis_fmul8x16_lo(d11,scale);
        d12h = vis_fmul8x16_hi(d12,scale); d12l = vis_fmul8x16_lo(d12,scale);
        s11h = vis_fpadd16(round,vis_fexpand_hi(s11));
        s11l = vis_fpadd16(round,vis_fexpand_lo(s11));
        s12h = vis_fpadd16(round,vis_fexpand_hi(s12));
        s12l = vis_fpadd16(round,vis_fexpand_lo(s12));
        s13h = vis_fexpand_hi(s13);
        s13l = vis_fexpand_lo(s13);
        s14h = vis_fexpand_hi(s14);
        s14l = vis_fexpand_lo(s14);
        r11h = vis_fpsub16(vis_fpadd16(d11h,vis_fpadd16(s11h,s13h)),m11h);
        r11l = vis_fpsub16(vis_fpadd16(d11l,vis_fpadd16(s11l,s13l)),m11l);
        r12h = vis_fpsub16(vis_fpadd16(d12h,vis_fpadd16(s12h,s14h)),m12h);
        r12l = vis_fpsub16(vis_fpadd16(d12l,vis_fpadd16(s12l,s14l)),m12l);
        vis_st64_pack2(r11h,r11l, dst);
        vis_st64_pack2(r12h,r12l, dst+8); dst += rb;
      }
    }
    else {
      vis_d64 s1a,s1b,s1c,s11,s12,s13,s14,d11,d12,m11,m12,m11h,m11l,m12h,m12l,
	      d11h,d11l,d12h,d12l,s11h,s11l,s12h,s12l,s13h,s13l,s14h,s14l,
	      r11h,r11l,r12h,r12l;
 
      for(i=16;i;i--) {
        s1a = vis_ld64(src_a);
        s1b = vis_ld64(src_a+8);
        s1c = vis_ld64(src_a+16); src_a += rb;
        m11 = vis_ld64(mot);
        m12 = vis_ld64(mot+8); mot += rb;
        d11 = vis_ld64(dst);
        d12 = vis_ld64(dst+8);
        s11 = vis_faligndata(s1a,s1b);
        s12 = vis_faligndata(s1b,s1c);
	vis_alignaddr_const(off+1);
        m11h = vis_fmul8x16_hi(m11,scale); m11l = vis_fmul8x16_lo(m11,scale);
        m12h = vis_fmul8x16_hi(m12,scale); m12l = vis_fmul8x16_lo(m12,scale);
        d11h = vis_fmul8x16_hi(d11,scale); d11l = vis_fmul8x16_lo(d11,scale);
        d12h = vis_fmul8x16_hi(d12,scale); d12l = vis_fmul8x16_lo(d12,scale);
	s13 = vis_faligndata(s1a,s1b);
	s14 = vis_faligndata(s1b,s1c);
	vis_alignaddr_const(off);
        s11h = vis_fpadd16(round,vis_fexpand_hi(s11));
        s11l = vis_fpadd16(round,vis_fexpand_lo(s11));
        s12h = vis_fpadd16(round,vis_fexpand_hi(s12));
        s12l = vis_fpadd16(round,vis_fexpand_lo(s12));
        s13h = vis_fexpand_hi(s13);
        s13l = vis_fexpand_lo(s13);
        s14h = vis_fexpand_hi(s14);
        s14l = vis_fexpand_lo(s14);
        r11h = vis_fpsub16(vis_fpadd16(d11h,vis_fpadd16(s11h,s13h)),m11h);
        r11l = vis_fpsub16(vis_fpadd16(d11l,vis_fpadd16(s11l,s13l)),m11l);
        r12h = vis_fpsub16(vis_fpadd16(d12h,vis_fpadd16(s12h,s14h)),m12h);
        r12l = vis_fpsub16(vis_fpadd16(d12l,vis_fpadd16(s12l,s14l)),m12l);
        vis_st64_pack2(r11h,r11l, dst);
        vis_st64_pack2(r12h,r12l, dst+8); dst += rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 3:

    /* ============  INTERPOLATE 16x16 HORIZONTAL & VERTICAL  ============= */

    round = vis_to_double_quad(2<<4);      /* added to round divide by 4 */
    scale = vis_to_double_quad(4<<(4+8));  /* expand and multiply by 4 */
    vis_write_gsr((7-4-2)<<3);             /* divide by 4 and pack */
    src_a = vis_alignaddr(src,0);
 
    if(!off) {
      vis_d64 s11,s12,s1n,s13,s14,s21,s22,s2n,s23,s24,s11sh,s11sl,s12sh,s12sl,
              s21sh,s21sl,s22sh,s22sl,d11,d12,d21,d22,m11,m12,m21,m22,
	      d11h,d11l,d12h,d12l,d21h,d21l,d22h,d22l,m11h,m11l,m12h,m12l,
	      m21h,m21l,m22h,m22l,r11h,r11l,r12h,r12l,r21h,r21l,r22h,r22l;
 
      vis_alignaddr_const(1);
 
      s11 = vis_ld64(src_a);
      s12 = vis_ld64(src_a+8);
      s1n = vis_ld64(src_a+16); src_a += rb;
      s13 = vis_faligndata(s11,s12);
      s14 = vis_faligndata(s12,s1n);
      s11sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
                                            vis_fexpand_hi(s13)));
      s11sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
                                            vis_fexpand_lo(s13)));
      s12sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s12),
                                            vis_fexpand_hi(s14)));
      s12sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s12),
                                            vis_fexpand_lo(s14)));
 
      for(i=8;i;i--) {
	s21 = vis_ld64(src_a);
	s22 = vis_ld64(src_a+8);
	s2n = vis_ld64(src_a+16); src_a += rb;
        d11 = vis_ld64(dst);
        d12 = vis_ld64(dst+8);
        d21 = vis_ld64(dst+rb);
        d22 = vis_ld64(dst+rb+8);
        m11 = vis_ld64(mot);
        m12 = vis_ld64(mot+8); mot += rb;
        m21 = vis_ld64(mot);
        m22 = vis_ld64(mot+8); mot += rb;
        s23 = vis_faligndata(s21,s22);
        s24 = vis_faligndata(s22,s2n);
        d11h = vis_fmul8x16_hi(d11,scale); d11l = vis_fmul8x16_lo(d11,scale);
        d12h = vis_fmul8x16_hi(d12,scale); d12l = vis_fmul8x16_lo(d12,scale);
        d21h = vis_fmul8x16_hi(d21,scale); d21l = vis_fmul8x16_lo(d21,scale);
        d22h = vis_fmul8x16_hi(d22,scale); d22l = vis_fmul8x16_lo(d22,scale);
        m11h = vis_fmul8x16_hi(m11,scale); m11l = vis_fmul8x16_lo(m11,scale);
        m12h = vis_fmul8x16_hi(m12,scale); m12l = vis_fmul8x16_lo(m12,scale);
        m21h = vis_fmul8x16_hi(m21,scale); m21l = vis_fmul8x16_lo(m21,scale);
        m22h = vis_fmul8x16_hi(m22,scale); m22l = vis_fmul8x16_lo(m22,scale);
	s21sh = vis_fpadd16(vis_fexpand_hi(s21), vis_fexpand_hi(s23));
	s21sl = vis_fpadd16(vis_fexpand_lo(s21), vis_fexpand_lo(s23));
	s22sh = vis_fpadd16(vis_fexpand_hi(s22), vis_fexpand_hi(s24));
	s22sl = vis_fpadd16(vis_fexpand_lo(s22), vis_fexpand_lo(s24));
        r11h = vis_fpadd16(s11sh,s21sh); r11l = vis_fpadd16(s11sl,s21sl);
        r12h = vis_fpadd16(s12sh,s22sh); r12l = vis_fpadd16(s12sl,s22sl);
        r11h = vis_fpsub16(vis_fpadd16(r11h,d11h),m11h);
        r11l = vis_fpsub16(vis_fpadd16(r11l,d11l),m11l);
        r12h = vis_fpsub16(vis_fpadd16(r12h,d12h),m12h);
        r12l = vis_fpsub16(vis_fpadd16(r12l,d12l),m12l);
	s11 = vis_ld64(src_a);
	s12 = vis_ld64(src_a+8);
	s1n = vis_ld64(src_a+16); src_a += rb;
	s13 = vis_faligndata(s11,s12);
	s14 = vis_faligndata(s12,s1n);
        vis_st64_pack2(r11h,r11l, dst);
        vis_st64_pack2(r12h,r12l, dst+8); dst += rb;
	s11sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
					      vis_fexpand_hi(s13)));
	s11sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
					      vis_fexpand_lo(s13)));
	s12sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s12),
					      vis_fexpand_hi(s14)));
	s12sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s12),
					      vis_fexpand_lo(s14)));
        r21h = vis_fpadd16(s11sh,s21sh); r21l = vis_fpadd16(s11sl,s21sl);
        r22h = vis_fpadd16(s12sh,s22sh); r22l = vis_fpadd16(s12sl,s22sl);
        r21h = vis_fpsub16(vis_fpadd16(r21h,d21h),m21h);
        r21l = vis_fpsub16(vis_fpadd16(r21l,d21l),m21l);
        r22h = vis_fpsub16(vis_fpadd16(r22h,d22h),m22h);
        r22l = vis_fpsub16(vis_fpadd16(r22l,d22l),m22l);
        vis_st64_pack2(r21h,r21l, dst);
        vis_st64_pack2(r22h,r22l, dst+8); dst += rb;
      }
    }
    else if(off == 7) {
      vis_d64 s11,s12,s1p,s13,s14,s21,s22,s2p,s23,s24,s11sh,s11sl,s12sh,s12sl,
              s21sh,s21sl,s22sh,s22sl,d11,d12,d21,d22,m11,m12,m21,m22,
	      d11h,d11l,d12h,d12l,d21h,d21l,d22h,d22l,m11h,m11l,m12h,m12l,
	      m21h,m21l,m22h,m22l,r11h,r11l,r12h,r12l,r21h,r21l,r22h,r22l;
 
      s1p = vis_ld64(src_a);
      s13 = vis_ld64(src_a+8);
      s14 = vis_ld64(src_a+16); src_a += rb;
      s11 = vis_faligndata(s1p,s13);
      s12 = vis_faligndata(s13,s14);
      s11sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
                                            vis_fexpand_hi(s13)));
      s11sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
                                            vis_fexpand_lo(s13)));
      s12sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s12),
                                            vis_fexpand_hi(s14)));
      s12sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s12),
                                            vis_fexpand_lo(s14)));
 
      for(i=8;i;i--) {
	s2p = vis_ld64(src_a);
	s23 = vis_ld64(src_a+8);
	s24 = vis_ld64(src_a+16); src_a += rb;
        d11 = vis_ld64(dst);
        d12 = vis_ld64(dst+8);
        d21 = vis_ld64(dst+rb);
        d22 = vis_ld64(dst+rb+8);
        m11 = vis_ld64(mot);
        m12 = vis_ld64(mot+8); mot += rb;
        m21 = vis_ld64(mot);
        m22 = vis_ld64(mot+8); mot += rb;
        s21 = vis_faligndata(s2p,s23);
        s22 = vis_faligndata(s23,s24);
	/* same as above until next ld64 */
        d11h = vis_fmul8x16_hi(d11,scale); d11l = vis_fmul8x16_lo(d11,scale);
        d12h = vis_fmul8x16_hi(d12,scale); d12l = vis_fmul8x16_lo(d12,scale);
        d21h = vis_fmul8x16_hi(d21,scale); d21l = vis_fmul8x16_lo(d21,scale);
        d22h = vis_fmul8x16_hi(d22,scale); d22l = vis_fmul8x16_lo(d22,scale);
        m11h = vis_fmul8x16_hi(m11,scale); m11l = vis_fmul8x16_lo(m11,scale);
        m12h = vis_fmul8x16_hi(m12,scale); m12l = vis_fmul8x16_lo(m12,scale);
        m21h = vis_fmul8x16_hi(m21,scale); m21l = vis_fmul8x16_lo(m21,scale);
        m22h = vis_fmul8x16_hi(m22,scale); m22l = vis_fmul8x16_lo(m22,scale);
	s21sh = vis_fpadd16(vis_fexpand_hi(s21), vis_fexpand_hi(s23));
	s21sl = vis_fpadd16(vis_fexpand_lo(s21), vis_fexpand_lo(s23));
	s22sh = vis_fpadd16(vis_fexpand_hi(s22), vis_fexpand_hi(s24));
	s22sl = vis_fpadd16(vis_fexpand_lo(s22), vis_fexpand_lo(s24));
        r11h = vis_fpadd16(s11sh,s21sh); r11l = vis_fpadd16(s11sl,s21sl);
        r12h = vis_fpadd16(s12sh,s22sh); r12l = vis_fpadd16(s12sl,s22sl);
        r11h = vis_fpsub16(vis_fpadd16(r11h,d11h),m11h);
        r11l = vis_fpsub16(vis_fpadd16(r11l,d11l),m11l);
        r12h = vis_fpsub16(vis_fpadd16(r12h,d12h),m12h);
        r12l = vis_fpsub16(vis_fpadd16(r12l,d12l),m12l);
	s1p = vis_ld64(src_a);
	s13 = vis_ld64(src_a+8);
	s14 = vis_ld64(src_a+16); src_a += rb;
	s11 = vis_faligndata(s1p,s13);
	s12 = vis_faligndata(s13,s14);
        vis_st64_pack2(r11h,r11l, dst);
        vis_st64_pack2(r12h,r12l, dst+8); dst += rb;
	s11sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
					      vis_fexpand_hi(s13)));
	s11sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
					      vis_fexpand_lo(s13)));
	s12sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s12),
					      vis_fexpand_hi(s14)));
	s12sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s12),
					      vis_fexpand_lo(s14)));
        r21h = vis_fpadd16(s11sh,s21sh); r21l = vis_fpadd16(s11sl,s21sl);
        r22h = vis_fpadd16(s12sh,s22sh); r22l = vis_fpadd16(s12sl,s22sl);
        r21h = vis_fpsub16(vis_fpadd16(r21h,d21h),m21h);
        r21l = vis_fpsub16(vis_fpadd16(r21l,d21l),m21l);
        r22h = vis_fpsub16(vis_fpadd16(r22h,d22h),m22h);
        r22l = vis_fpsub16(vis_fpadd16(r22l,d22l),m22l);
        vis_st64_pack2(r21h,r21l, dst);
        vis_st64_pack2(r22h,r22l, dst+8); dst += rb;
      }
    }
    else {
      vis_d64 s1a,s1b,s1c,s11,s12,s13,s14,s2a,s2b,s2c,s21,s22,s23,s24,
              s11sh,s11sl,s12sh,s12sl,s21sh,s21sl,s22sh,s22sl,
	      m11,m12,m21,m22,m11h,m11l,m12h,m12l,m21h,m21l,m22h,m22l,
	      d11,d12,d21,d22,d11h,d11l,d12h,d12l,d21h,d21l,d22h,d22l,
	      r11h,r11l,r12h,r12l,r21h,r21l,r22h,r22l;
 
      s1a = vis_ld64(src_a);
      s1b = vis_ld64(src_a+8);
      s1c = vis_ld64(src_a+16); src_a += rb;
      s11 = vis_faligndata(s1a,s1b);
      s12 = vis_faligndata(s1b,s1c);
      vis_alignaddr_const(off+1);
      s13 = vis_faligndata(s1a,s1b);
      s14 = vis_faligndata(s1b,s1c);
      vis_alignaddr_const(off);
      s11sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
                                            vis_fexpand_hi(s13)));
      s11sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
                                            vis_fexpand_lo(s13)));
      s12sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s12),
                                            vis_fexpand_hi(s14)));
      s12sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s12),
                                            vis_fexpand_lo(s14)));
 
      /*
       *  Statements mixed (instead of blocked) b/c of lack of registers
       */
      for(i=8;i;i--) {
	s2a = vis_ld64(src_a);
	s2b = vis_ld64(src_a+8);
	s2c = vis_ld64(src_a+16); src_a += rb;
	s1a = vis_ld64(src_a);
	s1b = vis_ld64(src_a+8);
	s1c = vis_ld64(src_a+16); src_a += rb;
	s21 = vis_faligndata(s2a,s2b);
	s22 = vis_faligndata(s2b,s2c);
	s11 = vis_faligndata(s1a,s1b);
	s12 = vis_faligndata(s1b,s1c);
	vis_alignaddr_const(off+1);
	s23 = vis_faligndata(s2a,s2b);
	s24 = vis_faligndata(s2b,s2c);
	s13 = vis_faligndata(s1a,s1b);
	s14 = vis_faligndata(s1b,s1c);
	vis_alignaddr_const(off);
        d11 = vis_ld64(dst);
        d12 = vis_ld64(dst+8);
        d21 = vis_ld64(dst+rb);
        d22 = vis_ld64(dst+rb+8);
        m11 = vis_ld64(mot);
        m12 = vis_ld64(mot+8); mot += rb;
        m21 = vis_ld64(mot);
        m22 = vis_ld64(mot+8); mot += rb;
        d11h = vis_fmul8x16_hi(d11,scale); d11l = vis_fmul8x16_lo(d11,scale);
        m11h = vis_fmul8x16_hi(m11,scale); m11l = vis_fmul8x16_lo(m11,scale);
	s21sh = vis_fpadd16(vis_fexpand_hi(s21), vis_fexpand_hi(s23));
	s21sl = vis_fpadd16(vis_fexpand_lo(s21), vis_fexpand_lo(s23));
        r11h = vis_fpadd16(s11sh,s21sh); r11l = vis_fpadd16(s11sl,s21sl);
        r11h = vis_fpsub16(vis_fpadd16(r11h,d11h),m11h);
        r11l = vis_fpsub16(vis_fpadd16(r11l,d11l),m11l);
        d12h = vis_fmul8x16_hi(d12,scale); d12l = vis_fmul8x16_lo(d12,scale);
        m12h = vis_fmul8x16_hi(m12,scale); m12l = vis_fmul8x16_lo(m12,scale);
	s22sh = vis_fpadd16(vis_fexpand_hi(s22), vis_fexpand_hi(s24));
	s22sl = vis_fpadd16(vis_fexpand_lo(s22), vis_fexpand_lo(s24));
        r12h = vis_fpadd16(s12sh,s22sh); r12l = vis_fpadd16(s12sl,s22sl);
        r12h = vis_fpsub16(vis_fpadd16(r12h,d12h),m12h);
        r12l = vis_fpsub16(vis_fpadd16(r12l,d12l),m12l);
        vis_st64_pack2(r11h,r11l, dst);
        vis_st64_pack2(r12h,r12l, dst+8); dst += rb;
        d21h = vis_fmul8x16_hi(d21,scale); d21l = vis_fmul8x16_lo(d21,scale);
        d22h = vis_fmul8x16_hi(d22,scale); d22l = vis_fmul8x16_lo(d22,scale);
        m21h = vis_fmul8x16_hi(m21,scale); m21l = vis_fmul8x16_lo(m21,scale);
        m22h = vis_fmul8x16_hi(m22,scale); m22l = vis_fmul8x16_lo(m22,scale);
	s11sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
					      vis_fexpand_hi(s13)));
	s11sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
					      vis_fexpand_lo(s13)));
	s12sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s12),
					      vis_fexpand_hi(s14)));
	s12sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s12),
					      vis_fexpand_lo(s14)));
        r21h = vis_fpadd16(s11sh,s21sh); r21l = vis_fpadd16(s11sl,s21sl);
        r22h = vis_fpadd16(s12sh,s22sh); r22l = vis_fpadd16(s12sl,s22sl);
        r21h = vis_fpsub16(vis_fpadd16(r21h,d21h),m21h);
        r21l = vis_fpsub16(vis_fpadd16(r21l,d21l),m21l);
        r22h = vis_fpsub16(vis_fpadd16(r22h,d22h),m22h);
        r22l = vis_fpsub16(vis_fpadd16(r22l,d22l),m22l);
        vis_st64_pack2(r21h,r21l, dst);
        vis_st64_pack2(r22h,r22l, dst+8); dst += rb;
      }
    }
    break;
  }
}

/* ========================================================================== */

/*
 *  CompensateBPHalfPelMotion8  (VIS version)
 *
 *  Updates an 8x8 block in place:  dst = pack(dst + P(src) - mot), where
 *  P(src) is the reference block, optionally interpolated to halfpel
 *  position.  This mirrors the CLIP(dec + ref - mot) of the plain C version;
 *  the saturation is presumably performed by the pack step inside
 *  vis_st64_pack2 (defined in vis.h, not visible here -- TODO confirm).
 *
 *    dst  block that is read, modified and written back (8 rows of 8 bytes)
 *    src  reference block; may have any byte alignment
 *    mot  block that is subtracted
 *    xh   1 = interpolate horizontally (uses src[x] and src[x+1])
 *    yh   1 = interpolate vertically   (uses row y and row y+1)
 *    rb   distance in bytes between successive rows of dst, src and mot
 *
 *  dst and mot are accessed with plain vis_ld64 / vis_st64_pack2, so they are
 *  presumably required to be 8-byte aligned, and rb presumably has to be a
 *  multiple of 8 so that the alignment of every row is the same -- verify
 *  against the callers.
 *
 *  Values of (xh<<1)|yh outside 0..3 match no case and leave dst untouched.
 */
void
CompensateBPHalfPelMotion8(dst,src,mot,xh,yh,rb)
Byte *src,*dst,*mot;
int xh,yh,rb;
{
  Byte *src_a;               /* aligned source pointer */
  int i;                     /* loop counter */
  vis_d64 round,             /* addend for rounding */
          scale;             /* for shifting via fmul8x16 */
    /*     vis_d64     null = vis_fzero(); */
  int off = (unsigned long)src & 7;  /* misalignment */

  /* selector: 0 = fullpel, 1 = vertical, 2 = horizontal, 3 = both */
  switch ((xh<<1)|yh) {
  case 0:

    /* ================  8x8  ================== */


    vis_write_gsr((7-4)<<3);       /* GSR-scale must only compensate fexpand */
    scale = vis_to_double_quad(1<<(4+8));
    src_a = vis_alignaddr(src,0);

    /*
     *  In both branches the high half of each quadword is widened with
     *  fexpand_hi and the low half with fmul8x16_lo(...,scale); 'scale' is
     *  chosen above so that both halves end up with the same weight.
     */
    if(!off) {
      /* src is 8-byte aligned: one load per row */
      vis_d64 s1,d1,m1,s11,s12,d11,d12,m11,m12,r11,r12;

      for(i=8;i;i--) {
	s1 = vis_ld64(src_a); src_a += rb;
	d1 = vis_ld64(dst);
	m1 = vis_ld64(mot); mot += rb;
	s11 = vis_fexpand_hi(s1);
	s12 = vis_fmul8x16_lo(s1,scale);
	d11 = vis_fexpand_hi(d1);
	d12 = vis_fmul8x16_lo(d1,scale);
	m11 = vis_fexpand_hi(m1);
	m12 = vis_fmul8x16_lo(m1,scale);
	r11 = vis_fpsub16(vis_fpadd16(s11,d11),m11);
	r12 = vis_fpsub16(vis_fpadd16(s12,d12),m12);
	vis_st64_pack2(r11,r12, dst); dst += rb;
      }
    }
    else {
      /* src is unaligned: load both covering quadwords, extract via faligndata */
      vis_d64 s1a,s1b,s1,d1,m1,s11,s12,d11,d12,m11,m12,r11,r12;

      for(i=8;i;i--) {
	s1a = vis_ld64(src_a);
	s1b = vis_ld64(src_a+8); src_a += rb;
	s1 = vis_faligndata(s1a,s1b);
	d1 = vis_ld64(dst);
	m1 = vis_ld64(mot); mot += rb;
	s11 = vis_fexpand_hi(s1);
	s12 = vis_fmul8x16_lo(s1,scale);
	d11 = vis_fexpand_hi(d1);
	d12 = vis_fmul8x16_lo(d1,scale);
	m11 = vis_fexpand_hi(m1);
	m12 = vis_fmul8x16_lo(m1,scale);
	r11 = vis_fpsub16(vis_fpadd16(s11,d11),m11);
	r12 = vis_fpsub16(vis_fpadd16(s12,d12),m12);
	vis_st64_pack2(r11,r12, dst); dst += rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 1:

    /* ================  INTERPOLATE 8x8 VERTICAL  ================== */

    /*
     *  Two source rows are summed and the pack step halves the sum; dst and
     *  mot are therefore widened with a doubled 'scale' so that they keep
     *  unit weight after the halving.  'round' is added once per pair of
     *  source rows.  9 source rows are read in total (1 before the loop,
     *  2 per iteration); each iteration produces 2 destination rows.
     */
    round = vis_to_double_quad(1<<4);
    vis_write_gsr((7-4-1)<<3);     /* divide by 2 and reverse expand */
    scale = vis_to_double_quad(2<<(4+8));  /* expand and multiply by 2 */
    src_a = vis_alignaddr(src,0);

    if(!off) {                          /* source spans only 1 quadword */
      vis_d64 s1,s2,m1,m2,d1,d2,s1h,s1l,s2h,s2l,m1h,m1l,d1h,d1l,m2h,m2l,d2h,d2l,
              r1h,r1l,r2h,r2l;

      /* this branch reads through src directly instead of src_a */
      s1 = vis_ld64(src); src += rb;
      s1h = vis_fpadd16(round,vis_fexpand_hi(s1));
      s1l = vis_fpadd16(round,vis_fexpand_lo(s1));

      for(i=4;i;i--) {
	m1 = vis_ld64(mot);
	m2 = vis_ld64(mot+rb); mot += 2*rb;
	s2 = vis_ld64(src);
	s1 = vis_ld64(src+rb); src += 2*rb;
	d1 = vis_ld64(dst);
	d2 = vis_ld64(dst+rb);
	m1h = vis_fmul8x16_hi(m1,scale);
	m1l = vis_fmul8x16_lo(m1,scale);
	m2h = vis_fmul8x16_hi(m2,scale);
	m2l = vis_fmul8x16_lo(m2,scale);
	s2h = vis_fexpand_hi(s2);
	s2l = vis_fexpand_lo(s2);
	d1h = vis_fmul8x16_hi(d1,scale);
	d1l = vis_fmul8x16_lo(d1,scale);
	d2h = vis_fmul8x16_hi(d2,scale);
	d2l = vis_fmul8x16_lo(d2,scale);
	/* first output row: previous row (s1h/s1l, already rounded) + s2 */
	r1h = vis_fpsub16(vis_fpadd16(d1h,vis_fpadd16(s1h,s2h)),m1h);
	r1l = vis_fpsub16(vis_fpadd16(d1l,vis_fpadd16(s1l,s2l)),m1l);
	/* s1h/s1l now hold the newly loaded row; reused by the next iteration */
	s1h = vis_fpadd16(round,vis_fexpand_hi(s1));
	s1l = vis_fpadd16(round,vis_fexpand_lo(s1));
	/* second output row: s2 + new s1 */
	r2h = vis_fpsub16(vis_fpadd16(d2h,vis_fpadd16(s1h,s2h)),m2h);
	r2l = vis_fpsub16(vis_fpadd16(d2l,vis_fpadd16(s1l,s2l)),m2l);
	vis_st64_pack2(r1h, r1l, dst);
	vis_st64_pack2(r2h, r2l, dst+rb); dst += 2*rb;
      }
    }
    else {
      /* unaligned src: same scheme, each row assembled from two quadwords */
      vis_d64 s1,s2,m1,m2,d1,d2,m1h,m1l,s1h,s1l,s2h,s2l,d1h,d1l,m2h,m2l,
              d2h,d2l,r1h,r1l,r2h,r2l,s1a,s1b,s2a,s2b;

      s1a = vis_ld64(src_a);
      s1b = vis_ld64(src_a+8); src_a += rb;
      s1 = vis_faligndata(s1a,s1b);
      s1h = vis_fpadd16(round,vis_fexpand_hi(s1));
      s1l = vis_fpadd16(round,vis_fexpand_lo(s1));

      for(i=4;i;i--) {
	s2a = vis_ld64(src_a);
	s2b = vis_ld64(src_a+8); src_a += rb;
	s1a = vis_ld64(src_a);
	s1b = vis_ld64(src_a+8); src_a += rb;
	m1 = vis_ld64(mot);
	m2 = vis_ld64(mot+rb); mot += 2*rb;
	d1 = vis_ld64(dst);
	d2 = vis_ld64(dst+rb);
	s2 = vis_faligndata(s2a,s2b);
	s1 = vis_faligndata(s1a,s1b);
        m1h = vis_fmul8x16_hi(m1,scale);
        m1l = vis_fmul8x16_lo(m1,scale);
        m2h = vis_fmul8x16_hi(m2,scale);
        m2l = vis_fmul8x16_lo(m2,scale);
        s2h = vis_fexpand_hi(s2);
        s2l = vis_fexpand_lo(s2);
        d1h = vis_fmul8x16_hi(d1,scale);
        d1l = vis_fmul8x16_lo(d1,scale);
        d2h = vis_fmul8x16_hi(d2,scale);
        d2l = vis_fmul8x16_lo(d2,scale);
        r1h = vis_fpsub16(vis_fpadd16(d1h,vis_fpadd16(s1h,s2h)),m1h);
        r1l = vis_fpsub16(vis_fpadd16(d1l,vis_fpadd16(s1l,s2l)),m1l);
        s1h = vis_fpadd16(round,vis_fexpand_hi(s1));
        s1l = vis_fpadd16(round,vis_fexpand_lo(s1));
        r2h = vis_fpsub16(vis_fpadd16(d2h,vis_fpadd16(s1h,s2h)),m2h);
        r2l = vis_fpsub16(vis_fpadd16(d2l,vis_fpadd16(s1l,s2l)),m2l);
        vis_st64_pack2(r1h, r1l, dst);
        vis_st64_pack2(r2h, r2l, dst+rb); dst += 2*rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 2:

    /* ================  INTERPOLATE 8x8 HORIZONTAL  ================== */

    /*
     *  Each output row sums the 8 bytes at src and the 8 bytes at src+1.
     *  Three sub-cases differ only in how these two byte groups are
     *  obtained; the arithmetic afterwards is the same.  Two rows are
     *  processed per iteration.
     */
    round = vis_to_double_quad(1<<4);
    vis_write_gsr((7-4-1)<<3); /* set fpack16-scale to "shift 4+1 right" */
    scale = vis_to_double_quad(2<<(4+8));  /* expand and multiply by 2 */
    src_a = vis_alignaddr(src,0);

    if(!off) {
      vis_d64 s11,s1n,s12,s11h,s11l,s12h,s12l,d1,m1,d1h,d1l,m1h,m1l,r1h,r1l;
      vis_d64 s21,s2n,s22,s21h,s21l,s22h,s22l,d2,m2,d2h,d2l,m2h,m2l,r2h,r2l;
      /*
       *  src is 8-byte aligned
       */

      /* alignment 1: faligndata below is used to produce the src+1 group */
      vis_alignaddr_const(1);

      for(i=4;i;i--) {
	d1 = vis_ld64(dst);
	d2 = vis_ld64(dst+rb);
	s11 = vis_ld64(src_a);
	s1n = vis_ld64(src_a+8); src_a += rb;
	s21 = vis_ld64(src_a);
	s2n = vis_ld64(src_a+8); src_a += rb;
	m1 = vis_ld64(mot);
	m2 = vis_ld64(mot+rb); mot += 2*rb;
	d1h = vis_fmul8x16_hi(d1,scale); d1l = vis_fmul8x16_lo(d1,scale);
	d2h = vis_fmul8x16_hi(d2,scale); d2l = vis_fmul8x16_lo(d2,scale);
	s12 = vis_faligndata(s11,s1n);
	s22 = vis_faligndata(s21,s2n);
	m1h = vis_fmul8x16_hi(m1,scale); m1l = vis_fmul8x16_lo(m1,scale);
	m2h = vis_fmul8x16_hi(m2,scale); m2l = vis_fmul8x16_lo(m2,scale);
        s11h = vis_fpadd16(round,vis_fexpand_hi(s11));
        s11l = vis_fpadd16(round,vis_fexpand_lo(s11));
        s12h = vis_fexpand_hi(s12);
        s12l = vis_fexpand_lo(s12);
        s21h = vis_fpadd16(round,vis_fexpand_hi(s21));
        s21l = vis_fpadd16(round,vis_fexpand_lo(s21));
        s22h = vis_fexpand_hi(s22);
        s22l = vis_fexpand_lo(s22);
	r1h = vis_fpsub16(vis_fpadd16(d1h,vis_fpadd16(s11h,s12h)),m1h);
	r1l = vis_fpsub16(vis_fpadd16(d1l,vis_fpadd16(s11l,s12l)),m1l);
	r2h = vis_fpsub16(vis_fpadd16(d2h,vis_fpadd16(s21h,s22h)),m2h);
	r2l = vis_fpsub16(vis_fpadd16(d2l,vis_fpadd16(s21l,s22l)),m2l);
	vis_st64_pack2(r1h,r1l, dst);
	vis_st64_pack2(r2h,r2l, dst+rb); dst += 2*rb;
      }
    }
    else if(off == 7) {
      /*
       *  src is NOT 8-byte aligned, however (src+1) is
       */
      vis_d64 s11,s1a,s12,s11h,s11l,s12h,s12l,d1,m1,d1h,d1l,m1h,m1l,r1h,r1l;
      vis_d64 s21,s2a,s22,s21h,s21l,s22h,s22l,d2,m2,d2h,d2l,m2h,m2l,r2h,r2l;

      vis_alignaddr_const(7);

      /*
       *  The src+1 group (s12/s22) is loaded directly from src_a+8; only the
       *  src group (s11/s21) has to be extracted with faligndata.
       */
      for(i=4;i;i--) {
	d1 = vis_ld64(dst);
	d2 = vis_ld64(dst+rb);
	s1a = vis_ld64(src_a);
	s12 = vis_ld64(src_a+8); src_a += rb;
	s2a = vis_ld64(src_a);
	s22 = vis_ld64(src_a+8); src_a += rb;
	m1 = vis_ld64(mot);
	m2 = vis_ld64(mot+rb); mot += 2*rb;
	d1h = vis_fmul8x16_hi(d1,scale); d1l = vis_fmul8x16_lo(d1,scale);
	d2h = vis_fmul8x16_hi(d2,scale); d2l = vis_fmul8x16_lo(d2,scale);
	s11 = vis_faligndata(s1a,s12);
	s21 = vis_faligndata(s2a,s22);
	m1h = vis_fmul8x16_hi(m1,scale); m1l = vis_fmul8x16_lo(m1,scale);
	m2h = vis_fmul8x16_hi(m2,scale); m2l = vis_fmul8x16_lo(m2,scale);
        s11h = vis_fpadd16(round,vis_fexpand_hi(s11));
        s11l = vis_fpadd16(round,vis_fexpand_lo(s11));
        s12h = vis_fexpand_hi(s12);
        s12l = vis_fexpand_lo(s12);
        s21h = vis_fpadd16(round,vis_fexpand_hi(s21));
        s21l = vis_fpadd16(round,vis_fexpand_lo(s21));
        s22h = vis_fexpand_hi(s22);
        s22l = vis_fexpand_lo(s22);
	r1h = vis_fpsub16(vis_fpadd16(d1h,vis_fpadd16(s11h,s12h)),m1h);
	r1l = vis_fpsub16(vis_fpadd16(d1l,vis_fpadd16(s11l,s12l)),m1l);
	r2h = vis_fpsub16(vis_fpadd16(d2h,vis_fpadd16(s21h,s22h)),m2h);
	r2l = vis_fpsub16(vis_fpadd16(d2l,vis_fpadd16(s21l,s22l)),m2l);
	vis_st64_pack2(r1h,r1l, dst);
	vis_st64_pack2(r2h,r2l, dst+rb); dst += 2*rb;
      }
    }
    else {
      /*
       *  src is not 8-byte aligned
       */
      vis_d64 s1a,s1b,s11,s12,s11h,s11l,s12h,s12l,d1,m1,d1h,d1l,m1h,m1l,r1h,r1l;
      vis_d64 s2a,s2b,s21,s22,s21h,s21l,s22h,s22l,d2,m2,d2h,d2l,m2h,m2l,r2h,r2l;

      /*
       *  Both byte groups come from the same pair of quadwords: the src group
       *  is extracted with alignment 'off' (set by vis_alignaddr above), the
       *  src+1 group with alignment off+1.  The alignment is switched back to
       *  'off' inside the loop so that the next iteration starts correctly.
       *  NOTE(review): vis_alignaddr_const is given a run-time value here;
       *  its definition is in vis.h -- confirm that this is permitted.
       */
      for(i=4;i;i--) {
	s1a = vis_ld64(src_a);
	s1b = vis_ld64(src_a+8); src_a += rb;
	s2a = vis_ld64(src_a);
	s2b = vis_ld64(src_a+8); src_a += rb;
	d1 = vis_ld64(dst);
	d2 = vis_ld64(dst+rb);
	m1 = vis_ld64(mot);
	m2 = vis_ld64(mot+rb); mot += 2*rb;
	s11 = vis_faligndata(s1a,s1b);
	s21 = vis_faligndata(s2a,s2b);
	vis_alignaddr_const(off+1);
	d1h = vis_fmul8x16_hi(d1,scale); d1l = vis_fmul8x16_lo(d1,scale);
	d2h = vis_fmul8x16_hi(d2,scale); d2l = vis_fmul8x16_lo(d2,scale);
	m1h = vis_fmul8x16_hi(m1,scale); m1l = vis_fmul8x16_lo(m1,scale);
	m2h = vis_fmul8x16_hi(m2,scale); m2l = vis_fmul8x16_lo(m2,scale);
	s12 = vis_faligndata(s1a,s1b);
	s22 = vis_faligndata(s2a,s2b);
	vis_alignaddr_const(off);
        s11h = vis_fpadd16(round,vis_fexpand_hi(s11));
        s11l = vis_fpadd16(round,vis_fexpand_lo(s11));
        s12h = vis_fexpand_hi(s12);
        s12l = vis_fexpand_lo(s12);
        s21h = vis_fpadd16(round,vis_fexpand_hi(s21));
        s21l = vis_fpadd16(round,vis_fexpand_lo(s21));
        s22h = vis_fexpand_hi(s22);
        s22l = vis_fexpand_lo(s22);
	r1h = vis_fpsub16(vis_fpadd16(d1h,vis_fpadd16(s11h,s12h)),m1h);
	r1l = vis_fpsub16(vis_fpadd16(d1l,vis_fpadd16(s11l,s12l)),m1l);
	r2h = vis_fpsub16(vis_fpadd16(d2h,vis_fpadd16(s21h,s22h)),m2h);
	r2l = vis_fpsub16(vis_fpadd16(d2l,vis_fpadd16(s21l,s22l)),m2l);
	vis_st64_pack2(r1h,r1l, dst);
	vis_st64_pack2(r2h,r2l, dst+rb); dst += 2*rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 3:

    /* ============  INTERPOLATE 8x8 HORIZONTAL & VERTICAL  ============= */

    /*
     *  Each output value sums 4 source bytes (src and src+1 of two adjacent
     *  rows).  The horizontal sum of a row (s1sh/s1sl, with 'round' already
     *  added) is kept across iterations and reused as the upper row of the
     *  next pair.  9 source rows are read; 2 output rows per iteration.
     *  This is the last case, so no break follows it.
     */
    round = vis_to_double_quad(2<<4);      /* added to round divide by 4 */
    scale = vis_to_double_quad(4<<(4+8));  /* expand and multiply by 4 */
    vis_write_gsr((7-4-2)<<3);             /* divide by 4 and pack */
    src_a = vis_alignaddr(src,0);

    if(!off) {
      vis_d64 s11,s1n,s12,s21,s22,s2n,s1sh,s1sl,s2sh,s2sl,d1,d2,m1,m2,
	      d1h,d1l,d2h,d2l,m1h,m1l,m2h,m2l,r1h,r1l,r2h,r2l;

      vis_alignaddr_const(1);

      /* src is aligned here; this branch reads through src instead of src_a */
      s11 = vis_ld64(src);
      s1n = vis_ld64(src + 8); src += rb;
      s12 = vis_faligndata(s11,s1n);
      s1sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
                                           vis_fexpand_hi(s12)));
      s1sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
                                           vis_fexpand_lo(s12)));

      for(i=4;i;i--) {
	s21 = vis_ld64(src);
	s2n = vis_ld64(src + 8); src += rb;
	s11 = vis_ld64(src);
	s1n = vis_ld64(src + 8); src += rb;
	d1 = vis_ld64(dst);
	d2 = vis_ld64(dst+rb);
	m1 = vis_ld64(mot);
	m2 = vis_ld64(mot+rb); mot += 2*rb;
	s22 = vis_faligndata(s21,s2n);
	s12 = vis_faligndata(s11,s1n);
	d1h = vis_fmul8x16_hi(d1,scale); d1l = vis_fmul8x16_lo(d1,scale);
	d2h = vis_fmul8x16_hi(d2,scale); d2l = vis_fmul8x16_lo(d2,scale);
	m1h = vis_fmul8x16_hi(m1,scale); m1l = vis_fmul8x16_lo(m1,scale);
	m2h = vis_fmul8x16_hi(m2,scale); m2l = vis_fmul8x16_lo(m2,scale);
	/* horizontal sum */
	s2sh = vis_fpadd16(vis_fexpand_hi(s21),vis_fexpand_hi(s22));
	s2sl = vis_fpadd16(vis_fexpand_lo(s21),vis_fexpand_lo(s22));
	/* vertical sum */
	r1h = vis_fpadd16(s1sh,s2sh);
	r1l = vis_fpadd16(s1sl,s2sl);
	/* horizontal & vertical sum, next scanline */
	s1sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
					     vis_fexpand_hi(s12)));
	s1sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
					     vis_fexpand_lo(s12)));
	r2h = vis_fpadd16(s1sh,s2sh);
	r2l = vis_fpadd16(s1sl,s2sl);
	/* add the old dst value and subtract the mot block */
	r1h = vis_fpsub16(vis_fpadd16(r1h,d1h),m1h);
	r1l = vis_fpsub16(vis_fpadd16(r1l,d1l),m1l);
	r2h = vis_fpsub16(vis_fpadd16(r2h,d2h),m2h);
	r2l = vis_fpsub16(vis_fpadd16(r2l,d2l),m2l);
	/* write as 8 bit values */
	vis_st64_pack2(r1h,r1l, dst);
	vis_st64_pack2(r2h,r2l, dst+rb); dst += 2*rb;
      }
    }
    else if(off == 7) {
      /*
       *  src is NOT 8-byte aligned, however (src+1) is
       *  [only src-loads and faligndata is different from case off==0]
       */
      vis_d64 s11,s1a,s12,s21,s22,s2a,s1sh,s1sl,s2sh,s2sl,d1,d2,m1,m2,
	      d1h,d1l,d2h,d2l,m1h,m1l,m2h,m2l,r1h,r1l,r2h,r2l;

      vis_alignaddr_const(7);

      s1a = vis_ld64(src_a);
      s12 = vis_ld64(src_a + 8); src_a += rb;
      s11 = vis_faligndata(s1a,s12);
      s1sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
                                           vis_fexpand_hi(s12)));
      s1sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
                                           vis_fexpand_lo(s12)));

      for(i=4;i;i--) {
	s2a = vis_ld64(src_a);
	s22 = vis_ld64(src_a + 8); src_a += rb;
	s1a = vis_ld64(src_a);
	s12 = vis_ld64(src_a + 8); src_a += rb;
	d1 = vis_ld64(dst);
	d2 = vis_ld64(dst+rb);
	m1 = vis_ld64(mot);
	m2 = vis_ld64(mot+rb); mot += 2*rb;
	s21 = vis_faligndata(s2a,s22);
	s11 = vis_faligndata(s1a,s12);
	/* rest is identical to case off==0 */
	d1h = vis_fmul8x16_hi(d1,scale); d1l = vis_fmul8x16_lo(d1,scale);
	d2h = vis_fmul8x16_hi(d2,scale); d2l = vis_fmul8x16_lo(d2,scale);
	m1h = vis_fmul8x16_hi(m1,scale); m1l = vis_fmul8x16_lo(m1,scale);
	m2h = vis_fmul8x16_hi(m2,scale); m2l = vis_fmul8x16_lo(m2,scale);
	s2sh = vis_fpadd16(vis_fexpand_hi(s21),vis_fexpand_hi(s22));
	s2sl = vis_fpadd16(vis_fexpand_lo(s21),vis_fexpand_lo(s22));
	r1h = vis_fpadd16(s1sh,s2sh);
	r1l = vis_fpadd16(s1sl,s2sl);
	s1sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
					     vis_fexpand_hi(s12)));
	s1sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
					     vis_fexpand_lo(s12)));
	r2h = vis_fpadd16(s1sh,s2sh);
	r2l = vis_fpadd16(s1sl,s2sl);
	r1h = vis_fpsub16(vis_fpadd16(r1h,d1h),m1h);
	r1l = vis_fpsub16(vis_fpadd16(r1l,d1l),m1l);
	r2h = vis_fpsub16(vis_fpadd16(r2h,d2h),m2h);
	r2l = vis_fpsub16(vis_fpadd16(r2l,d2l),m2l);
	vis_st64_pack2(r1h,r1l, dst);
	vis_st64_pack2(r2h,r2l, dst+rb); dst += 2*rb;
      }
    }
    else {
      /*
       *  src is not 8-byte aligned
       *  [only src alignment handling differs from other cases]
       */
      vis_d64 s1a,s1b,s11,s12,s2a,s2b,s21,s22,s1sh,s1sl,s2sh,s2sl,d1,d2,m1,m2,
	      d1h,d1l,d2h,d2l,m1h,m1l,m2h,m2l,r1h,r1l,r2h,r2l;

      /* alignment toggles off -> off+1 -> off around each pair of extractions */
      s1a = vis_ld64(src_a);
      s1b = vis_ld64(src_a + 8); src_a += rb;
      s11 = vis_faligndata(s1a,s1b);
      vis_alignaddr_const(off+1);
      s12 = vis_faligndata(s1a,s1b);
      vis_alignaddr_const(off);
      s1sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
                                           vis_fexpand_hi(s12)));
      s1sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
                                           vis_fexpand_lo(s12)));

      for(i=4;i;i--) {
	s2a = vis_ld64(src_a);
	s2b = vis_ld64(src_a + 8); src_a += rb;
	s1a = vis_ld64(src_a);
	s1b = vis_ld64(src_a + 8); src_a += rb;
	s21 = vis_faligndata(s2a,s2b);
	s11 = vis_faligndata(s1a,s1b);
	vis_alignaddr_const(off+1);
	d1 = vis_ld64(dst);
	d2 = vis_ld64(dst+rb);
	m1 = vis_ld64(mot);
	m2 = vis_ld64(mot+rb); mot += 2*rb;
	s22 = vis_faligndata(s2a,s2b);
	s12 = vis_faligndata(s1a,s1b);
	vis_alignaddr_const(off);
	d1h = vis_fmul8x16_hi(d1,scale); d1l = vis_fmul8x16_lo(d1,scale);
	d2h = vis_fmul8x16_hi(d2,scale); d2l = vis_fmul8x16_lo(d2,scale);
	m1h = vis_fmul8x16_hi(m1,scale); m1l = vis_fmul8x16_lo(m1,scale);
	m2h = vis_fmul8x16_hi(m2,scale); m2l = vis_fmul8x16_lo(m2,scale);
	/* horizontal sum */
	s2sh = vis_fpadd16(vis_fexpand_hi(s21),vis_fexpand_hi(s22));
	s2sl = vis_fpadd16(vis_fexpand_lo(s21),vis_fexpand_lo(s22));
	/* vertical sum ('round' is already in s1sh/s1sl; divide happens at pack) */
	r1h = vis_fpadd16(s1sh,s2sh);
	r1l = vis_fpadd16(s1sl,s2sl);
	/* horizontal & vertical sum, next scanline */
	s1sh = vis_fpadd16(round,vis_fpadd16(vis_fexpand_hi(s11),
					     vis_fexpand_hi(s12)));
	s1sl = vis_fpadd16(round,vis_fpadd16(vis_fexpand_lo(s11),
					     vis_fexpand_lo(s12)));
	r2h = vis_fpadd16(s1sh,s2sh);
	r2l = vis_fpadd16(s1sl,s2sl);
	/* do plain add/sub */
	r1h = vis_fpsub16(vis_fpadd16(r1h,d1h),m1h);
	r1l = vis_fpsub16(vis_fpadd16(r1l,d1l),m1l);
	r2h = vis_fpsub16(vis_fpadd16(r2h,d2h),m2h);
	r2l = vis_fpsub16(vis_fpadd16(r2l,d2l),m2l);
	/* write as 8 bit values */
	vis_st64_pack2(r1h,r1l, dst);
	vis_st64_pack2(r2h,r2l, dst+rb); dst += 2*rb;
      }
    }
  }
}
#endif /* USE_VIS */


/*
 *  CompensateTPBP_MB
 *
 *  Runs the bandpass halfpel compensation for one luminance block
 *  (CompensateBPHalfPelMotion16, stride Yrowbytes) and for the two
 *  chrominance blocks (CompensateBPHalfPelMotion8, stride Crowbytes).
 *
 *    ydec/udec/vdec  blocks that are updated
 *    yref/uref/vref  reference planes; the fullpel part of the vector is
 *                    added to these pointers before the call
 *    ymot/umot/vmot  blocks handed through as the 'mot' argument
 *    fx, fy          luminance displacement in halfpel units: bit 0 is the
 *                    halfpel flag, the remaining bits the fullpel offset
 *
 *  The chrominance vector is the luminance vector shifted right by one and
 *  is then split into fullpel offset and halfpel flag the same way.  No
 *  rounding towards the halfpel position is applied (i.e. the variant
 *  "fx % 4 == 0 ? fx >> 1 : (fx>>1)|1" is deliberately not used).
 */
void CompensateTPBP_MB(Byte *ydec, Byte *udec, Byte *vdec, 
		       Byte *yref, Byte *uref, Byte *vref,
		       Byte *ymot, Byte *umot, Byte *vmot,
		       int fx,    int fy) {

  /* luminance: fullpel column/row and halfpel flags */
  int lum_col = fx >> 1;
  int lum_row = fy >> 1;
  int lum_xh  = fx & 1;
  int lum_yh  = fy & 1;

  /* chrominance vector = luminance vector halved by a plain shift */
  int chr_fx  = fx >> 1;
  int chr_fy  = fy >> 1;
  int chr_col = chr_fx >> 1;
  int chr_row = chr_fy >> 1;
  int chr_xh  = chr_fx & 1;
  int chr_yh  = chr_fy & 1;

  CompensateBPHalfPelMotion16(ydec,
			      yref + ((lum_row*Yrowbytes) + lum_col),
			      ymot, lum_xh, lum_yh, Yrowbytes);

  /* both chrominance planes use the same offset and halfpel flags */
  CompensateBPHalfPelMotion8(udec,
			     uref + ((chr_row*Crowbytes) + chr_col),
			     umot, chr_xh, chr_yh, Crowbytes);

  CompensateBPHalfPelMotion8(vdec,
			     vref + ((chr_row*Crowbytes) + chr_col),
			     vmot, chr_xh, chr_yh, Crowbytes);
}

