/*
 *  Interpolation.c
 *
 *  This modules contains procedures to upscale 4x4 and 8x8 blocks
 *  as well as whole (I-)frames. The filter uses bilinear interpolation
 *  with the weights [1/4,3/4,3/4,1/4].
 *
 *  External Interface:
 *
 *    InterpolateImage		up-scale a single image
 *    InterpolateFrame		up-scale complete YUV frame (I-frame)
 *    InterpolateBlockHV	up-scale a single block
 *
 *  Internal Procedures:
 *
 *    InterpolateHV_orig
 *    InterpolateHV_1pass
 *    InterpolateHV_2pass
 *
 *    InterBorderBlock
 *    InterInnerBlock_orig
 *    InterInnerBlock_4x4_2pass
 *    InterInnerBlock_8x8_2pass
 *    InterInnerBlock_4x4_1pass
 *    InterInnerBlock_8x8_1pass
 *    InterInnerBlock_4x4_2pass_vis
 *    InterInnerBlock_8x8_2pass_vis
 *    InterInnerBlock_4x4_1pass_vis
 *    InterInnerBlock_8x8_1pass_vis
 *
 *  The HV procedures have not yet been acclerated by VIS, since they are
 *  needed for I-frames only, which occur sparsely.
 *
 *  $Id: Interpolation.c,v 1.1.1.1 1997/12/16 13:14:40 jnweiger Exp $
 */
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <time.h>
#if defined(__sun) && defined(FILE)
# include <sys/time.h>		/* SunOS 4 has struct timeval there */
#endif
#include <sys/resource.h>

#include "global.h"

#include "Interpolation.h"
#include "vis.h"



/*
 *  Configuration options
 *  don't touch - unless for performance comparison
 */

#undef CLASSIC           /* disable original 2-pass procedures */
#define IFRAME_1PASS     /* use 1-pass implementation on complete images */

                         /* with VIS, 1-pass is slower on single blocks */
#undef BLOCK_1PASS_VIS   /* enable 2-pass procedures on blocks */
                         /* using pure Integer, 1-pass is faster on blocks */
#define BLOCK_1PASS      /* enable 1-pass procedures on blocks */


static int savemem1[256];

/* ------------------------------------------------------------------------ */
/*
 *  Precompute results of multiplication and addition of rounding constants.
 *  (multiplication is a multi-cycle instruction and hence slower than a load)
 *  Called once upon start of program.
 */

static unsigned short I32Tab[256];
#ifndef CLASSIC
static unsigned short I3Tab[256], I38Tab[3*256], I98Tab[256];
#endif

void InitInterpolator()
{
  int i;

  for (i=0; i<256; i++) {
    I32Tab[i] = i * 3 + 2;
#ifndef CLASSIC
    I3Tab[i] = i * 3;
    I98Tab[i] = i * 9 + 8;   /* +8 for rouding */
  }
  for (i=0; i<3*256; i++) {
    I38Tab[i] = i * 3 + 8;
#endif
  }
}

/* ------------------------------------------------------------------------ */
/*
 *  2-Pass interpolation on I-frame.
 *  Original implementation, barely optimized.
 */
#ifdef CLASSIC

static void InterpolateHV_orig (Byte *ii, Byte *oi, int iw, int ih)
{
  register Byte *inData, *outData;
  register Byte x1,x2,x3;
  register unsigned short t;

  int i,j;
  int ow = iw<<1;
  int ow2 = ow<<1;
  int ow4 = ow2<<1;
  int oh = ih<<1;

  /* Horizontal upsampling */

  inData = ii;
 
  for (j=0; j<ih; j++, inData+=iw) {
    x1 = inData[0];
    x2 = x1;
    x3 = inData[1];
    outData = oi+(j*ow2);
    for (i=0; i<(iw-1); i++, outData+=2) {
      t=I32Tab[x2];
      outData[0] = (Byte) ((t+x1)>>2);
      outData[1] = (Byte) ((t+x3)>>2);
      x1 = x2;
      x2 = x3;
      x3 = inData[i+2];
    }
    t=I32Tab[x2];
    /* Last pixel in row (x2) is replicated */
    outData[0] = (Byte) ((t+x1)>>2);
    outData[1] = (Byte) ((t+x2)>>2);
  }

  /* Vertical upsampling */

  for (i=0; i<ow; i++) {
    outData = oi + i;
    x1 = outData[0];
    x2 = x1;
    x3 = outData[ow2];
    for (j=0; j<oh-4; j+=2, outData+=(ow2)) {
      t=I32Tab[x2];
      outData[0] = (t+x1)>>2;
      outData[ow] = (t+x3)>>2;
      x1 = x2;
      x2 = x3;
      x3 = outData[ow4];
    }
    /* Replicate border pixels */
    t=I32Tab[x2];
    outData[0] = (t+x1)>>2;
    outData[ow] = (t+x3)>>2;
    t=I32Tab[x3];
    outData[ow2] = (t+x2)>>2;
    outData[ow2+ow] = (t+x3)>>2;
  }
}
#else /* not CLASSIC */

/* ------------------------------------------------------------------------ */
/*
 *  2-Pass interpolation on I-frame
 *
 *  (There is no VIS counterpart yet.)
 */
#ifndef IFRAME_1PASS

static void InterpolateHV_2pass (Byte *ii, Byte *oi, int iw, int ih)
{
  Byte *inData, *outByte;
  long *outData, out1,out2,out3,out4;
  Byte x1,x2,x3;
  unsigned short t;

  int i,j;
  int ow = iw<<1;
  int ow2 = ow<<1;
  int ow4 = ow2<<1;
  int oh = ih<<1;

  /* Horizontal upsampling */

  inData = ii;
 
  for (j=0; j<ih; j++) {
    x1 = inData[0];
    x2 = x1;
    x3 = inData[1];
    inData += 2;
    outData = (long *)((long)oi+(j*ow2));
    for (i=(iw>>1)-1; i; i--) {
      t = I32Tab[x2];
      out1 = ((t+x1) & 0x3fc) << 22;
      out2 = ((t+x3) & 0x3fc) << 14;
      t = I32Tab[x3];
      out3 = ((t+x2) & 0x3fc) << 6;
      x2 = *inData++;
      out4 = (t+x2) >> 2;
      x1 = x3;
      x3 = *inData++;
      *outData++ = out1|out2|out3|out4;
    }
    /* process rest in line */
    t=I32Tab[x2];
    out1 = ((t+x1) & 0x3fc) << 22;
    out2 = ((t+x3) & 0x3fc) << 14;
    t=I32Tab[x3];
    /* Last pixel in row (x2) is replicated */
    out3 = ((t+x2) & 0x3fc) << 6;
    out4 = (t+x3) >> 2;
    *outData++ = out1|out2|out3|out4;
  }

  /* Vertical upsampling */

  for (i=0; i<ow; i++) {
    outByte = oi + i;
    x1 = outByte[0];
    x2 = x1;
    x3 = outByte[ow2];
    for (j=0; j<oh-4; j+=2, outByte+=(ow2)) {
      t=I32Tab[x2];
      outByte[0] = (t+x1)>>2;
      outByte[ow] = (t+x3)>>2;
      x1 = x2;
      x2 = x3;
      x3 = outByte[ow4];
    }
    /* Replicate border pixels */
    t=I32Tab[x2];
    outByte[0] = (t+x1)>>2;
    outByte[ow] = (t+x3)>>2;
    t=I32Tab[x3];
    outByte[ow2] = (t+x2)>>2;
    outByte[ow2+ow] = (t+x3)>>2;
  }
}
#else /* IFRAME_1PASS */

/* ------------------------------------------------------------------------ */
/*
 *  1-Pass interpolation on I-frame
 *
 *  There is no VIS counterpart yet.
 */

static void InterpolateHV_1pass (Byte *ii, Byte *oi, int iw, int ih)
{
  Byte *outData, *lastData, *lii;
  int i,j;
  int ow = iw * 2;
  int ns1, ns2;
  /*
   *  variable naming: i for "input", m for "minus", p for "plus"
   *    w for width as in variable iw. Hence: ipwm1 = *(ii+iw-1).
   *    appended 3 means variable contains 3 times the value.
   */
  int imwm1, imw, imw_3, im1, iv, imwp1, ip1, ipwm1, ipw, ipw_3, ipwp1;
  int lim1, limwm1, lipwm1;
  int imwp2, ip2, ipwp2, out11,out12,out13,out14,out21,out22,out23,out24;

  outData = oi;
  lii = ii + (ih-1)*iw;
  lastData = oi + 2*(ih-1)*ow;

  /**
   **  Deal with first and last scanline
   **/
  im1 = *ii;
  ipwm1 = *(ii+iw);
  lim1 = *lii;
  limwm1 = *(lii-iw);
  for (i=iw-1; i; i--) {
    iv = *ii;
    ip1 = *(ii+1); 

    /* scanline above image is assumed to be identical to top line */
    imwm1 = im1;

    imw_3 = I3Tab[iv];
    imwp1 = ip1;

    ipw_3 = I3Tab[*(ii+iw)];
    ipwp1 = *(ii+iw+1);

    ns1 = I3Tab[im1] + I98Tab[iv];
    ns2 = I98Tab[iv] + I3Tab[ip1];

    *outData = (imwm1 + imw_3 + ns1)>>4;
    *(outData+1) = (imw_3 + imwp1 + ns2)>>4;
    *(outData+ow) = (ns1 + ipwm1 + ipw_3)>>4;
    *(outData+ow+1) = (ns2 + ipw_3 + ipwp1)>>4;
    outData += 2;
    ipwm1 = *(ii+iw);
    im1 = iv;
    ii++;

    /* scanline below image is assumed to be identical with last line */
    imw_3 = I3Tab[*(lii-iw)];
    imwp1 = *(lii-iw+1);

    iv = *lii;
    ip1 = *(lii+1);
    lipwm1 = lim1;
    ipw_3 = I3Tab[iv];
    ipwp1 = ip1;

    ns1 = I3Tab[lim1] + I98Tab[iv];
    ns2 = I98Tab[iv] + I3Tab[ip1];

    *lastData = (limwm1 + imw_3 + ns1)>>4;
    *(lastData+1) = (imw_3 + imwp1 + ns2)>>4;
    *(lastData+ow) = (ns1 + lipwm1 + ipw_3)>>4;
    *(lastData+ow+1) = (ns2 + ipw_3 + ipwp1)>>4;
    lastData += 2;
    limwm1 = *(lii-iw);
    lim1 = iv;
    lii++;
  }
  /* deal with last pixel of first & last scanline */
  iv = *ii;
  ip1 = iv;
  imwm1 = im1;
  imw_3 = I3Tab[iv];
  imwp1 = ip1;
  ipwm1 = *(ii+iw-1);
  ipw_3 = I3Tab[*(ii+iw)];
  ipwp1 = *(ii+iw);
  ns1 = I3Tab[im1] + I98Tab[iv];
  ns2 = I98Tab[iv] + I3Tab[ip1];
  *outData = (imwm1 + imw_3 + ns1)>>4;
  *(outData+1) = (imw_3 + imwp1 + ns2)>>4;
  *(outData+ow) = (ns1 + ipwm1 + ipw_3)>>4;
  *(outData+ow+1) = (ns2 + ipw_3 + ipwp1)>>4;
  outData += 2;
  ii++;

  imwm1 = *(lii-iw-1);
  imw_3 = I3Tab[*(lii-iw)];
  imwp1 = *(lii-iw);
  iv = *lii;
  ip1 = iv;
  ipwm1 = lim1;
  ipw_3 = I3Tab[iv];
  ipwp1 = iv;
  ns1 = I3Tab[lim1] + I98Tab[iv];
  ns2 = I98Tab[iv] + I3Tab[ip1];
  *lastData = (imwm1 + imw_3 + ns1)>>4;
  *(lastData+1) = (imw_3 + imwp1 + ns2)>>4;
  *(lastData+ow) = (ns1 + ipwm1 + ipw_3)>>4;
  *(lastData+ow+1) = (ns2 + ipw_3 + ipwp1)>>4;
  outData += ow;

  /**
   **  Blow each input pixel up to 4 interpolated pixels in output image
   **  unrolled one time
   **  - to spare a few register moves
   **  - to do 32-bit writes
   **/
  for (j=ih-2; j; j--) {
    iv = *ii;
    ipw = *(ii+iw);
    im1 = iv;          /* -1 omitted on leftmost pixel in line */
    imw = *(ii-iw);
    imwm1 = imw;
    ipwm1 = ipw;
    for (i=iw/2-1; i; i--) {
      imwp1 = *(ii-iw+1);
      ip1 = *(ii+1);
      ipwp1 = *(ii+iw+1);

      imw_3 = I3Tab[imw];
      ipw_3 = I3Tab[ipw];

      ns1 = I3Tab[im1] + I98Tab[iv];
      ns2 = I98Tab[iv] + I3Tab[ip1];

      out11 = imwm1 + imw_3 + ns1;
      out12 = imw_3 + imwp1 + ns2;
      out21 = ns1 + ipwm1 + ipw_3;
      out22 = ns2 + ipw_3 + ipwp1;

      /* unrolled */

      imwp2 = *(ii-iw+2);
      ip2 = *(ii+2);
      ipwp2 = *(ii+iw+2);

      imw_3 = I3Tab[imwp1];
      ipw_3 = I3Tab[ipwp1];

      ns1 = I3Tab[iv] + I98Tab[ip1];
      ns2 = I98Tab[ip1] + I3Tab[ip2];

      out13 = imw + imw_3 + ns1;
      out14 = imw_3 + imwp2 + ns2;
      out23 = ns1 + ipw + ipw_3;
      out24 = ns2 + ipw_3 + ipwp2;

      *((long *)outData) =
                 ((out11 & 0xff0)<<20) | ((out12 & 0xff0)<<12) |
                 ((out13 & 0xff0)<<4) | (out14>>4);
      *((long *)(outData + ow)) =
                 ((out21 & 0xff0)<<20) | ((out22 & 0xff0)<<12) |
                 ((out23 & 0xff0)<<4) | (out24>>4);
      outData += 4;
      ii += 2;
      imwm1 = imwp1;
      im1 = ip1;
      ipwm1 = ipwp1;
      imw = imwp2;
      iv = ip2;
      ipw = ipwp2;
    }
    /*
     *  Deal with last pixel(s) in scanline seperately
     *  - the 2nd last would need no special handling, but since
     *    we want to do 32-bit stroes (aka 2 pels at a time) above...
     *  - for interpolation pel right to image is same as rightmost
     *    (same as with all other borders)
     */
    imwp1 = *(ii-iw+1);
    ip1 = *(ii+1);
    ipwp1 = *(ii+iw+1);
    imw_3 = I3Tab[imw];
    ipw_3 = I3Tab[ipw];
    ns1 = I3Tab[im1] + I98Tab[iv];
    ns2 = I98Tab[iv] + I3Tab[ip1];
    out11 = imwm1 + imw_3 + ns1;
    out12 = imw_3 + imwp1 + ns2;
    out21 = ns1 + ipwm1 + ipw_3;
    out22 = ns2 + ipw_3 + ipwp1;

    /* now the very last one */
    imw_3 = I3Tab[imwp1];
    ipw_3 = I3Tab[ipwp1];
    ns1 = I3Tab[iv] + I98Tab[ip1];
    ns2 = I98Tab[ip1] + I3Tab[ip1];
    out13 = imw + imw_3 + ns1;
    out14 = imw_3 + imwp1 + ns2;
    out23 = ns1 + ipw + ipw_3;
    out24 = ns2 + ipw_3 + ipwp1;

    *((long *)outData) =
	       ((out11 & 0xff0)<<20) | ((out12 & 0xff0)<<12) |
	       ((out13 & 0xff0)<<4) | (out14>>4);
    *((long *)(outData + ow)) =
	       ((out21 & 0xff0)<<20) | ((out22 & 0xff0)<<12) |
	       ((out23 & 0xff0)<<4) | (out24>>4);
    ii += 2;
    outData += ow + 4;
  }
}
#endif /* IFRAME_1PASS */
#endif /* not CLASSIC */

/* ------------------------------------------------------------------------ */
/*
 *  Interpolate border block of any size
 *
 *  (there is no optimized or VIS counterpart for this procedure)
 */

static void InterBorderBlock (Byte *ii, Byte *oi, int sx, int sy, int bw, int 
bh, int iw, int ih) {
  Byte *inData,*out2;
  int *out1,*in2;
  int x1,x2,x3;
  int t;

  int i,j;
  int ow = iw<<1;
  int ow2 = ow<<1;

  /* Horizontal upsampling */

  out1 = savemem1;

  for (j=0; j<bh+2; j++, out1+=16) {
    if ((sy+j-1)<0) inData = ii;
    else if ((sy+j-1)>=ih) inData = ii + ((ih-1)*iw);
    else inData  = ii + ((sy+j-1)*iw);
    if (sx==0) {
      x1 = inData[sx];
      x2 = inData[sx];
      x3 = inData[sx+1];
    }
    else {
      x1 = inData[sx-1];
      x2 = inData[sx];
      x3 = inData[sx+1];
    }
    for (i=0; i<bw; i++) {
      t= 3 * x2;
      out1[(i<<1)  ] = x1 + t;
      out1[(i<<1)+1] = t + x3;
      x1 = x2;
      x2 = x3;
      if ((sx+2+i)>=iw) x3 = inData[iw-1];
      else x3 = inData[sx+2+i];
    }
  }

  /* Vertical upsampling */
  out2 = oi + (ow2*sy) + (2*sx);
  for (i=0; i<(2*bw); i++, out2++) {
    x1 = savemem1[i];
    x2 = savemem1[16+i];
    x3 = savemem1[32+i];
    in2 = savemem1+48+i;
    for (j=0; j<bh; j++, in2+=16) {
      t= 3 * x2;
      out2[(j*ow2)]    = (t+x1+8)>>4;
      out2[(j*ow2)+ow] = (t+x3+8)>>4;
      x1 = x2;
      x2 = x3;
      x3 = in2[0];
    }
  }
}

/* ------------------------------------------------------------------------ */
/*
 *  Interpolate inner block of any size.
 *
 *  Original 2-pass routine, barely optmized.
 */
#ifdef CLASSIC

static void InterInnerBlock_orig (Byte *ii, Byte *oi, int sx, int sy, int bw, int bh, int iw) {
  Byte *inData, *out2;
  int *in2, *out1;
  int x1,x2,x3;
  int t;

  int i,j;
  int ow = iw<<1;
  int ow2 = ow<<1;

  /* Horizontal upsampling */

  inData  = ii + ((sy-1)*iw) + sx - 1;
  out1 = savemem1;

  for (j=0; j<bh+2; j++, inData+=iw, out1+=16) {
    x1 = inData[0];
    x2 = inData[1];
    x3 = inData[2];
    for (i=0; i<bw; i++) {
      t = 3 * x2;
      out1[i*2] = t + x1;
      out1[i*2+1] = t + x3;
      x1 = x2;
      x2 = x3;
      x3 = inData[3+i];
    }
  }
  /* Vertical upsampling */
  out2 = oi + (ow2*sy) + (2*sx);

  for (i=0; i<(2*bw); i++, out2++) {
    x1 = savemem1[i];
    x2 = savemem1[16+i];
    x3 = savemem1[32+i];
    in2 = savemem1+48+i;
    for (j=0; j<bh; j++, in2+=16) {
      t= 3 * x2;
      out2[(j*ow2)]    = (t+x1+8)>>4;
      out2[(j*ow2)+ow] = (t+x3+8)>>4;
      x1 = x2;
      x2 = x3;
      x3 = *in2;
    }
  }
}

#else /* not CLASSIC */

/* ------------------------------------------------------------------------ */
/*
 *  Interpolate 4x4 block, 2-pass version
 */
#ifndef BLOCK_1PASS

static void InterInnerBlock_4x4_2pass(ii, oi, sx, sy, iw)
Byte *ii,*oi;
int sx,sy,iw;
{
  Byte *inData, *out2;
  int *in2, *out1;
  int i,j;
  int ow = iw<<1;
  int x0,x1,x2,x3,x4,x5, x13,x23,x33,x43, o0,o1,o2,o3,o4,o5,o6,o7;

  /*
   *  1st pass: horizontal upsampling
   *  - generate two pixels from one in each scanline
   *  - have to interpolate also one scanline above and below block
   *    repectively, for the following vertical pass
   *  - output is written in a small buffer
   */

  inData  = ii + ((sy-1)*iw) + sx - 1;
  out1 = savemem1;

  for (j=6; j; j--) {
    out1[0]=inData[0]+I3Tab[inData[1]]; out1[1]=I3Tab[inData[1]]+inData[2];
    out1[2]=inData[1]+I3Tab[inData[2]]; out1[3]=I3Tab[inData[2]]+inData[3];
    out1[4]=inData[2]+I3Tab[inData[3]]; out1[5]=I3Tab[inData[3]]+inData[4];
    out1[6]=inData[3]+I3Tab[inData[4]]; out1[7]=I3Tab[inData[4]]+inData[5];
    inData += iw;
    out1 += 8;
  }

  /*
   * 2nd pass: vertical upsampling
   * - interpolate between scanlines in buffer
   * - ugly code organization neccessary to overcome severe
   *   compiler-optimization failures (RAWs, load-use stalls)
   */

  out2 = oi + (ow*sy*2) + (2*sx);
  in2 = savemem1;

  for (i=8; i; i--) {
    x0=in2[0]; x1=in2[8]; x2=in2[16];
    x13 = 3*x1+8; x23 = 3*x2+8;
    o0 = (x0 + x13)>>4;
    o1 = (x13 + x2)>>4;
    o2 = (x1 + x23)>>4;
    *out2 = o0; out2 += ow;
    *out2 = o1; out2 += ow;
    *out2 = o2; out2 += ow;
    x3=in2[24]; x4=in2[32]; x5=in2[40];
    x33 = 3*x3+8; x43 = 3*x4+8;
    o3 = (x23 + x3)>>4;
    o4 = (x2 + x33)>>4;
    o5 = (x33 + x4)>>4;
    o6 = (x3 + x43)>>4;
    o7 = (x43 + x5)>>4;
    *out2 = o3; out2 += ow;
    *out2 = o4; out2 += ow;
    *out2 = o5; out2 += ow;
    *out2 = o6; out2 += ow;
    *out2 = o7;

    out2 = out2 - 7*ow + 1;
    in2++;
  }
}

/* ------------------------------------------------------------------------ */
/*
 *  Interpolate 8x8 block, 2-pass version
 */

static void InterInnerBlock_8x8_2pass(ii, oi, sx, sy, iw)
Byte *ii,*oi;
int sx,sy,iw;
{
  Byte *inData, *out2;
  int *in2, *out1;
  int i,j;
  int ow = iw<<1;
  int x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,
      x13,x23,x33,x43,x53,x63,x73,x83,
      o0,o1,o2,o3,o4,o5,o6,o7,o8,o9,o10,o11,o12,o13,o14,o15;

  /* Horizontal upsampling */

  inData  = ii + ((sy-1)*iw) + sx - 1;
  out1 = savemem1;

  for (j=10; j; j--) {
    out1[0]=inData[0]+I3Tab[inData[1]]; out1[1]=I3Tab[inData[1]]+inData[2];
    out1[2]=inData[1]+I3Tab[inData[2]]; out1[3]=I3Tab[inData[2]]+inData[3];
    out1[4]=inData[2]+I3Tab[inData[3]]; out1[5]=I3Tab[inData[3]]+inData[4];
    out1[6]=inData[3]+I3Tab[inData[4]]; out1[7]=I3Tab[inData[4]]+inData[5];
    out1[8]=inData[4]+I3Tab[inData[5]]; out1[9]=I3Tab[inData[5]]+inData[6];
    out1[10]=inData[5]+I3Tab[inData[6]]; out1[11]=I3Tab[inData[6]]+inData[7];
    out1[12]=inData[6]+I3Tab[inData[7]]; out1[13]=I3Tab[inData[7]]+inData[8];
    out1[14]=inData[7]+I3Tab[inData[8]]; out1[15]=I3Tab[inData[8]]+inData[9];
    inData += iw;
    out1 += 16;
  }

  /* Vertical upsampling */

  out2 = oi + (ow*sy*2) + (2*sx);
  in2 = savemem1;

  for (i=16; i; i--) {
    x0=in2[0]; x1=in2[16]; x2=in2[32];
    x13 = 3*x1+8; x23 = 3*x2+8;
    o0 = (x0 + x13)>>4;
    o1 = (x13 + x2)>>4;
    o2 = (x1 + x23)>>4;
    *out2 = o0; out2 += ow;
    *out2 = o1; out2 += ow;
    *out2 = o2; out2 += ow;
    x3=in2[48]; x4=in2[64]; x5=in2[80];
    x33 = 3*x3+8; x43 = 3*x4+8; x53 = 3*x5+8;
    o3 = (x23 + x3)>>4;
    o4 = (x2 + x33)>>4;
    o5 = (x33 + x4)>>4;
    o6 = (x3 + x43)>>4;
    o7 = (x43 + x5)>>4;
    o8 = (x4 + x53)>>4;
    *out2 = o3; out2 += ow;
    *out2 = o4; out2 += ow;
    *out2 = o5; out2 += ow;
    *out2 = o6; out2 += ow;
    *out2 = o7; out2 += ow;
    *out2 = o8; out2 += ow;
    x6=in2[96]; x7=in2[112]; x8=in2[128];
    x63 = 3*x6+8; x73 = 3*x7+8; x83 = 3*x8+8;
    o9 = (x53 + x6)>>4;
    o10 = (x5 + x63)>>4;
    o11 = (x63 + x7)>>4;
    o12 = (x6 + x73)>>4;
    o13 = (x73 + x8)>>4;
    *out2 = o9; out2 += ow;
    *out2 = o10; out2 += ow;
    *out2 = o11; out2 += ow;
    *out2 = o12; out2 += ow;
    *out2 = o13; out2 += ow;
    x9=in2[144];
    o14 = (x7 + x83)>>4;
    o15 = (x83 + x9)>>4;
    *out2 = o14; out2 += ow;
    *out2 = o15;
    out2 = out2 - 15*ow + 1;
    in2++;
  }
}
#else /* BLOCK_1PASS */

/* ------------------------------------------------------------------------ */
/*
 *  Interpolate 4x4 block, 1-pass version
 *
 *  Separating 4x4 and 8x8 interpolation allows for better register
 *  allocation and results in significant speed-up.
 */

static void InterInnerBlock_4x4_1pass (ii, oi, sx, sy, iw)
Byte *ii,*oi;
int sx,sy, iw;
{
  Byte *inData, *outData, *iBase, *oBase;
  int imwm1, imw, imw_3, im1, iv, imwp1, ip1, ipwm1, ipw, ipw_3, ipwp1;
  int imwp2, ip2, ipwp2, ns1,ns2;
  int ow = iw<<1;
  int j;
  long out1,out2;

  oBase = oi + (ow*2*sy) + (2*sx);
  iBase  = ii + (sy*iw) + sx;

  for (j=4; j; j--) {
    outData = oBase; oBase += ow*2;
    inData = iBase;  iBase += iw;
    iv = *inData;
    ipw = *(inData+iw);
    im1 = *(inData-1);
    imw = *(inData-iw);
    imwm1 = *(inData-iw-1);
    ipwm1 = *(inData+iw-1);

    /* first pixel */
    imwp1 = *(inData-iw+1);
    ip1 = *(inData+1);
    ipwp1 = *(inData+iw+1);

    imw_3 = I3Tab[imw];
    ipw_3 = I3Tab[ipw];

    ns1 = I3Tab[im1] + I98Tab[iv];
    ns2 = I98Tab[iv] + I3Tab[ip1];

    out1 =  ((imwm1 + imw_3 + ns1) & 0xff0)<<20;
    out1 |= ((imw_3 + imwp1 + ns2) & 0xff0)<<12;
    out2 =  ((ns1 + ipwm1 + ipw_3) & 0xff0)<<20;
    out2 |= ((ns2 + ipw_3 + ipwp1) & 0xff0)<<12;

    /* second pixel */
    imwp2 = *(inData-iw+2);
    ip2 = *(inData+2);
    ipwp2 = *(inData+iw+2);

    imw_3 = I3Tab[imwp1];
    ipw_3 = I3Tab[ipwp1];

    ns1 = I3Tab[iv] + I98Tab[ip1];
    ns2 = I98Tab[ip1] + I3Tab[ip2];

    out1 |= ((imw + imw_3 + ns1) & 0xff0)<<4;
    out1 |= (imw_3 + imwp2 + ns2)>>4;
    out2 |= ((ns1 + ipw + ipw_3) & 0xff0)<<4;
    out2 |= (ns2 + ipw_3 + ipwp2)>>4;

    *((long *)outData) = out1;
    *((long *)(outData + ow)) = out2;

    /* third pixel */
    imwm1 = imwp1;
    im1 = ip1;
    ipwm1 = ipwp1;
    imw = imwp2;
    iv = ip2;
    ipw = ipwp2;
    
    imwp1 = *(inData-iw+3);
    ip1 = *(inData+3);
    ipwp1 = *(inData+iw+3);

    imw_3 = I3Tab[imw];
    ipw_3 = I3Tab[ipw];

    ns1 = I3Tab[im1] + I98Tab[iv];
    ns2 = I98Tab[iv] + I3Tab[ip1];

    out1 =  ((imwm1 + imw_3 + ns1) & 0xff0)<<20;
    out1 |= ((imw_3 + imwp1 + ns2) & 0xff0)<<12;
    out2 =  ((ns1 + ipwm1 + ipw_3) & 0xff0)<<20;
    out2 |= ((ns2 + ipw_3 + ipwp1) & 0xff0)<<12;

    /* fourth pixel */
    imwp2 = *(inData-iw+4);
    ip2 = *(inData+4);
    ipwp2 = *(inData+iw+4);

    imw_3 = I3Tab[imwp1];
    ipw_3 = I3Tab[ipwp1];

    ns1 = I3Tab[iv] + I98Tab[ip1];
    ns2 = I98Tab[ip1] + I3Tab[ip2];

    out1 |= ((imw + imw_3 + ns1) & 0xff0)<<4;
    out1 |= (imw_3 + imwp2 + ns2)>>4;
    out2 |= ((ns1 + ipw + ipw_3) & 0xff0)<<4;
    out2 |= (ns2 + ipw_3 + ipwp2)>>4;

    *((long *)(outData + 4)) = out1;
    *((long *)(outData + 4 + ow)) = out2;

    outData += 8;
    inData += 4;
  }
}
/* ------------------------------------------------------------------------ */
/*
 *  Interpolate 8x8 block, 1-pass version
 */

static void InterInnerBlock_8x8_1pass (ii, oi, sx, sy, iw)
Byte *ii,*oi;
int sx,sy,iw;
{
  Byte *inData, *outData, *iBase, *oBase;
  int imwm1, imw, imw_3, im1, iv, imwp1, ip1, ipwm1, ipw, ipw_3, ipwp1;
  int imwp2, ip2, ipwp2, ns1,ns2;
  int ow = iw<<1;
  int i,j;
  long out1, out2;

  oBase = oi + (ow*2*sy) + (2*sx);
  iBase  = ii + (sy*iw) + sx;

  for (j=8; j; j--) {
    outData = oBase; oBase += ow*2;
    inData = iBase;  iBase += iw;
    iv = *inData;
    ipw = *(inData+iw);
    im1 = *(inData-1);
    imw = *(inData-iw);
    imwm1 = *(inData-iw-1);
    ipwm1 = *(inData+iw-1);
    for (i=4; i; i--) {
      imwp1 = *(inData-iw+1);
      ip1 = *(inData+1);
      ipwp1 = *(inData+iw+1);

      imw_3 = I3Tab[imw];
      ipw_3 = I3Tab[ipw];

      ns1 = I3Tab[im1] + I98Tab[iv];
      ns2 = I98Tab[iv] + I3Tab[ip1];

      out1 =  ((imwm1 + imw_3 + ns1) & 0xff0)<<20;
      out1 |= ((imw_3 + imwp1 + ns2) & 0xff0)<<12;
      out2 =  ((ns1 + ipwm1 + ipw_3) & 0xff0)<<20;
      out2 |= ((ns2 + ipw_3 + ipwp1) & 0xff0)<<12;

      imwp2 = *(inData-iw+2);
      ip2 = *(inData+2);
      ipwp2 = *(inData+iw+2);

      imw_3 = I3Tab[imwp1];
      ipw_3 = I3Tab[ipwp1];

      ns1 = I3Tab[iv] + I98Tab[ip1];
      ns2 = I98Tab[ip1] + I3Tab[ip2];

      out1 |= ((imw + imw_3 + ns1) & 0xff0)<<4;
      out1 |= (imw_3 + imwp2 + ns2)>>4;
      out2 |= ((ns1 + ipw + ipw_3) & 0xff0)<<4;
      out2 |= (ns2 + ipw_3 + ipwp2)>>4;

      *((long *)outData) = out1;
      *((long *)(outData + ow)) = out2;

      outData += 4;
      inData += 2;
      imwm1 = imwp1;
      im1 = ip1;
      ipwm1 = ipwp1;
      imw = imwp2;
      iv = ip2;
      ipw = ipwp2;
    }
  }
}
#endif /* BLOCK_1PASS */

#ifndef WITHOUT_VIS

/* ------------------------------------------------------------------------ */
/*   
 *   Interpolate 4x4 block to 8x8 using Sparc-V9 VIS extensions
 *
 *   2-pass version (identical in speed to 1-pass version)
 */
#ifndef BLOCK_1PASS_VIS

static void InterInnerBlock_4x4_2pass_vis (ii, oi, sx, sy, iw)
Byte *ii,*oi;
int sx,sy, iw;
{
  Byte *inData, *out1;
  vis_d64 *out2;
  vis_d64 fac_1313 = vis_to_double_dup(0x01000300),
          fac_3131 = vis_to_double_dup(0x03000100),
          round    = vis_to_double_quad(8);
  vis_d64 lc_m1,lc,lc_p1,lcc,lccl,lccr,l1a,l1b,l1c,l1d,lcab3h,lcab3l,
	  l0h,l1h,l2h,l3h,l4h,l5h, l0l,l1l,l2l,l3l,l4l,l5l,
	  l1_3h,l2_3h,l3_3h,l4_3h, l1_3l,l2_3l,l3_3l,l4_3l;
  int j,misaligned;

  /* Horizontal upsampling */

  vis_write_gsr(((7-4)<<3) | 7);    /* divide by 16, shift by 7 */
  out1 = (Byte*)savemem1;
  /* in is only 4-byte aligned, depending on x-offset */
  inData  = (Byte*)(((long)ii + ((sy-1)*iw) + sx) & ~0x7);
  misaligned = sx & 4;

  if(misaligned) {
    for (j=6; j; j--) {
      lc  = vis_ld64(inData);
      lc_p1 = vis_ld64(inData + 8); inData += iw;

      lcc = vis_fpmerge_lo(lc,lc);
      lccl = vis_faligndata_hi1(lc,lcc);
      vis_alignaddr_const(1);
      lccr = vis_faligndata(lcc,lc_p1);
      vis_alignaddr_const(7);

      l1a = vis_fmul8x16_hi(lccl,fac_1313);
      l1b = vis_fmul8x16_hi(lccr,fac_3131);
      l1c = vis_fmul8x16_lo(lccl,fac_1313);
      l1d = vis_fmul8x16_lo(lccr,fac_3131);
      lcab3h = vis_fpadd16(l1a, l1b);
      lcab3l = vis_fpadd16(l1c, l1d);
      vis_st64(lcab3h, out1);
      vis_st64(lcab3l, out1+8); out1 += 16;
    }
  }
  else {
    for (j=6; j; j--) {
      lc_m1 = vis_ld64(inData - 8);
      lc  = vis_ld64(inData); inData += iw;

      lcc = vis_fpmerge_hi(lc,lc);
      lccl = vis_faligndata(lc_m1,lcc);
      vis_alignaddr_const(1);
      lccr = vis_faligndata_lo2(lcc,lc);
      vis_alignaddr_const(7);

      l1a = vis_fmul8x16_hi(lccl,fac_1313);
      l1b = vis_fmul8x16_hi(lccr,fac_3131);
      l1c = vis_fmul8x16_lo(lccl,fac_1313);
      l1d = vis_fmul8x16_lo(lccr,fac_3131);
      lcab3h = vis_fpadd16(l1a, l1b);
      lcab3l = vis_fpadd16(l1c, l1d);
      vis_st64(lcab3h, out1);
      vis_st64(lcab3l, out1+8); out1 += 16;
    }
  }

  /* Vertical upsampling */

  inData = (Byte*)savemem1;
  out2 = (vis_d64*) (oi + (4*iw*sy) + (2*sx));

  l0h = vis_ld64(inData);
  l0l = vis_ld64(inData+8);
  l1h = vis_ld64(inData+16);
  l1l = vis_ld64(inData+16+8);
  l2h = vis_ld64(inData+32);
  l2l = vis_ld64(inData+32+8);
  l3h = vis_ld64(inData+48);
  l3l = vis_ld64(inData+48+8);
  l4h = vis_ld64(inData+64);
  l4l = vis_ld64(inData+64+8);
  l5h = vis_ld64(inData+80);
  l5l = vis_ld64(inData+80+8);

  l1_3h = vis_fpadd16(vis_fpadd16(round,l1h), vis_fpadd16(l1h,l1h));
  l1_3l = vis_fpadd16(vis_fpadd16(round,l1l), vis_fpadd16(l1l,l1l));
  l2_3h = vis_fpadd16(vis_fpadd16(round,l2h), vis_fpadd16(l2h,l2h));
  l2_3l = vis_fpadd16(vis_fpadd16(round,l2l), vis_fpadd16(l2l,l2l));
  l3_3h = vis_fpadd16(vis_fpadd16(round,l3h), vis_fpadd16(l3h,l3h));
  l3_3l = vis_fpadd16(vis_fpadd16(round,l3l), vis_fpadd16(l3l,l3l));
  l4_3h = vis_fpadd16(vis_fpadd16(round,l4h), vis_fpadd16(l4h,l4h));
  l4_3l = vis_fpadd16(vis_fpadd16(round,l4l), vis_fpadd16(l4l,l4l));

  *out2 = vis_fpack16_2(vis_fpadd16(l0h,l1_3h),vis_fpadd16(l0l,l1_3l)); out2 += iw/4;
  *out2 = vis_fpack16_2(vis_fpadd16(l1_3h,l2h),vis_fpadd16(l1_3l,l2l)); out2 += iw/4;
  *out2 = vis_fpack16_2(vis_fpadd16(l1h,l2_3h),vis_fpadd16(l1l,l2_3l)); out2 += iw/4;
  *out2 = vis_fpack16_2(vis_fpadd16(l2_3h,l3h),vis_fpadd16(l2_3l,l3l)); out2 += iw/4;
  *out2 = vis_fpack16_2(vis_fpadd16(l2h,l3_3h),vis_fpadd16(l2l,l3_3l)); out2 += iw/4;
  *out2 = vis_fpack16_2(vis_fpadd16(l3_3h,l4h),vis_fpadd16(l3_3l,l4l)); out2 += iw/4;
  *out2 = vis_fpack16_2(vis_fpadd16(l3h,l4_3h),vis_fpadd16(l3l,l4_3l)); out2 += iw/4;
  *out2 = vis_fpack16_2(vis_fpadd16(l4_3h,l5h),vis_fpadd16(l4_3l,l5l));
}
/* ------------------------------------------------------------------------ */
/*   
 *   Interpolate 8x8 block to 16x16 using Sparc-V9 VIS extensions
 *
 *   2-pass version (slightly faster than 1-pass version)
 */

static void InterInnerBlock_8x8_2pass_vis (ii, oi, sx, sy, iw)
Byte *ii,*oi;
int sx,sy, iw;
{
  Byte *inData, *out1;
  vis_d64 *out2;
  vis_d64 fac_1313 = vis_to_double_dup(0x01000300),
          fac_3131 = vis_to_double_dup(0x03000100),
          round    = vis_to_double_quad(8);
  vis_d64 lc_m1,lc,lc_p1,lcch,lccl,lcclh,lccll,lccrh,lccrl,l1a,l1b,l1c,l1d,
          lcab3hh,lcab3hl,lcab3lh,lcab3ll,
	  l0h,l1h,l2h,l3h,l4h,l5h,l6h,l7h,l8h,l9h,
	  l0l,l1l,l2l,l3l,l4l,l5l,l6l,l7l,l8l,l9l,
	  l1_3h,l2_3h,l3_3h,l4_3h,l5_3h,l6_3h,l7_3h,l8_3h,
	  l1_3l,l2_3l,l3_3l,l4_3l,l5_3l,l6_3l,l7_3l,l8_3l;
  int i,j;

  /* Horizontal upsampling */

  vis_write_gsr(((7-4)<<3) | 7);    /* divide by 16, shift by 7 */
  inData  = ii + ((sy-1)*iw) + sx;
  out1 = (Byte*)savemem1;

  for (j=10; j; j--) {
    lc_m1 = vis_ld64(inData - 8);
    lc  = vis_ld64(inData);
    lc_p1 = vis_ld64(inData + 8); inData += iw;

    lcch = vis_fpmerge_hi(lc,lc);
    lccl = vis_fpmerge_lo(lc,lc);
    lcclh = vis_faligndata(lc_m1,lcch);
    lccll = vis_faligndata(lcch,lccl);
    vis_alignaddr_const(1);
    lccrh = vis_faligndata(lcch,lccl);
    lccrl = vis_faligndata(lccl,lc_p1);
    vis_alignaddr_const(7);

    l1a = vis_fmul8x16_hi(lcclh,fac_1313);
    l1b = vis_fmul8x16_hi(lccrh,fac_3131);
    l1c = vis_fmul8x16_lo(lcclh,fac_1313);
    l1d = vis_fmul8x16_lo(lccrh,fac_3131);
    lcab3hh = vis_fpadd16(l1a, l1b);
    lcab3hl = vis_fpadd16(l1c, l1d);
    l1a = vis_fmul8x16_hi(lccll,fac_1313);
    l1b = vis_fmul8x16_hi(lccrl,fac_3131);
    l1c = vis_fmul8x16_lo(lccll,fac_1313);
    l1d = vis_fmul8x16_lo(lccrl,fac_3131);
    lcab3lh = vis_fpadd16(l1a, l1b);
    lcab3ll = vis_fpadd16(l1c, l1d);
    vis_st64(lcab3hh, out1);
    vis_st64(lcab3hl, out1+8);
    vis_st64(lcab3lh, out1+16);
    vis_st64(lcab3ll, out1+24); out1 += 32;
  }

  /* Vertical upsampling */

  inData = (Byte*)savemem1;
  out2 = (vis_d64*) (oi + (4*iw*sy) + (2*sx));

  for (i=2; i; i--) {
    l0h = vis_ld64(inData);
    l0l = vis_ld64(inData+8);
    l1h = vis_ld64(inData+32);
    l1l = vis_ld64(inData+32+8);
    l2h = vis_ld64(inData+64);
    l2l = vis_ld64(inData+64+8);
    l3h = vis_ld64(inData+96);
    l3l = vis_ld64(inData+96+8);
    l4h = vis_ld64(inData+128);
    l4l = vis_ld64(inData+128+8);
    l5h = vis_ld64(inData+160);
    l5l = vis_ld64(inData+160+8);

    l1_3h = vis_fpadd16(vis_fpadd16(round,l1h), vis_fpadd16(l1h,l1h));
    l1_3l = vis_fpadd16(vis_fpadd16(round,l1l), vis_fpadd16(l1l,l1l));
    l2_3h = vis_fpadd16(vis_fpadd16(round,l2h), vis_fpadd16(l2h,l2h));
    l2_3l = vis_fpadd16(vis_fpadd16(round,l2l), vis_fpadd16(l2l,l2l));
    l3_3h = vis_fpadd16(vis_fpadd16(round,l3h), vis_fpadd16(l3h,l3h));
    l3_3l = vis_fpadd16(vis_fpadd16(round,l3l), vis_fpadd16(l3l,l3l));
    l4_3h = vis_fpadd16(vis_fpadd16(round,l4h), vis_fpadd16(l4h,l4h));
    l4_3l = vis_fpadd16(vis_fpadd16(round,l4l), vis_fpadd16(l4l,l4l));

    *out2 = vis_fpack16_2(vis_fpadd16(l0h,l1_3h),vis_fpadd16(l0l,l1_3l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l1_3h,l2h),vis_fpadd16(l1_3l,l2l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l1h,l2_3h),vis_fpadd16(l1l,l2_3l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l2_3h,l3h),vis_fpadd16(l2_3l,l3l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l2h,l3_3h),vis_fpadd16(l2l,l3_3l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l3_3h,l4h),vis_fpadd16(l3_3l,l4l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l3h,l4_3h),vis_fpadd16(l3l,l4_3l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l4_3h,l5h),vis_fpadd16(l4_3l,l5l)); out2 += iw/4;

    l6h = vis_ld64(inData+192);
    l6l = vis_ld64(inData+192+8);
    l7h = vis_ld64(inData+224);
    l7l = vis_ld64(inData+224+8);
    l8h = vis_ld64(inData+256);
    l8l = vis_ld64(inData+256+8);
    l9h = vis_ld64(inData+288);
    l9l = vis_ld64(inData+288+8); inData += 16;

    l5_3h = vis_fpadd16(vis_fpadd16(round,l5h), vis_fpadd16(l5h,l5h));
    l5_3l = vis_fpadd16(vis_fpadd16(round,l5l), vis_fpadd16(l5l,l5l));
    l6_3h = vis_fpadd16(vis_fpadd16(round,l6h), vis_fpadd16(l6h,l6h));
    l6_3l = vis_fpadd16(vis_fpadd16(round,l6l), vis_fpadd16(l6l,l6l));
    l7_3h = vis_fpadd16(vis_fpadd16(round,l7h), vis_fpadd16(l7h,l7h));
    l7_3l = vis_fpadd16(vis_fpadd16(round,l7l), vis_fpadd16(l7l,l7l));
    l8_3h = vis_fpadd16(vis_fpadd16(round,l8h), vis_fpadd16(l8h,l8h));
    l8_3l = vis_fpadd16(vis_fpadd16(round,l8l), vis_fpadd16(l8l,l8l));

    *out2 = vis_fpack16_2(vis_fpadd16(l4h,l5_3h),vis_fpadd16(l4l,l5_3l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l5_3h,l6h),vis_fpadd16(l5_3l,l6l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l5h,l6_3h),vis_fpadd16(l5l,l6_3l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l6_3h,l7h),vis_fpadd16(l6_3l,l7l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l6h,l7_3h),vis_fpadd16(l6l,l7_3l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l7_3h,l8h),vis_fpadd16(l7_3l,l8l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l7h,l8_3h),vis_fpadd16(l7l,l8_3l)); out2 += iw/4;
    *out2 = vis_fpack16_2(vis_fpadd16(l8_3h,l9h),vis_fpadd16(l8_3l,l9l)); out2 += iw/4;
    out2 = (vis_d64*)((long)out2 - iw*16*2 + 8);
  }
}
#else /* BLOCK_1PASS_VIS */

/* ------------------------------------------------------------------------ */
/*
 *   Interpolate a 4x4 block to 8x8 using Sparc-V9 VIS extensions
 *
 *   1-pass version (identical in speed to 2-pass version)
 */

static void InterInnerBlock_4x4_1pass_vis (ii, oi, sx, sy, iw)
Byte *ii,*oi;
int sx,sy, iw;
{
  Byte *inData, *outData;
  int ns1,ns2;
  int ow = iw<<1;
  int i,j;
  vis_d64 lp,lc,ln,l1,l2,l3,lpp,lcc,lnn,lppl,lccl,lnnl,lppr,lccr,lnnr,
          l1a,l1b,l1c,l1d,l11,l12,l21,l22,
          lpab3h,lpab3l,lcab3h,lcab3l,lnab3h,lnab3l;
  vis_d64 fac_1313 = vis_to_double_dup(0x01000300),
          fac_3131 = vis_to_double_dup(0x03000100),
          fac_3939 = vis_to_double_dup(0x03000900),
          fac_9393 = vis_to_double_dup(0x09000300),
          round    = vis_to_double_quad(8);
  int const_1 = 1,  /* due to compiler optimization failure */
      const_7 = 7;
  int off;

  vis_write_gsr((7-4)<<3);    /* divide by 16 */
  vis_alignaddr_const(const_7);

  /* out is always 8-byte aligned */
  outData = (Byte *)(((long)oi + (ow*2*sy) + (2*sx)) & ~0x7);
  /* in is only 4-byte aligned, depending on x-offset */
  inData  = (Byte *)(((long)ii + (sy*iw) + sx) & ~0x7);
  off = sx & 4;

  /*  Wether 4 source byte are in high or low part of 64-bit load */
  if(off) {
    /* precompute values pulled out of loop */
    lp = vis_ld64((void *)(inData - iw));
    lc = vis_ld64((void *)inData);
    l1 = vis_ld64(inData - iw + 8);
    l2 = vis_ld64(inData + 8);
    inData += iw;

    lpp = vis_fpmerge_lo(lp,lp);
    lcc = vis_fpmerge_lo(lc,lc);
    lppl = vis_faligndata_hi1(lp,lpp);
    lccl = vis_faligndata_hi1(lc,lcc);
    vis_alignaddr_const(const_1);
    lppr = vis_faligndata(lpp,l1);
    lccr = vis_faligndata(lcc,l2);
    vis_alignaddr_const(const_7);

    l1a = vis_fmul8x16_hi(lppl,fac_1313);
    l1b = vis_fmul8x16_hi(lppr,fac_3131);
    l1c = vis_fmul8x16_lo(lppl,fac_1313);
    l1d = vis_fmul8x16_lo(lppr,fac_3131);
    lpab3h = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
    lpab3l = vis_fpadd16(round, vis_fpadd16(l1c, l1d));
    l1a = vis_fmul8x16_hi(lccl,fac_1313);
    l1b = vis_fmul8x16_hi(lccr,fac_3131);
    l1c = vis_fmul8x16_lo(lccl,fac_1313);
    l1d = vis_fmul8x16_lo(lccr,fac_3131);
    lcab3h = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
    lcab3l = vis_fpadd16(round, vis_fpadd16(l1c, l1d));

    for (j=4; j; j--) {
      /* one line above plus the first two */
      ln = vis_ld64((void *)inData);

      /* one byte to the right of source block */
      l3 = vis_ld64(inData + 8);
      inData += iw;

      /* duplicate each pixel */
      lnn = vis_fpmerge_lo(ln,ln);

      /* extend to 16 bit and multiply as needed for weights */
      l1a = vis_fmul8x16_hi(lccl,fac_3939);
      l1c = vis_fmul8x16_lo(lccl,fac_3939);

      /* load in register of one line higher to avoid move later */
      /* shift each of the above one right and one left (gaining two results) */
      lccl = vis_faligndata_hi1(ln,lnn);
      vis_alignaddr_const(const_1);

      l1b = vis_fmul8x16_hi(lccr,fac_9393);
      l1d = vis_fmul8x16_lo(lccr,fac_9393);
      lccr = vis_faligndata(lnn,l3);
      vis_alignaddr_const(const_7);

      /* add up and round */
      l11 = vis_fpadd16(lpab3h, vis_fpadd16(l1a, l1b));
      l12 = vis_fpadd16(lpab3l, vis_fpadd16(l1c, l1d));
      lpab3h = lcab3h; lpab3l = lcab3l;

      lcab3h = vis_fpadd16(round,vis_fpadd16(vis_fmul8x16_hi(lccl,fac_1313),
					     vis_fmul8x16_hi(lccr,fac_3131)));
      /* C and D factors are equal to the last line's */
      l21 = vis_fpadd16(lcab3h,vis_fpadd16(l1a, l1b));

      /* same for next 4 pixels */
      vis_st64_pack2(l11, l12, outData);

      lcab3l = vis_fpadd16(round, vis_fpadd16(vis_fmul8x16_lo(lccl,fac_1313),
					      vis_fmul8x16_lo(lccr,fac_3131)));
      l22 = vis_fpadd16(lcab3l, vis_fpadd16(l1c, l1d));
      vis_st64_pack2(l21, l22, outData+ow);

      outData += ow*2;
    }
  }
  else {
    lp = vis_ld64((void *)(inData - iw));
    lc  = vis_ld64((void *)inData);
    l1 = vis_ld64((void *)(inData - iw - 8));
    l2 = vis_ld64((void *)(inData - 8));
    inData += iw;

    lpp = vis_fpmerge_hi(lp,lp);
    lcc = vis_fpmerge_hi(lc,lc);
    vis_alignaddr_const(const_7);
    lppl = vis_faligndata(l1,lpp);
    lccl = vis_faligndata(l2,lcc);
    vis_alignaddr_const(const_1);
    lppr = vis_faligndata_lo2(lpp,lp);
    lccr = vis_faligndata_lo2(lcc,lc);
    vis_alignaddr_const(const_7);

    l1a = vis_fmul8x16_hi(lppl,fac_1313);
    l1b = vis_fmul8x16_hi(lppr,fac_3131);
    lpab3h = vis_fpadd16(round, vis_fpadd16(l1a,l1b));
    l1a = vis_fmul8x16_lo(lppl,fac_1313);
    l1b = vis_fmul8x16_lo(lppr,fac_3131);
    lpab3l = vis_fpadd16(round, vis_fpadd16(l1a,l1b));

    l1a = vis_fmul8x16_hi(lccl,fac_1313);
    l1b = vis_fmul8x16_hi(lccr,fac_3131);
    lcab3h = vis_fpadd16(round, vis_fpadd16(l1a,l1b));
    l1a = vis_fmul8x16_lo(lccl,fac_1313);
    l1b = vis_fmul8x16_lo(lccr,fac_3131);
    lcab3l = vis_fpadd16(round, vis_fpadd16(l1a,l1b));

    for (j=4; j; j--) {
      /* one line above plus the first two */
      ln = vis_ld64((void *)inData);

      /* one byte to the left of source block */
      l3 = vis_ld64((void *)(inData - 8));

      /* duplicate each pixel */
      lnn = vis_fpmerge_hi(ln,ln);

      /* shift each of the above one right and one left (gaining two results) */
      lnnl = vis_faligndata(l3,lnn);
      vis_alignaddr_const(const_1);
      lnnr = vis_faligndata_lo2(lnn,ln);
      vis_alignaddr_const(const_7);

      /* extend to 16 bit and multiply as needed for weights */
      l1c = vis_fmul8x16_hi(lccl,fac_3939);
      l1d = vis_fmul8x16_hi(lccr,fac_9393);

      /* add up and round */
      l11 = vis_fpadd16(lpab3h, vis_fpadd16(l1c, l1d));

      l1a = vis_fmul8x16_hi(lnnl,fac_1313);
      l1b = vis_fmul8x16_hi(lnnr,fac_3131);
      lnab3h = vis_fpadd16(round, vis_fpadd16(l1a,l1b));
      /* C and D factors are equal to the last line's */
      l21 = vis_fpadd16(lnab3h, vis_fpadd16(l1c, l1d));

      /* same for next 4 pixels */
      l1c = vis_fmul8x16_lo(lccl,fac_3939);
      l1d = vis_fmul8x16_lo(lccr,fac_9393);
      l12 = vis_fpadd16(lpab3l, vis_fpadd16(l1c, l1d));
      /* pack and store away */
      vis_st64_pack2(l11, l12, outData);

      l1a = vis_fmul8x16_lo(lnnl,fac_1313);
      l1b = vis_fmul8x16_lo(lnnr,fac_3131);
      lnab3l = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
      l22 = vis_fpadd16(lnab3l, vis_fpadd16(l1c, l1d));
      vis_st64_pack2(l21, l22, outData+ow);

      lccl = lnnl; lccr = lnnr;
      lpab3h = lcab3h; lpab3l = lcab3l;
      lcab3h = lnab3h; lcab3l = lnab3l;
      outData += ow*2;
      inData += iw;
    }
  }
}

/* ------------------------------------------------------------------------ */
/*   
 *   Interpolate a 8x8 block to 16x16 using Sparc-V9 VIS extensions
 *
 *   1-pass version (slightly slower than 2-pass version)
 */

static void InterInnerBlock_8x8_1pass_vis (ii, oi, sx, sy, iw)
Byte *ii,*oi;
int sx,sy, iw;
{
  Byte *inData, *outData;
  int ns1,ns2;
  int ow = iw<<1;
  int i,j;
  vis_d64 lpph,lppl,lcch,lccl,lnnh,lnnl,lpplh,lppll,lcclh,lccll,lnnlh,lnnll,
	  lpprh,lpprl,lccrh,lccrl,lnnrh,lnnrl,lp_p1,lp,lp_m1,lc_p1,lc,lc_m1,
	  ln_m1,ln,ln_p1, l1a,l1b,l1c,l1d,l11,l12,l21,l22,
	  lpab3hh, lpab3hl, lcab3hh, lcab3hl, lpab3lh, lpab3ll,
	  lcab3lh, lcab3ll, lnab3hh, lnab3hl, lnab3lh, lnab3ll;

  vis_d64 fac_1313 = vis_to_double_dup(0x01000300),
          fac_3131 = vis_to_double_dup(0x03000100),
          fac_3939 = vis_to_double_dup(0x03000900),
          fac_9393 = vis_to_double_dup(0x09000300),
          round    = vis_to_double_quad(8);
  int const_1 = 1,  /* due to compiler optimization failure */
      const_7 = 7;

  vis_write_gsr((7-4)<<3);    /* divide by 16 */
  vis_alignaddr_const(const_7);

  outData = oi + (ow*2*sy) + (2*sx);
  inData  = ii + (sy*iw) + sx;

  lp_m1 = vis_ld64((void *)(inData - iw - 8));
  lp = vis_ld64((void *)(inData - iw));
  lp_p1 = vis_ld64((void *)(inData - iw + 8));
  lc_m1 = vis_ld64((void *)(inData - 8));
  lc  = vis_ld64((void *)inData);
  lc_p1 = vis_ld64((void *)(inData + 8));
  inData += iw;

  lpph = vis_fpmerge_hi(lp,lp);
  lcch = vis_fpmerge_hi(lc,lc);
  lppl = vis_fpmerge_lo(lp,lp);
  lccl = vis_fpmerge_lo(lc,lc);
  lpplh = vis_faligndata(lp_m1,lpph);
  lcclh = vis_faligndata(lc_m1,lcch);
  lppll = vis_faligndata(lpph,lppl);
  lccll = vis_faligndata(lcch,lccl);
  vis_alignaddr_const(const_1);
  lpprh = vis_faligndata(lpph,lppl);
  lccrh = vis_faligndata(lcch,lccl);
  lpprl = vis_faligndata(lppl,lp_p1);
  lccrl = vis_faligndata(lccl,lc_p1);
  vis_alignaddr_const(const_7);

  l1a = vis_fmul8x16_hi(lpplh,fac_1313);
  l1b = vis_fmul8x16_hi(lpprh,fac_3131);
  l1c = vis_fmul8x16_lo(lpplh,fac_1313);
  l1d = vis_fmul8x16_lo(lpprh,fac_3131);
  lpab3hh = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
  lpab3hl = vis_fpadd16(round, vis_fpadd16(l1c, l1d));
  l1a = vis_fmul8x16_hi(lcclh,fac_1313);
  l1b = vis_fmul8x16_hi(lccrh,fac_3131);
  l1c = vis_fmul8x16_lo(lcclh,fac_1313);
  l1d = vis_fmul8x16_lo(lccrh,fac_3131);
  lcab3hh = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
  lcab3hl = vis_fpadd16(round, vis_fpadd16(l1c, l1d));

  l1a = vis_fmul8x16_hi(lppll,fac_1313);
  l1b = vis_fmul8x16_hi(lpprl,fac_3131);
  l1c = vis_fmul8x16_lo(lppll,fac_1313);
  l1d = vis_fmul8x16_lo(lpprl,fac_3131);
  lpab3lh = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
  lpab3ll = vis_fpadd16(round, vis_fpadd16(l1c, l1d));
  l1a = vis_fmul8x16_hi(lccll,fac_1313);
  l1b = vis_fmul8x16_hi(lccrl,fac_3131);
  l1c = vis_fmul8x16_lo(lccll,fac_1313);
  l1d = vis_fmul8x16_lo(lccrl,fac_3131);
  lcab3lh = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
  lcab3ll = vis_fpadd16(round, vis_fpadd16(l1c, l1d));

  for (j=8; j; j--) {
    /* one line below the first two */
    ln_m1 = vis_ld64((void *)(inData - 8));
    ln    = vis_ld64((void *)inData);
    ln_p1 = vis_ld64((void *)(inData + 8));

    /* duplicate each pixel */
    lnnh = vis_fpmerge_hi(ln,ln);
    lnnl = vis_fpmerge_lo(ln,ln);

    /* shift the line twice: once a byte left and once one right */
    lnnlh = vis_faligndata(ln_m1,lnnh);
    lnnll = vis_faligndata(lnnh,lnnl);
    vis_alignaddr_const(const_1);
    lnnrh = vis_faligndata(lnnh,lnnl);
    lnnrl = vis_faligndata(lnnl,ln_p1);
    vis_alignaddr_const(const_7);

    l1c = vis_fmul8x16_hi(lcclh,fac_3939);
    l1d = vis_fmul8x16_hi(lccrh,fac_9393);
    l11 = vis_fpadd16(lpab3hh, vis_fpadd16(l1c, l1d));
    l1a = vis_fmul8x16_hi(lnnlh,fac_1313);
    l1b = vis_fmul8x16_hi(lnnrh,fac_3131);
    lnab3hh = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
    l21 = vis_fpadd16(lnab3hh, vis_fpadd16(l1c, l1d));

    l1c = vis_fmul8x16_lo(lcclh,fac_3939);
    l1d = vis_fmul8x16_lo(lccrh,fac_9393);
    l12 = vis_fpadd16(lpab3hl, vis_fpadd16(l1c, l1d));
    l1a = vis_fmul8x16_lo(lnnlh,fac_1313);
    l1b = vis_fmul8x16_lo(lnnrh,fac_3131);
    lnab3hl = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
    l22 = vis_fpadd16(lnab3hl, vis_fpadd16(l1c, l1d));
    vis_st64_pack2(l11, l12, outData);
    vis_st64_pack2(l21, l22, outData+ow);
    lcclh = lnnlh; lccrh = lnnrh;
    lpab3hh=lcab3hh; lpab3hl=lcab3hl;
    lcab3hh=lnab3hh; lcab3hl=lnab3hl;

    /* same for next 4 pixels */
    l1c = vis_fmul8x16_hi(lccll,fac_3939);
    l1d = vis_fmul8x16_hi(lccrl,fac_9393);
    l11 = vis_fpadd16(lpab3lh, vis_fpadd16(l1c, l1d));
    l1a = vis_fmul8x16_hi(lnnll,fac_1313);
    l1b = vis_fmul8x16_hi(lnnrl,fac_3131);
    lnab3lh = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
    l21 = vis_fpadd16(lnab3lh, vis_fpadd16(l1c, l1d));

    l1c = vis_fmul8x16_lo(lccll,fac_3939);
    l1d = vis_fmul8x16_lo(lccrl,fac_9393);
    l12 = vis_fpadd16(lpab3ll, vis_fpadd16(l1c, l1d));
    l1a = vis_fmul8x16_lo(lnnll,fac_1313);
    l1b = vis_fmul8x16_lo(lnnrl,fac_3131);
    lnab3ll = vis_fpadd16(round, vis_fpadd16(l1a, l1b));
    l22 = vis_fpadd16(lnab3ll, vis_fpadd16(l1c, l1d));
    vis_st64_pack2(l11, l12, outData + 8);
    vis_st64_pack2(l21, l22, outData+ow+8);
    lccll = lnnll; lccrl = lnnrl;
    lpab3lh=lcab3lh; lpab3ll=lcab3ll;
    lcab3lh=lnab3lh; lcab3ll=lnab3ll;

    outData += ow*2;
    inData += iw;
  }
}
#endif /* BLOCK_1PASS_VIS */
#endif /* !WITHOUT_VIS */
#endif /* not CLASSIC */

/* ------------------------------------------------------------------------ */

void InterpolateImage (ImagePtr in, ImagePtr out) {

#ifdef CLASSIC
  InterpolateHV_orig(in->data, out->data, in->w, in->h);
#else
# ifndef IFRAME_1PASS
  InterpolateHV_2pass(in->data, out->data, in->w, in->h);
# else
  InterpolateHV_1pass(in->data, out->data, in->w, in->h);
# endif
#endif
}
/* ------------------------------------------------------------------------ */

void InterpolateFrame (FramePtr in, FramePtr out) {
  InterpolateImage(in->Y, out->Y);
  InterpolateImage(in->U, out->U);
  InterpolateImage(in->V, out->V);
}

/* ------------------------------------------------------------------------ */

void InterpolateBlockHV (Global *this, Byte *ii, Byte *oi, int sx, int sy, int bw, int bh, int iw, int ih) {

  if ((sx>0)&&(sy>0)&&(sx<(iw-bw))&&(sy<(ih-bh))) {
#ifdef CLASSIC
    InterInnerBlock_orig(ii, oi, sx, sy, bw, bh, iw);
#else
    if(bw == 4)  {
# ifndef WITHOUT_VIS
      if (this->Options.use_vis)
#  ifdef BLOCK_1PASS_VIS
	InterInnerBlock_4x4_1pass_vis(ii, oi, sx, sy, iw);
#  else
	InterInnerBlock_4x4_2pass_vis(ii, oi, sx, sy, iw);
#  endif
      else
# endif /* !WITHOUT_VIS */
# ifdef BLOCK_1PASS
        InterInnerBlock_4x4_1pass(ii, oi, sx, sy, iw);
# else
        InterInnerBlock_4x4_2pass(ii, oi, sx, sy, iw);
# endif
    } else {
# ifndef WITHOUT_VIS
      if (this->Options.use_vis)
#  ifdef BLOCK_1PASS_VIS
        InterInnerBlock_8x8_1pass_vis(ii, oi, sx, sy, iw);
#  else
        InterInnerBlock_8x8_2pass_vis(ii, oi, sx, sy, iw);
#  endif
      else
# endif /* !WITHOUT_VIS */
# ifdef BLOCK_1PASS
      InterInnerBlock_8x8_1pass(ii, oi, sx, sy, iw);
# else
      InterInnerBlock_8x8_2pass(ii, oi, sx, sy, iw);
# endif
    }
#endif
  }
  else {
    InterBorderBlock(ii, oi, sx, sy, bw, bh, iw, ih);
  }
}
