/*
 *  Halfpelmotion.c
 *
 *  This module contains motion compensation routines with halfpel accuracy.
 *  Halfpel-compensation is accomplished by horizontal or vertical inter-
 *  polation respectively. There are 2 procedures for 16x16 and 8x8 sized
 *  blocks, each with 4 cases for all possible combinations of hor./vert.
 *  interpolation.
 *
 *  Alignment: For speed load/store transfers are done with 32/64 bit
 *             whenever possible. The target block is 8-Byte-aligned,
 *             however due to motion the source block is not. The mis-
 *             alignment is handled in if or case constructs, switching
 *             over the offset from the nearest lower 8-Byte-boundary.
 *
 *             For efficiency the routines assume that the window width
 *             is a multiple of 8 and that at least 7 bytes can be read
 *             (w/o SEGV) outside the 8x8 or 16x16 window borders.
 *
 *  See the CLASSIC routines for an easily readable description of the
 *  algorithm. The integer-version is sped up by binary arithmetic, i.e.
 *  2 bytes are added in a 32-Bit word in parallel. VIS versions do
 *  4 Pixel adds in parallel.
 *
 *  $Id: HalfPelMotion.c,v 1.1.1.1 1997/12/16 13:14:40 jnweiger Exp $
 */

#include <string.h>

#include "Util.h"
#include "Codebook.h"

#include "global.h"

#include "Input.h"
#include "DecodeSymbol.h"

/*
 *  Globals defined elsewhere.
 *  NOTE(review): presumably the scanline lengths in bytes of the luma (Y)
 *  and chroma (C) planes -- confirm against BlockDecoder.c.
 */
extern int Yrowbytes;                  /* (from BlockDecoder.c) */
extern int Crowbytes;

#include "vis.h"

/* Compile-time switches, both explicitly disabled here. */
#undef CLASSIC  /* original, barely optimized (hence readable) implementation */
#undef SLOW     /* alternative optimization possibility, proved slower */

#ifndef WITHOUT_VIS

/* ========================================================================= *
 * =====                      VIS  IMPLEMENTATION                       ==== *
 * ========================================================================= */

/*
 *  Perform motion compensation with VIS. The most important issues are:
 *  - misalignment is handled more elegantly than in the integer version
 *    by the alignaddr/faligndata instruction pair. They require only one
 *    additional load per scanline and one faligndata per 8 bytes.
 *  - 4 adds can be performed in parallel, but byte operands have to
 *    be expanded to 16 bit explicitly before use.
 *  - pipelines can be used more efficiently by using fmul8x16 instead
 *    of expand, because the latter is executed in the same pipeline
 *    as add and faligndata. For small 8x8 blocks however this gives rise
 *    to too many FP stalls, so the trick isn't used there.
 *  - division by 2 (for interpolation) is performed with pack, which
 *    also transforms back from 16 bit to 8 bit operands.
 *  - Inner loops (horizontal) are fully unrolled.
 *  - Further speedup is achieved with vertical interpolation by unrolling
 *    the loop once. The loaded and expanded operands of the last scanline
 *    are kept in registers between passes of the loop, reducing the
 *    overhead for load and expand.
 */

/*
 *  CompensateHalfPelMotion16_vis
 *
 *  Predicts one 16x16 block into dst from src, either as a plain copy or
 *  interpolated with halfpel accuracy. The mode is selected by (xh<<1)|yh:
 *
 *    0   copy
 *    1   vertical interpolation:    (a + b + 1) >> 1 of scanline y and y+1
 *    2   horizontal interpolation:  (a + b + 1) >> 1 of pixel x and x+1
 *    3   both:                      (a + b + c + d + 2) >> 2
 *
 *  Any other selector value leaves dst untouched.
 *
 *  dst   target block; written with 64-bit stores, hence it has to be
 *        8-Byte aligned
 *  src   source block, arbitrary alignment
 *  xh    1 for a horizontal halfpel offset, 0 otherwise
 *  yh    1 for a vertical halfpel offset, 0 otherwise
 *  rb    distance in bytes between two scanlines; the same value is used
 *        for src and dst
 *
 *  The source is always fetched in whole aligned quadwords: per scanline up
 *  to 24 bytes are read, starting at the 8-Byte boundary at or below src.
 *  Modes 1 and 3 read 17 scanlines, modes 0 and 2 read 16.
 *
 *  Pixels are widened to 16 bit with fexpand (value << 4). Some expands are
 *  replaced by fmul8x16 with scale = 1 << (4+8), which yields the same
 *  value but is executed in a different pipeline. fpack16 performs the
 *  final shift right and the conversion back to bytes.
 *
 *  The order of the statements is hand-scheduled; do not reorder them
 *  without measuring.
 */
static void
CompensateHalfPelMotion16_vis(dst,src,xh,yh,rb)
Byte *src,*dst;
int xh,yh,rb;
{
  Byte *src_a;               /* 8-Byte aligned source pointer */
  int i;                     /* loop counter */
  vis_d64 round,             /* added for rounding */
          scale;             /* fmul8x16 factor equivalent to fexpand */

  switch ((xh<<1)|yh) {
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 0:

    /* ================  COPY 16x16 ================== */


    /* round src down to 8 bytes, byte offset goes to %gsr for faligndata */
    src_a = vis_alignaddr(src, 0);

    if(!((long)src & 7)) {
      /*
       *  Copy 16x16 block 1:1 (src 8-byte aligned).
       *  This is not faster than the VIS-less counterpart.
       */
      vis_d64 v11,v12,v21,v22;

      for(i=8;i;i--) {           /* two scanlines per pass */
        v11 = vis_ld64((void *)src_a);
        v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
        v21 = vis_ld64((void *)src_a);
        v22 = vis_ld64((void *)(src_a+8)); src_a += rb;
        vis_st64(v11, dst);
        vis_st64(v12, dst+8); dst += rb;
        vis_st64(v21, dst);
        vis_st64(v22, dst+8); dst += rb;
      }
    }
    else {
      /*
       *  Copy 16x16 block, src misaligned: the 16 bytes span 3 quadwords
       *  which are shifted into place by faligndata.
       */
      vis_d64 v11,v12,v13,v21,v22,v23,v1l,v1r,v2l,v2r;

      for(i=8;i;i--) {           /* two scanlines per pass */
        v11 = vis_ld64((void *)src_a);
        v12 = vis_ld64((void *)(src_a+8));
        v13 = vis_ld64((void *)(src_a + 16)); src_a += rb;
        v21 = vis_ld64((void *)src_a);
        v22 = vis_ld64((void *)(src_a+8));
        v23 = vis_ld64((void *)(src_a + 16)); src_a += rb;
        v1l = vis_faligndata(v11,v12);
        v1r = vis_faligndata(v12,v13);
        v2l = vis_faligndata(v21,v22);
        v2r = vis_faligndata(v22,v23);
        vis_st64(v1l, dst);
        vis_st64(v1r, dst+8); dst += rb;
        vis_st64(v2l, dst);
        vis_st64(v2r, dst+8); dst += rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 1:

    /* ================  INTERPOLATE 16x16 VERTICAL ================== */


    round = vis_to_double_quad(1<<4);
    scale = vis_to_double_quad(1<<(4+8));
    vis_write_gsr((7-4-1)<<3); /* set fpack16-scale to "shift 4+1 right" */
                               /* (the 4 originates from fexpand) */

    src_a = vis_alignaddr(src, 0);

    /*
     *  In both branches the v1x registers hold the expanded scanline that
     *  was loaded last; it is reused as the upper operand of the next pass.
     *  round is added to the v2x scanline only, so that every sum of a
     *  v1x and a v2x operand contains it exactly once.
     */
    if(!((long)src & 7)) {         /* source spans only 2 quadwords */
      vis_d64 v11,v12,v21,v22,v11l,v11h,v12l,v12h,v21l,v21h,v22l,v22h,
              vr11,vr12,vr21,vr22;

      v11 = vis_ld64((void *)src_a);
      v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v11h = vis_fexpand(vis_read_hi(v11));
      v11l = vis_fexpand(vis_read_lo(v11));
      v12h = vis_fexpand(vis_read_hi(v12));
      v12l = vis_fexpand(vis_read_lo(v12));

      for(i=8;i;i--) {           /* two scanlines per pass */
        v21 = vis_ld64((void *)src_a);
        v22 = vis_ld64((void *)(src_a+8)); src_a += rb;
        v11 = vis_ld64((void *)src_a);
        v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
        v21h = vis_fexpand(vis_read_hi(v21));
        v21l = vis_fexpand(vis_read_lo(v21));
        v22h = vis_fmul8x16(vis_read_hi(v22),scale);
        v22l = vis_fmul8x16(vis_read_lo(v22),scale);
        v21h = vis_fpadd16(v21h,round);
        v21l = vis_fpadd16(v21l,round);
        v22h = vis_fpadd16(v22h,round);
        v22l = vis_fpadd16(v22l,round);
        vr11 = vis_freg_pair(
                vis_fpack16(vis_fpadd16(v11h,v21h)),
                vis_fpack16(vis_fpadd16(v11l,v21l)));
        vr12 = vis_freg_pair(
                vis_fpack16(vis_fpadd16(v12h,v22h)),
                vis_fpack16(vis_fpadd16(v12l,v22l)));
        v11h = vis_fmul8x16(vis_read_hi(v11),scale);
        v11l = vis_fmul8x16(vis_read_lo(v11),scale);
        v12h = vis_fexpand(vis_read_hi(v12));
        v12l = vis_fexpand(vis_read_lo(v12));
        vr21 = vis_freg_pair(
                vis_fpack16(vis_fpadd16(v21h,v11h)),
                vis_fpack16(vis_fpadd16(v21l,v11l)));
        vr22 = vis_freg_pair(
                vis_fpack16(vis_fpadd16(v22h,v12h)),
                vis_fpack16(vis_fpadd16(v22l,v12l)));
        vis_st64(vr11, dst);
        vis_st64(vr12, dst+8); dst += rb;
        vis_st64(vr21, dst);
        vis_st64(vr22, dst+8); dst += rb;
      }
    }
    else {                         /* source spans 3 quadwords */
      vis_d64 v11,v12,v13,v21,v22,v23,v1l,v1r,v2l,v2r,
              v11h,v11l,v12h,v12l,v21h,v21l,v22h,v22l,vr11,vr12,vr21,vr22;

      v11 = vis_ld64((void *)src_a);
      v12 = vis_ld64((void *)(src_a+8));
      v13 = vis_ld64((void *)(src_a + 16)); src_a += rb;
      v1l = vis_faligndata(v11,v12);
      v1r = vis_faligndata(v12,v13);
      v11h = vis_fexpand(vis_read_hi(v1l));
      v11l = vis_fexpand(vis_read_lo(v1l));
      v12h = vis_fexpand(vis_read_hi(v1r));
      v12l = vis_fexpand(vis_read_lo(v1r));

      for(i=8;i;i--) {           /* two scanlines per pass */
        v21 = vis_ld64((void *)src_a);
        v22 = vis_ld64((void *)(src_a+8));
        v23 = vis_ld64((void *)(src_a + 16)); src_a += rb;
        v11 = vis_ld64((void *)src_a);
        v12 = vis_ld64((void *)(src_a+8));
        v13 = vis_ld64((void *)(src_a + 16)); src_a += rb;
        v2l = vis_faligndata(v21,v22);
        v2r = vis_faligndata(v22,v23);
        v1l = vis_faligndata(v11,v12);
        v1r = vis_faligndata(v12,v13);
        v21h = vis_fexpand(vis_read_hi(v2l));
        v21l = vis_fexpand(vis_read_lo(v2l));
        v22h = vis_fmul8x16(vis_read_hi(v2r),scale);
        v22l = vis_fmul8x16(vis_read_lo(v2r),scale);
        v21h = vis_fpadd16(v21h,round);
        v21l = vis_fpadd16(v21l,round);
        v22h = vis_fpadd16(v22h,round);
        v22l = vis_fpadd16(v22l,round);
        vr11 = vis_freg_pair(
                vis_fpack16(vis_fpadd16(v11h,v21h)),
                vis_fpack16(vis_fpadd16(v11l,v21l)));
        vr12 = vis_freg_pair(
                vis_fpack16(vis_fpadd16(v12h,v22h)),
                vis_fpack16(vis_fpadd16(v12l,v22l)));
        v11h = vis_fmul8x16(vis_read_hi(v1l),scale);
        v11l = vis_fexpand(vis_read_lo(v1l));
        v12h = vis_fexpand(vis_read_hi(v1r));
        v12l = vis_fexpand(vis_read_lo(v1r));
        vr21 = vis_freg_pair(
                vis_fpack16(vis_fpadd16(v21h,v11h)),
                vis_fpack16(vis_fpadd16(v21l,v11l)));
        vr22 = vis_freg_pair(
                vis_fpack16(vis_fpadd16(v22h,v12h)),
                vis_fpack16(vis_fpadd16(v22l,v12l)));
        vis_st64(vr11, dst);
        vis_st64(vr12, dst+8); dst += rb;
        vis_st64(vr21, dst);
        vis_st64(vr22, dst+8); dst += rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 2:

    /* ================  INTERPOLATE 16x16 HORIZONTAL ================== */


    round = vis_to_double_quad(1<<4);
    scale = vis_to_double_quad(1<<(4+8));
    vis_write_gsr((7-4-1)<<3); /* set fpack16-scale to "shift 4+1 right" */

    if(!((long)src & 7)) {         /* source spans only 2 quadwords */
      vis_d64 v11,v12,v13,v1a,v1b,v11h,v11l,v12h,v12l,v1ah,v1al,v1bh,v1bl,
              vr1,vr2;

      /* src is aligned, so only the right neighbours (src+1) need a shift */
      vis_alignaddr_const(1);

      for(i=16;i;i--) {
        v11 = vis_ld64((void *)src);
        v12 = vis_ld64((void *)(src+8));
        v13 = vis_ld64((void *)(src+16)); src += rb;
        v1a = vis_faligndata(v11,v12);
        v1b = vis_faligndata(v12,v13);

        /* fmul8x16 and fexpand are executed concurrently in the processor
         * hence we replace some expands by an equivalent multiplication
         */
        v11h = vis_fmul8x16_hi(v11,scale);
        v11l = vis_fmul8x16_lo(v11,scale);
        v12h = vis_fmul8x16_hi(v12,scale);
        v12l = vis_fmul8x16_lo(v12,scale);
        v1ah = vis_fmul8x16_hi(v1a,scale);
        v1al = vis_fexpand(vis_read_lo(v1a));
        v1bh = vis_fexpand(vis_read_hi(v1b));
        v1bl = vis_fexpand(vis_read_lo(v1b));
        vr1 = vis_freg_pair(
                vis_fpack16(vis_fpadd16(v11h,vis_fpadd16(v1ah,round))),
                vis_fpack16(vis_fpadd16(v11l,vis_fpadd16(v1al,round))));
        vr2 = vis_freg_pair(
                vis_fpack16(vis_fpadd16(v12h,vis_fpadd16(v1bh,round))),
                vis_fpack16(vis_fpadd16(v12l,vis_fpadd16(v1bl,round))));
        vis_st64(vr1, dst);
        vis_st64(vr2, dst+8); dst += rb;
      }
    }
    else if(((long)src & 7) == 7) {
      /*
       *  src not aligned, but (src+1) is: the right neighbours are the
       *  2nd and 3rd quadword as loaded, only the left pixels need a shift
       */
      vis_d64 v1a,v11l,v11r,v12l,v12r,v11lh,v11ll,v12lh,v12ll,
              v11rh,v11rl,v12rh,v12rl,vr1l,vr1r;

      src_a = vis_alignaddr(src,0);

      for(i=16;i;i--) {
        v1a = vis_ld64((void *)src_a);
        v12l = vis_ld64((void *)(src_a+8));
        v12r = vis_ld64((void *)(src_a + 16)); src_a += rb;
        v11l = vis_faligndata(v1a,v12l);
        v11r = vis_faligndata(v12l,v12r);
        v11lh = vis_fpadd16(round,vis_fexpand_hi(v11l));
        v11ll = vis_fpadd16(round,vis_fexpand_lo(v11l));
        v11rh = vis_fpadd16(round,vis_fmul8x16_hi(v11r,scale));
        v11rl = vis_fpadd16(round,vis_fmul8x16_lo(v11r,scale));
        v12lh = vis_fmul8x16_hi(v12l,scale);
        v12ll = vis_fmul8x16_lo(v12l,scale);
        v12rh = vis_fmul8x16_hi(v12r,scale);
        v12rl = vis_fmul8x16_lo(v12r,scale);
        vr1l = vis_freg_pair(
               vis_fpack16(vis_fpadd16(v11lh,v12lh)),
               vis_fpack16(vis_fpadd16(v11ll,v12ll)));
        vr1r = vis_freg_pair(
               vis_fpack16(vis_fpadd16(v11rh,v12rh)),
               vis_fpack16(vis_fpadd16(v11rl,v12rl)));
        vis_st64(vr1l, dst);
        vis_st64(vr1r, dst+8); dst += rb;
      }
    }
    else {
      /*
       *  neither src nor (src+1) aligned: the same 3 quadwords are shifted
       *  twice, by the offset of src and by the offset of src+1
       */
      vis_d64 v1a,v1b,v1c,v11l,v11r,v12l,v12r,v11lh,v11ll,v12lh,v12ll,
              v11rh,v11rl,v12rh,v12rl,vr1l,vr1r;
      int align_l, align_r;

      src_a = vis_alignaddr(src,0);
      align_l = (int)((long)src & 7);  /* offset of src, 1..6 here */
      align_r = align_l + 1;           /* no overflow possible */

      for(i=16;i;i--) {
        v1a = vis_ld64((void *)src_a);
        v1b = vis_ld64((void *)(src_a+8));
        v1c = vis_ld64((void *)(src_a + 16)); src_a += rb;
        v11l = vis_faligndata(v1a,v1b);
        v11r = vis_faligndata(v1b,v1c);
        vis_alignaddr_const(align_r);
        v11lh = vis_fpadd16(round,vis_fmul8x16_hi(v11l,scale));
        v11ll = vis_fpadd16(round,vis_fmul8x16_lo(v11l,scale));
        v11rh = vis_fpadd16(round,vis_fmul8x16_hi(v11r,scale));
        v11rl = vis_fpadd16(round,vis_fmul8x16_lo(v11r,scale));
        v12l = vis_faligndata(v1a,v1b);
        v12r = vis_faligndata(v1b,v1c);
        /* back to the offset of src for the next scanline; only the low
         * three bits are significant, so pass just those */
        vis_alignaddr_const(align_l);
        v12lh = vis_fmul8x16_hi(v12l,scale);
        v12ll = vis_fmul8x16_lo(v12l,scale);
        v12rh = vis_fmul8x16_hi(v12r,scale);
        v12rl = vis_fmul8x16_lo(v12r,scale);
        vr1l = vis_freg_pair(
               vis_fpack16(vis_fpadd16(v11lh,v12lh)),
               vis_fpack16(vis_fpadd16(v11ll,v12ll)));
        vr1r = vis_freg_pair(
               vis_fpack16(vis_fpadd16(v11rh,v12rh)),
               vis_fpack16(vis_fpadd16(v11rl,v12rl)));
        vis_st64(vr1l, dst);
        vis_st64(vr1r, dst+8); dst += rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 3:

    /* ============  INTERPOLATE 16x16 HORIZONTAL & VERTICAL ============= */


    round = vis_to_double_quad(2<<4);   /* added for rounding of sum */
    scale = vis_to_double_quad(1<<(4+8));   /* to substitute fexpand */
    vis_write_gsr((7-4-2)<<3); /* set fpack16-scale to "shift 4+2 right" */
                               /* (the 4 originates from fexpand) */

    /*
     *  In all three branches the v1xs registers hold the horizontal sum
     *  (pixel + right neighbour + round) of the scanline loaded last and
     *  the v2xs registers the horizontal sum without round of the scanline
     *  in between. Each v2xs sum is used for two output scanlines.
     */
    if(!((long)src & 7)) {
      /*
       *  interpolate 16x16 hor&vert: src 8-aligned
       */
      vis_d64 v11,v12,v13,v1a,v1b,v11sh,v11sl,v12sh,v12sl,vr11,vr12,
              v21,v22,v23,v2a,v2b,v21sh,v21sl,v22sh,v22sl,vr21,vr22;

      vis_alignaddr_const(1);

      v11 = vis_ld64((void *)src);
      v12 = vis_ld64((void *)(src + 8));
      v13 = vis_ld64((void *)(src + 16)); src += rb;
      v1a = vis_faligndata(v11,v12);
      v1b = vis_faligndata(v12,v13);
      v11sh = vis_fpadd16(round,vis_fpadd16(
                         vis_fexpand(vis_read_hi(v11)),
                         vis_fexpand(vis_read_hi(v1a))));
      v11sl = vis_fpadd16(round,vis_fpadd16(
                         vis_fexpand(vis_read_lo(v11)),
                         vis_fexpand(vis_read_lo(v1a))));
      v12sh = vis_fpadd16(round,vis_fpadd16(
                         vis_fexpand(vis_read_hi(v12)),
                         vis_fexpand(vis_read_hi(v1b))));
      v12sl = vis_fpadd16(round,vis_fpadd16(
                         vis_fmul8x16(vis_read_lo(v12),scale),
                         vis_fmul8x16(vis_read_lo(v1b),scale)));

      for(i=8;i;i--) {           /* two scanlines per pass */
        v21 = vis_ld64((void *)src);
        v22 = vis_ld64((void *)(src + 8));
        v23 = vis_ld64((void *)(src + 16)); src += rb;
        v11 = vis_ld64((void *)src);
        v12 = vis_ld64((void *)(src + 8));
        v13 = vis_ld64((void *)(src + 16)); src += rb;
        v2a = vis_faligndata(v21,v22);
        v2b = vis_faligndata(v22,v23);
        v1a = vis_faligndata(v11,v12);
        v1b = vis_faligndata(v12,v13);
        v21sh = vis_fpadd16(vis_fexpand(vis_read_hi(v21)),
                           vis_fmul8x16(vis_read_hi(v2a),scale));
        v21sl = vis_fpadd16(vis_fexpand(vis_read_lo(v21)),
                           vis_fexpand(vis_read_lo(v2a)));
        v22sh = vis_fpadd16(vis_fmul8x16(vis_read_hi(v22),scale),
                           vis_fmul8x16(vis_read_hi(v2b),scale));
        v22sl = vis_fpadd16(vis_fmul8x16(vis_read_lo(v22),scale),
                           vis_fmul8x16(vis_read_lo(v2b),scale));
        vr11 = vis_freg_pair(vis_fpack16(vis_fpadd16(v11sh,v21sh)),
                             vis_fpack16(vis_fpadd16(v11sl,v21sl)));
        vr12 = vis_freg_pair(vis_fpack16(vis_fpadd16(v12sh,v22sh)),
                             vis_fpack16(vis_fpadd16(v12sl,v22sl)));
        v11sh = vis_fpadd16(round,vis_fpadd16(
                           vis_fexpand(vis_read_hi(v11)),
                           vis_fexpand(vis_read_hi(v1a))));
        v11sl = vis_fpadd16(round,vis_fpadd16(
                           vis_fexpand(vis_read_lo(v11)),
                           vis_fexpand(vis_read_lo(v1a))));
        v12sh = vis_fpadd16(round,vis_fpadd16(
                           vis_fmul8x16(vis_read_hi(v12),scale),
                           vis_fmul8x16(vis_read_hi(v1b),scale)));
        v12sl = vis_fpadd16(round,vis_fpadd16(
                           vis_fmul8x16(vis_read_lo(v12),scale),
                           vis_fmul8x16(vis_read_lo(v1b),scale)));
        vr21 = vis_freg_pair(vis_fpack16(vis_fpadd16(v11sh,v21sh)),
                             vis_fpack16(vis_fpadd16(v11sl,v21sl)));
        vr22 = vis_freg_pair(vis_fpack16(vis_fpadd16(v12sh,v22sh)),
                             vis_fpack16(vis_fpadd16(v12sl,v22sl)));
        vis_st64(vr11, dst);
        vis_st64(vr12, dst + 8); dst += rb;
        vis_st64(vr21, dst);
        vis_st64(vr22, dst + 8); dst += rb;
      }
    }
    else if(((long)src & 7) == 7) {
      /*
       *  interpolate 16x16 hor&vert: src not aligned, but (src+1) 8-aligned
       *  (the right neighbours are the 2nd and 3rd quadword as loaded)
       */
      vis_d64 v11,v12,v11sh,v11sl,v12sh,v12sl,vr11,vr12,
              v21,v22,v21sh,v21sl,v22sh,v22sl,vr21,vr22,
              vi11,vi12,vi13,vi21,vi22,vi23;

      src_a = vis_alignaddr(src,0);

      vi11 = vis_ld64((void *)src_a);
      vi12 = vis_ld64((void *)(src_a+8));
      vi13 = vis_ld64((void *)(src_a+16)); src_a += rb;
      v11 = vis_faligndata(vi11,vi12);
      v12 = vis_faligndata(vi12,vi13);
      v11sh = vis_fpadd16(round,vis_fpadd16(
                         vis_fexpand(vis_read_hi(v11)),
                         vis_fexpand(vis_read_hi(vi12))));
      v11sl = vis_fpadd16(round,vis_fpadd16(
                         vis_fexpand(vis_read_lo(v11)),
                         vis_fexpand(vis_read_lo(vi12))));
      v12sh = vis_fpadd16(round,vis_fpadd16(
                         vis_fexpand(vis_read_hi(v12)),
                         vis_fexpand(vis_read_hi(vi13))));
      v12sl = vis_fpadd16(round,vis_fpadd16(
                         vis_fmul8x16(vis_read_lo(v12),scale),
                         vis_fmul8x16(vis_read_lo(vi13),scale)));

      for(i=8;i;i--) {           /* two scanlines per pass */
        vi21 = vis_ld64((void *)src_a);
        vi22 = vis_ld64((void *)(src_a+8));
        vi23 = vis_ld64((void *)(src_a+16)); src_a += rb;
        vi11 = vis_ld64((void *)src_a);
        vi12 = vis_ld64((void *)(src_a+8));
        vi13 = vis_ld64((void *)(src_a+16)); src_a += rb;
        v21 = vis_faligndata(vi21,vi22);
        v22 = vis_faligndata(vi22,vi23);
        v11 = vis_faligndata(vi11,vi12);
        v12 = vis_faligndata(vi12,vi13);
        v21sh = vis_fpadd16(vis_fexpand(vis_read_hi(v21)),
                            vis_fmul8x16(vis_read_hi(vi22),scale));
        v21sl = vis_fpadd16(vis_fexpand(vis_read_lo(v21)),
                            vis_fmul8x16(vis_read_lo(vi22),scale));
        v22sh = vis_fpadd16(vis_fmul8x16(vis_read_hi(v22),scale),
                            vis_fmul8x16(vis_read_hi(vi23),scale));
        v22sl = vis_fpadd16(vis_fmul8x16(vis_read_lo(v22),scale),
                            vis_fmul8x16(vis_read_lo(vi23),scale));
        vr11 = vis_freg_pair(vis_fpack16(vis_fpadd16(v11sh,v21sh)),
                             vis_fpack16(vis_fpadd16(v11sl,v21sl)));
        vr12 = vis_freg_pair(vis_fpack16(vis_fpadd16(v12sh,v22sh)),
                             vis_fpack16(vis_fpadd16(v12sl,v22sl)));
        v11sh = vis_fpadd16(round,vis_fpadd16(
                           vis_fexpand(vis_read_hi(v11)),
                           vis_fexpand(vis_read_hi(vi12))));
        v11sl = vis_fpadd16(round,vis_fpadd16(
                           vis_fexpand(vis_read_lo(v11)),
                           vis_fexpand(vis_read_lo(vi12))));
        v12sh = vis_fpadd16(round,vis_fpadd16(
                           vis_fmul8x16(vis_read_hi(v12),scale),
                           vis_fmul8x16(vis_read_hi(vi13),scale)));
        v12sl = vis_fpadd16(round,vis_fpadd16(
                           vis_fmul8x16(vis_read_lo(v12),scale),
                           vis_fmul8x16(vis_read_lo(vi13),scale)));
        vr21 = vis_freg_pair(vis_fpack16(vis_fpadd16(v11sh,v21sh)),
                             vis_fpack16(vis_fpadd16(v11sl,v21sl)));
        vr22 = vis_freg_pair(vis_fpack16(vis_fpadd16(v12sh,v22sh)),
                             vis_fpack16(vis_fpadd16(v12sl,v22sl)));
        vis_st64(vr11, dst);
        vis_st64(vr12, dst + 8); dst += rb;
        vis_st64(vr21, dst);
        vis_st64(vr22, dst + 8); dst += rb;
      }
    }
    else {
      /*
       *  interpolate 16x16 hor&vert: neither src nor (src+1) 8-aligned
       *  (%gsr alternates between the offset of src and that of src+1)
       */
      vis_d64 v11,v12,v1a,v1b,v11sh,v11sl,v12sh,v12sl,vr11,vr12,
              v21,v22,v2a,v2b,v21sh,v21sl,v22sh,v22sl,vr21,vr22,
              vi11,vi12,vi13,vi21,vi22,vi23;
      int align_l, align_m;

      src_a = vis_alignaddr(src,0);
      align_l = (int)((long)src & 7);  /* offset of src, 1..6 here */
      align_m = align_l + 1;      /* no overflow */

      vi11 = vis_ld64((void *)src_a);
      vi12 = vis_ld64((void *)(src_a+8));
      vi13 = vis_ld64((void *)(src_a+16)); src_a += rb;
      v11 = vis_faligndata(vi11,vi12);
      v12 = vis_faligndata(vi12,vi13);
      vis_alignaddr(0,align_m);
      v1a = vis_faligndata(vi11,vi12);
      v1b = vis_faligndata(vi12,vi13);
      vis_alignaddr(0,align_l);
      v11sh = vis_fpadd16(round,vis_fpadd16(
                         vis_fexpand(vis_read_hi(v11)),
                         vis_fexpand(vis_read_hi(v1a))));
      v11sl = vis_fpadd16(round,vis_fpadd16(
                         vis_fexpand(vis_read_lo(v11)),
                         vis_fexpand(vis_read_lo(v1a))));
      v12sh = vis_fpadd16(round,vis_fpadd16(
                         vis_fexpand(vis_read_hi(v12)),
                         vis_fexpand(vis_read_hi(v1b))));
      v12sl = vis_fpadd16(round,vis_fpadd16(
                         vis_fmul8x16(vis_read_lo(v12),scale),
                         vis_fmul8x16(vis_read_lo(v1b),scale)));

      for(i=8;i;i--) {           /* two scanlines per pass */
        vi21 = vis_ld64((void *)src_a);
        vi22 = vis_ld64((void *)(src_a+8));
        vi23 = vis_ld64((void *)(src_a+16)); src_a += rb;
        vi11 = vis_ld64((void *)src_a);
        vi12 = vis_ld64((void *)(src_a+8));
        vi13 = vis_ld64((void *)(src_a+16)); src_a += rb;
        v21 = vis_faligndata(vi21,vi22);
        v22 = vis_faligndata(vi22,vi23);
        v11 = vis_faligndata(vi11,vi12);
        v12 = vis_faligndata(vi12,vi13);
        vis_alignaddr(0,align_m);
        v2a = vis_faligndata(vi21,vi22);
        v2b = vis_faligndata(vi22,vi23);
        v1a = vis_faligndata(vi11,vi12);
        v1b = vis_faligndata(vi12,vi13);
        vis_alignaddr(0,align_l);
        v21sh = vis_fpadd16(vis_fexpand(vis_read_hi(v21)),
                            vis_fexpand(vis_read_hi(v2a)));
        v21sl = vis_fpadd16(vis_fexpand(vis_read_lo(v21)),
                            vis_fexpand(vis_read_lo(v2a)));
        v22sh = vis_fpadd16(vis_fmul8x16(vis_read_hi(v22),scale),
                            vis_fmul8x16(vis_read_hi(v2b),scale));
        v22sl = vis_fpadd16(vis_fmul8x16(vis_read_lo(v22),scale),
                            vis_fmul8x16(vis_read_lo(v2b),scale));
        vr11 = vis_freg_pair(vis_fpack16(vis_fpadd16(v11sh,v21sh)),
                             vis_fpack16(vis_fpadd16(v11sl,v21sl)));
        vr12 = vis_freg_pair(vis_fpack16(vis_fpadd16(v12sh,v22sh)),
                             vis_fpack16(vis_fpadd16(v12sl,v22sl)));
        v11sh = vis_fpadd16(round,vis_fpadd16(
                           vis_fmul8x16(vis_read_hi(v11),scale),
                           vis_fmul8x16(vis_read_hi(v1a),scale)));
        v11sl = vis_fpadd16(round,vis_fpadd16(
                           vis_fmul8x16(vis_read_lo(v11),scale),
                           vis_fmul8x16(vis_read_lo(v1a),scale)));
        v12sh = vis_fpadd16(round,vis_fpadd16(
                           vis_fmul8x16(vis_read_hi(v12),scale),
                           vis_fmul8x16(vis_read_hi(v1b),scale)));
        v12sl = vis_fpadd16(round,vis_fpadd16(
                           vis_fmul8x16(vis_read_lo(v12),scale),
                           vis_fmul8x16(vis_read_lo(v1b),scale)));
        vr21 = vis_freg_pair(vis_fpack16(vis_fpadd16(v11sh,v21sh)),
                             vis_fpack16(vis_fpadd16(v11sl,v21sl)));
        vr22 = vis_freg_pair(vis_fpack16(vis_fpadd16(v12sh,v22sh)),
                             vis_fpack16(vis_fpadd16(v12sl,v22sl)));
        vis_st64(vr11, dst);
        vis_st64(vr12, dst + 8); dst += rb;
        vis_st64(vr21, dst);
        vis_st64(vr22, dst + 8); dst += rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  default:
    /* xh and yh are expected to be 0 or 1; anything else is ignored */
    break;
  }
}

/* ========================================================================== */

static void
CompensateHalfPelMotion8_vis(dst,src,xh,yh,rb)
Byte *src,*dst;
int xh,yh,rb;
{
  int j;
  Byte *src_a;               /* aligned source pointer */
  int i;                     /* loop counter */
  vis_d64 round,             /* addend for rounding */
          null = vis_fzero();

  switch ((xh<<1)|yh) {
  case 0:

    /* ================  COPY 8x8  ================== */


    src_a = vis_alignaddr(src, dst - dst);

    if(!((long)src & 7)) {         /* source spans only 1 quadword */
      vis_d64 v1,v2,v3,v4,v5,v6,v7,v8;

      v1 = vis_ld64((void *)src_a); src_a += rb;
      v2 = vis_ld64((void *)src_a); src_a += rb;
      v3 = vis_ld64((void *)src_a); src_a += rb;
      v4 = vis_ld64((void *)src_a); src_a += rb;
      v5 = vis_ld64((void *)src_a); src_a += rb;
      v6 = vis_ld64((void *)src_a); src_a += rb;
      v7 = vis_ld64((void *)src_a); src_a += rb;
      v8 = vis_ld64((void *)src_a);
      vis_st64(v1, dst); dst += rb;
      vis_st64(v2, dst); dst += rb;
      vis_st64(v3, dst); dst += rb;
      vis_st64(v4, dst); dst += rb;
      vis_st64(v5, dst); dst += rb;
      vis_st64(v6, dst); dst += rb;
      vis_st64(v7, dst); dst += rb;
      vis_st64(v8, dst);
    }
    else {
      /*
       *  Copy 8x8 block, src misaligned
       */
      vis_d64 v11,v12,v1l,v21,v22,v2l,v31,v32,v3l,v41,v42,v4l,
              v51,v52,v5l,v61,v62,v6l,v71,v72,v7l,v81,v82,v8l;

      v11 = vis_ld64((void *)src_a);
      v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v21 = vis_ld64((void *)src_a);
      v22 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v31 = vis_ld64((void *)src_a);
      v32 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v41 = vis_ld64((void *)src_a);
      v42 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v51 = vis_ld64((void *)src_a);
      v52 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v61 = vis_ld64((void *)src_a);
      v62 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v71 = vis_ld64((void *)src_a);
      v72 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v81 = vis_ld64((void *)src_a);
      v82 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v1l = vis_faligndata(v11,v12);
      v2l = vis_faligndata(v21,v22);
      v3l = vis_faligndata(v31,v32);
      v4l = vis_faligndata(v41,v42);
      v5l = vis_faligndata(v51,v52);
      v6l = vis_faligndata(v61,v62);
      v7l = vis_faligndata(v71,v72);
      v8l = vis_faligndata(v81,v82);
      vis_st64(v1l, dst); dst += rb;
      vis_st64(v2l, dst); dst += rb;
      vis_st64(v3l, dst); dst += rb;
      vis_st64(v4l, dst); dst += rb;
      vis_st64(v5l, dst); dst += rb;
      vis_st64(v6l, dst); dst += rb;
      vis_st64(v7l, dst); dst += rb;
      vis_st64(v8l, dst);
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 1:

    /* ================  INTERPOLATE 8x8 VERTICAL  ================== */


    round = vis_to_double_quad(1<<4);
    vis_write_gsr((7-4-1)<<3); /* set fpack16-scale to "shift 4+1 right" */
			       /* (the 4 originates from fexpand) */

    if(!((long)src & 7)) {         /* source spans only 1 quadword */
      vis_d64 v1,v2,v1h,v1l,v2h,v2l,vr1h,vr1l,vr2h,vr2l;
      vis_d64 v3,v4,v3h,v3l,v4h,v4l,vr3h,vr3l,vr4h,vr4l;

      v1 = vis_ld64((void *)src); src += rb;
      v1h = vis_fpadd16(round,vis_fexpand(vis_read_hi(v1)));
      v1l = vis_fpadd16(round,vis_fexpand(vis_read_lo(v1)));
      /*
       *  Unrolling this loop also avoids having to move the
       *  "look-ahead" (next value) to v1
       */
      for(i=2;i;i--) {
	v2 = vis_ld64((void *)src); src += rb;
	v3 = vis_ld64((void *)src); src += rb;
	v4 = vis_ld64((void *)src); src += rb;
	v1 = vis_ld64((void *)src); src += rb;
	v2h = vis_fexpand(vis_read_hi(v2));
	v2l = vis_fexpand(vis_read_lo(v2));
	v3h = vis_fpadd16(round,vis_fexpand(vis_read_hi(v3)));
	v3l = vis_fpadd16(round,vis_fexpand(vis_read_lo(v3)));
	v4h = vis_fexpand(vis_read_hi(v4));
	v4l = vis_fexpand(vis_read_lo(v4));
	vr1h = vis_fpadd16(v1h,v2h);
	vr1l = vis_fpadd16(v1l,v2l);
	v1h = vis_fpadd16(round,vis_fexpand(vis_read_hi(v1)));
	v1l = vis_fpadd16(round,vis_fexpand(vis_read_lo(v1)));
	vr2h = vis_fpadd16(v2h,v3h);
	vr2l = vis_fpadd16(v2l,v3l);
	vr3h = vis_fpadd16(v3h,v4h);
	vr3l = vis_fpadd16(v3l,v4l);
	vr4h = vis_fpadd16(v4h,v1h);
	vr4l = vis_fpadd16(v4l,v1l);
	vis_st64_pack2(vr1h, vr1l, dst); dst += rb;
	vis_st64_pack2(vr2h, vr2l, dst); dst += rb;
	vis_st64_pack2(vr3h, vr3l, dst); dst += rb;
	vis_st64_pack2(vr4h, vr4l, dst); dst += rb;
      }
    }
    else {
      vis_d64 v11,v12,v21,v22,v1,v2,v1h,v1l,v2h,v2l,vr1h,vr1l,vr2h,vr2l,
	      v31,v32,v41,v42,v3,v4,v3h,v3l,v4h,v4l,vr3h,vr3l,vr4h,vr4l;

      src_a = vis_alignaddr(src, dst - dst);

      v11 = vis_ld64((void *)src_a);
      v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v1 = vis_faligndata(v11,v12);
      v1h = vis_fpadd16(round,vis_fexpand(vis_read_hi(v1)));
      v1l = vis_fpadd16(round,vis_fexpand(vis_read_lo(v1)));

      for(i=2;i;i--) {
	v21 = vis_ld64((void *)src_a);
	v22 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v31 = vis_ld64((void *)src_a);
	v32 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v41 = vis_ld64((void *)src_a);
	v42 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v11 = vis_ld64((void *)src_a);
	v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v2 = vis_faligndata(v21,v22);
	v3 = vis_faligndata(v31,v32);
	v4 = vis_faligndata(v41,v42);
	v1 = vis_faligndata(v11,v12);
	v2h = vis_fexpand(vis_read_hi(v2));
	v2l = vis_fexpand(vis_read_lo(v2));
	v3h = vis_fpadd16(round,vis_fexpand(vis_read_hi(v3)));
	v3l = vis_fpadd16(round,vis_fexpand(vis_read_lo(v3)));
	v4h = vis_fexpand(vis_read_hi(v4));
	v4l = vis_fexpand(vis_read_lo(v4));
	vr1h = vis_fpadd16(v1h,v2h);
	vr1l = vis_fpadd16(v1l,v2l);
	v1h = vis_fpadd16(round,vis_fexpand(vis_read_hi(v1)));
	v1l = vis_fpadd16(round,vis_fexpand(vis_read_lo(v1)));
	vr2h = vis_fpadd16(v2h,v3h);
	vr2l = vis_fpadd16(v2l,v3l);
	vr3h = vis_fpadd16(v3h,v4h);
	vr3l = vis_fpadd16(v3l,v4l);
	vr4h = vis_fpadd16(v4h,v1h);
	vr4l = vis_fpadd16(v4l,v1l);
	vis_st64_pack2(vr1h,vr1l,dst); dst += rb;
	vis_st64_pack2(vr2h,vr2l,dst); dst += rb;
	vis_st64_pack2(vr3h,vr3l,dst); dst += rb;
	vis_st64_pack2(vr4h,vr4l,dst); dst += rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 2:

    /* ================  INTERPOLATE 8x8 HORIZONTAL  ================== */


    round = vis_to_double_quad(1<<4);
    vis_write_gsr((7-4-1)<<3); /* set fpack16-scale to "shift 4+1 right" */

    if(!((long)src & 7)) {         /* source spans only 1 quadword */
      /*
       *  interpolate horizontally: src 8-aligned
       */
      vis_d64 v11,v1n,v12,v11h,v11l,v12h,v12l,vr1l,vr1h,vr1;
      vis_d64 v21,v2n,v22,v21h,v21l,v22h,v22l,vr2l,vr2h,vr2;

      vis_alignaddr_const(1);

      for(i=4;i;i--) {
	v11 = vis_ld64((void *)src);
	v1n = vis_ld64((void *)(src + 8)); src += rb;
	v21 = vis_ld64((void *)src);
	v2n = vis_ld64((void *)(src + 8)); src += rb;
	v12 = vis_faligndata(v11,v1n);
	v22 = vis_faligndata(v21,v2n);
	v11h = vis_fexpand(vis_read_hi(v11));
	v11l = vis_fexpand(vis_read_lo(v11));
	v12h = vis_fexpand(vis_read_hi(v12));
	v12l = vis_fexpand(vis_read_lo(v12));
	v21h = vis_fexpand(vis_read_hi(v21));
	v21l = vis_fexpand(vis_read_lo(v21));
	v22h = vis_fexpand(vis_read_hi(v22));
	v22l = vis_fexpand(vis_read_lo(v22));
	vr1h = vis_fpadd16(v11h,vis_fpadd16(v12h,round));
	vr1l = vis_fpadd16(v11l,vis_fpadd16(v12l,round));
	vr2h = vis_fpadd16(v21h,vis_fpadd16(v22h,round));
	vr2l = vis_fpadd16(v21l,vis_fpadd16(v22l,round));
	vr1 = vis_freg_pair(vis_fpack16(vr1h),vis_fpack16(vr1l));
	vr2 = vis_freg_pair(vis_fpack16(vr2h),vis_fpack16(vr2l));
	vis_st64(vr1, dst);
	vis_st64(vr2, dst+rb); dst += 2*rb;
      }
    }
    else if(((int)src & 7) == 7) {
      /*
       *  interpolate horizontally: src not aligned, (src+1) aligned
       */
      vis_d64 v11,v12,v21,v22,v31,v32,v41,v42,v1l,v2l,v3l,v4l,v1r,v2r,v3r,v4r,
	      v1lh,v1ll,v1rh,v1rl,v2lh,v2ll,v2rh,v2rl,
	      v3lh,v3ll,v3rh,v3rl,v4lh,v4ll,v4rh,v4rl, vr1,vr2,vr3,vr4;

      /*
       *  These loops have to be unrolled that much to get enough distance
       *  between the alignaddr and faligndata commands (can't execute
       *  faligndata before the new %gsr has been set) - however the
       *  SUNWspro4.0 compiler does not always honor this intention.
       */
      src_a = vis_alignaddr(src,0);

      for(i=2;i;i--) {
	v11 = vis_ld64((void *)src_a);
	v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v21 = vis_ld64((void *)src_a);
	v22 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v31 = vis_ld64((void *)src_a);
	v32 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v41 = vis_ld64((void *)src_a);
	v42 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v1l = vis_faligndata(v11,v12);
	v2l = vis_faligndata(v21,v22);
	v3l = vis_faligndata(v31,v32);
	v4l = vis_faligndata(v41,v42);
	v1lh = vis_fpadd16(round,vis_fexpand(vis_read_hi(v1l)));
	v1ll = vis_fpadd16(round,vis_fexpand(vis_read_lo(v1l)));
	v2lh = vis_fpadd16(round,vis_fexpand(vis_read_hi(v2l)));
	v2ll = vis_fpadd16(round,vis_fexpand(vis_read_lo(v2l)));
	v3lh = vis_fpadd16(round,vis_fexpand(vis_read_hi(v3l)));
	v3ll = vis_fpadd16(round,vis_fexpand(vis_read_lo(v3l)));
	v4lh = vis_fpadd16(round,vis_fexpand(vis_read_hi(v4l)));
	v4ll = vis_fpadd16(round,vis_fexpand(vis_read_lo(v4l)));
	v1rh = vis_fexpand(vis_read_hi(v12));
	v1rl = vis_fexpand(vis_read_lo(v12));
	v2rh = vis_fexpand(vis_read_hi(v22));
	v2rl = vis_fexpand(vis_read_lo(v22));
	v3rh = vis_fexpand(vis_read_hi(v32));
	v3rl = vis_fexpand(vis_read_lo(v32));
	v4rh = vis_fexpand(vis_read_hi(v42));
	v4rl = vis_fexpand(vis_read_lo(v42));
	vr1 = vis_freg_pair(
		vis_fpack16(vis_fpadd16(v1lh,v1rh)),
		vis_fpack16(vis_fpadd16(v1ll,v1rl)));
	vr2 = vis_freg_pair(
		vis_fpack16(vis_fpadd16(v2lh,v2rh)),
		vis_fpack16(vis_fpadd16(v2ll,v2rl)));
	vr3 = vis_freg_pair(
		vis_fpack16(vis_fpadd16(v3lh,v3rh)),
		vis_fpack16(vis_fpadd16(v3ll,v3rl)));
	vr4 = vis_freg_pair(
		vis_fpack16(vis_fpadd16(v4lh,v4rh)),
		vis_fpack16(vis_fpadd16(v4ll,v4rl)));
	vis_st64(vr1, dst);
	vis_st64(vr2, dst+rb); dst += 2*rb;
	vis_st64(vr3, dst);
	vis_st64(vr4, dst+rb); dst += 2*rb;
      }
    }
    else {
      /*
       *  interpolate horizontally: src not aligned
       */
      vis_d64 v11,v12,v21,v22,v31,v32,v41,v42,v1l,v2l,v3l,v4l,v1r,v2r,v3r,v4r,
	      v1lh,v1ll,v1rh,v1rl,v2lh,v2ll,v2rh,v2rl,
	      v3lh,v3ll,v3rh,v3rl,v4lh,v4ll,v4rh,v4rl, vr1,vr2,vr3,vr4;
      int align_r, off;

      /*
       *  These loops have to be unrolled that much to get enough distance
       *  between the alignaddr and faligndata commands (can't execute
       *  faligndata before the new %gsr has been set) - however the
       *  SUNWspro4.0 compiler does not always honor this intention.
       */
      src_a = vis_alignaddr(src,0);
      align_r = ((int)src & 7) + 1;   /* results to 2..7 */

      for(i=2;i;i--) {
	v11 = vis_ld64((void *)src_a);
	v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v21 = vis_ld64((void *)src_a);
	v22 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v31 = vis_ld64((void *)src_a);
	v32 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v41 = vis_ld64((void *)src_a);
	v42 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v1l = vis_faligndata(v11,v12);
	v2l = vis_faligndata(v21,v22);
	v3l = vis_faligndata(v31,v32);
	v4l = vis_faligndata(v41,v42);
	vis_alignaddr_const(align_r);
	v1lh = vis_fpadd16(round,vis_fexpand(vis_read_hi(v1l)));
	v1ll = vis_fpadd16(round,vis_fexpand(vis_read_lo(v1l)));
	v2lh = vis_fpadd16(round,vis_fexpand(vis_read_hi(v2l)));
	v2ll = vis_fpadd16(round,vis_fexpand(vis_read_lo(v2l)));
	v3lh = vis_fpadd16(round,vis_fexpand(vis_read_hi(v3l)));
	v3ll = vis_fpadd16(round,vis_fexpand(vis_read_lo(v3l)));
	v4lh = vis_fpadd16(round,vis_fexpand(vis_read_hi(v4l)));
	v4ll = vis_fpadd16(round,vis_fexpand(vis_read_lo(v4l)));
	v1r = vis_faligndata(v11,v12);
	v2r = vis_faligndata(v21,v22);
	v3r = vis_faligndata(v31,v32);
	v4r = vis_faligndata(v41,v42);
	vis_alignaddr_const((int)src);
	v1rh = vis_fexpand(vis_read_hi(v1r));
	v1rl = vis_fexpand(vis_read_lo(v1r));
	v2rh = vis_fexpand(vis_read_hi(v2r));
	v2rl = vis_fexpand(vis_read_lo(v2r));
	v3rh = vis_fexpand(vis_read_hi(v3r));
	v3rl = vis_fexpand(vis_read_lo(v3r));
	v4rh = vis_fexpand(vis_read_hi(v4r));
	v4rl = vis_fexpand(vis_read_lo(v4r));
	vr1 = vis_freg_pair(
		vis_fpack16(vis_fpadd16(v1lh,v1rh)),
		vis_fpack16(vis_fpadd16(v1ll,v1rl)));
	vr2 = vis_freg_pair(
		vis_fpack16(vis_fpadd16(v2lh,v2rh)),
		vis_fpack16(vis_fpadd16(v2ll,v2rl)));
	vr3 = vis_freg_pair(
		vis_fpack16(vis_fpadd16(v3lh,v3rh)),
		vis_fpack16(vis_fpadd16(v3ll,v3rl)));
	vr4 = vis_freg_pair(
		vis_fpack16(vis_fpadd16(v4lh,v4rh)),
		vis_fpack16(vis_fpadd16(v4ll,v4rl)));
	vis_st64(vr1, dst); dst += rb;
	vis_st64(vr2, dst); dst += rb;
	vis_st64(vr3, dst); dst += rb;
	vis_st64(vr4, dst); dst += rb;
      }
    }
    break;
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  case 3:

    /* ============  INTERPOLATE 8x8 HORIZONTAL & VERTICAL  ============= */


    round = vis_to_double_quad(2<<4);
    vis_write_gsr((7-4-2)<<3); /* set fpack16-scale to "shift 4+2 right" */
			       /* (the 4 is due to fexpand) */

    if(!((long)src & 7)) {
      /*
       *  interpolate 8x8 hor&vert: src 8-aligned
       */
      vis_d64 v11,v1n,v12,v1sh,v1sl,vr1,
	      v21,v2n,v22,v2sh,v2sl,vr2;

      vis_alignaddr(0,1);

      v11 = vis_ld64((void *)src);
      v1n = vis_ld64((void *)(src + 8)); src += rb;
      v12 = vis_faligndata(v11,v1n);
      v1sh = vis_fpadd16(round,vis_fpadd16(
			 vis_fexpand(vis_read_hi(v11)),
			 vis_fexpand(vis_read_hi(v12))));
      v1sl = vis_fpadd16(round,vis_fpadd16(
			 vis_fexpand(vis_read_lo(v11)),
			 vis_fexpand(vis_read_lo(v12))));

      for(i=4;i;i--) {
	v21 = vis_ld64((void *)src);
	v2n = vis_ld64((void *)(src + 8)); src += rb;
	v22 = vis_faligndata(v21,v2n);
	v2sh = vis_fpadd16(vis_fexpand(vis_read_hi(v21)),
			   vis_fexpand(vis_read_hi(v22)));
	v2sl = vis_fpadd16(vis_fexpand(vis_read_lo(v21)),
			   vis_fexpand(vis_read_lo(v22)));
	vr1 = vis_freg_pair(vis_fpack16(vis_fpadd16(v1sh,v2sh)),
			    vis_fpack16(vis_fpadd16(v1sl,v2sl)));
	v11 = vis_ld64((void *)src);
	v1n = vis_ld64((void *)(src + 8)); src += rb;
	v12 = vis_faligndata(v11,v1n);
	v1sh = vis_fpadd16(round,vis_fpadd16(
			   vis_fexpand(vis_read_hi(v11)),
			   vis_fexpand(vis_read_hi(v12))));
	v1sl = vis_fpadd16(round,vis_fpadd16(
			   vis_fexpand(vis_read_lo(v11)),
			   vis_fexpand(vis_read_lo(v12))));
	vr2 = vis_freg_pair(vis_fpack16(vis_fpadd16(v1sh,v2sh)),
			    vis_fpack16(vis_fpadd16(v1sl,v2sl)));
	vis_st64(vr1, dst); dst += rb;
	vis_st64(vr2, dst); dst += rb;
      }
    }
    else if(((int)src & 7) == 7) {
      /*
       *  interpolate 8x8 hor&vert: src misaligned, but (src+1) aligned
       */
      vis_d64 v11,v12,v21,v22,v1a,v1b,v2a,v2b,v1sh,v1sl,v2sh,v2sl,vr1,vr2,
	      vr1l,vr1r,vr2l,vr2r;
      int align_l, align_m, off;

      src_a = vis_alignaddr(src,0);

      v11 = vis_ld64((void *)src_a);
      v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v1a = vis_faligndata(v11,v12);
      v1b = v12;
      v1sh = vis_fpadd16(round,vis_fpadd16(
			 vis_fexpand(vis_read_hi(v1a)),
			 vis_fexpand(vis_read_hi(v1b))));
      v1sl = vis_fpadd16(round,vis_fpadd16(
			 vis_fexpand(vis_read_lo(v1a)),
			 vis_fexpand(vis_read_lo(v1b))));

      for(i=4;i;i--) {
	v21 = vis_ld64((void *)src_a);
	v22 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v2a = vis_faligndata(v21,v22);
	v2b = v22;
	v2sh = vis_fpadd16(vis_fexpand(vis_read_hi(v2a)),
			   vis_fexpand(vis_read_hi(v2b)));
	v2sl = vis_fpadd16(vis_fexpand(vis_read_lo(v2a)),
			   vis_fexpand(vis_read_lo(v2b)));
	vr1 = vis_freg_pair(vis_fpack16(vis_fpadd16(v1sh,v2sh)),
			    vis_fpack16(vis_fpadd16(v1sl,v2sl)));
	v11 = vis_ld64((void *)src_a);
	v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v1a = vis_faligndata(v11,v12);
	v1b = v12;
	v1sh = vis_fpadd16(round,vis_fpadd16(
			   vis_fexpand(vis_read_hi(v1a)),
			   vis_fexpand(vis_read_hi(v1b))));
	v1sl = vis_fpadd16(round,vis_fpadd16(
			   vis_fexpand(vis_read_lo(v1a)),
			   vis_fexpand(vis_read_lo(v1b))));
	vr2 = vis_freg_pair(vis_fpack16(vis_fpadd16(v1sh,v2sh)),
			    vis_fpack16(vis_fpadd16(v1sl,v2sl)));
	vis_st64(vr1, dst); dst += rb;
	vis_st64(vr2, dst); dst += rb;
      }
    }
    else {
      /*
       *  interpolate 8x8 hor&vert: src and (src+1) not 8-aligned
       */
      vis_d64 v11,v12,v21,v22,v1a,v1b,v2a,v2b,v1sh,v1sl,v2sh,v2sl,vr1,vr2,
	      vr1l,vr1r,vr2l,vr2r;
      int align_l, align_m, off;

      src_a = vis_alignaddr(src,0);
      align_l = ((int)src & 7);
      align_m = align_l + 1;     /* no overflow (>7) */

      v11 = vis_ld64((void *)src_a);
      v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
      v1a = vis_faligndata(v11,v12);
      vis_alignaddr_const(align_m);
      v1b = vis_faligndata(v11,v12);
      vis_alignaddr_const(align_l);
      v1sh = vis_fpadd16(round,vis_fpadd16(
			 vis_fexpand(vis_read_hi(v1a)),
			 vis_fexpand(vis_read_hi(v1b))));
      v1sl = vis_fpadd16(round,vis_fpadd16(
			 vis_fexpand(vis_read_lo(v1a)),
			 vis_fexpand(vis_read_lo(v1b))));

      for(i=4;i;i--) {
	v21 = vis_ld64((void *)src_a);
	v22 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v11 = vis_ld64((void *)src_a);
	v12 = vis_ld64((void *)(src_a+8)); src_a += rb;
	v2a = vis_faligndata(v21,v22);
	v1a = vis_faligndata(v11,v12);
	vis_alignaddr_const(align_m);
	v2sh = vis_fexpand(vis_read_hi(v2a));
	v2sl = vis_fexpand(vis_read_lo(v2a));
	v2b = vis_faligndata(v21,v22);
	v1b = vis_faligndata(v11,v12);
	vis_alignaddr_const(align_l);
	v2sh = vis_fpadd16(v2sh, vis_fexpand(vis_read_hi(v2b)));
	v2sl = vis_fpadd16(v2sl, vis_fexpand(vis_read_lo(v2b)));
	vr1 = vis_freg_pair(vis_fpack16(vis_fpadd16(v1sh,v2sh)),
			    vis_fpack16(vis_fpadd16(v1sl,v2sl)));
	v1sh = vis_fpadd16(round,vis_fpadd16(
			   vis_fexpand(vis_read_hi(v1a)),
			   vis_fexpand(vis_read_hi(v1b))));
	v1sl = vis_fpadd16(round,vis_fpadd16(
			   vis_fexpand(vis_read_lo(v1a)),
			   vis_fexpand(vis_read_lo(v1b))));
	vr2 = vis_freg_pair(vis_fpack16(vis_fpadd16(v1sh,v2sh)),
			    vis_fpack16(vis_fpadd16(v1sl,v2sl)));
	vis_st64(vr1, dst); dst += rb;
	vis_st64(vr2, dst); dst += rb;
      }
    }
    break;
  }
}
/* ========================================================================== */

/*
 *  CopyMB16_vis: copy one macroblock, i.e. a 16x16 Y block together with
 *  its 8x8 U and 8x8 V blocks, using 64-bit VIS loads and stores.
 *  All addresses are 8-byte aligned.
 *
 *    ydst, udst, vdst   upper left corner of the target Y, U and V block
 *    ysrc, usrc, vsrc   upper left corner of the source Y, U and V block
 *
 *  The distance between two scanlines is taken from the globals
 *  Yrowbytes (Y plane) and Crowbytes (U and V plane).
 *
 *  The Y block is moved in 4 bands of 4 scanlines: a band is loaded
 *  completely before it is stored. While a band is loaded, one chroma
 *  scanline is fetched after every Y scanline (U, V, U, V), so all 8 U and
 *  8 V scanlines are buffered once the 4th band has been read. They are
 *  written (U and V alternating) just before the last Y band is stored.
 */

static void
CopyMB16_vis(ydst, udst, vdst, ysrc, usrc, vsrc)
Byte *ydst, *udst, *vdst, *ysrc, *usrc, *vsrc;
{
  vis_d64 ybuf[4][2];          /* one band: 4 Y scanlines, 2 x 8 bytes each */
  vis_d64 ubuf[8], vbuf[8];    /* the complete U and V block */
  Byte *from, *to;
  int band, row, c;

  for(band=0; band<4; band++) {

    /* load 4 Y scanlines, each followed by one chroma scanline */
    for(row=0; row<4; row++) {
      from = ysrc + (4*band + row)*Yrowbytes;
      ybuf[row][0] = vis_ld64((void *)from);
      ybuf[row][1] = vis_ld64((void *)(from+8));

      c = 2*band + row/2;      /* chroma scanline belonging to this Y line */
      if(row & 1) {
        from = vsrc + c*Crowbytes;
        vbuf[c] = vis_ld64((void *)from);
      }
      else {
        from = usrc + c*Crowbytes;
        ubuf[c] = vis_ld64((void *)from);
      }
    }

    /* all chroma data is in registers now: flush it before the last Y band */
    if(band == 3) {
      for(c=0; c<8; c++) {
        to = udst + c*Crowbytes;
        vis_st64(ubuf[c], to);
        to = vdst + c*Crowbytes;
        vis_st64(vbuf[c], to);
      }
    }

    /* store the Y band just loaded */
    for(row=0; row<4; row++) {
      to = ydst + (4*band + row)*Yrowbytes;
      vis_st64(ybuf[row][0], to);
      vis_st64(ybuf[row][1], to+8);
    }
  }
}

#endif  /* !WITHOUT_VIS */

void CompensateHalfPelMotion16(Global *this, Byte *dec, Byte *ref, int xh, int yh, int rb) {
  int j;
  double *dd,*dr;
  unsigned long *d,*r;

#ifndef WITHOUT_VIS
  if (this->Options.use_vis)
    {
      CompensateHalfPelMotion16_vis(dec, ref, xh, yh, rb);
      return;
    }
#endif

  switch ((xh<<1)|yh) {

    /* ================  COPY 16x16 ================== */

  case 0:
#ifdef CLASSIC
    for (j=0; j<16; j++) {
      dec[0] = ref[0]; dec[1] = ref[1]; dec[2] = ref[2]; dec[3] = ref[3];
      dec[4] = ref[4]; dec[5] = ref[5]; dec[6] = ref[6]; dec[7] = ref[7];
      dec[8] = ref[8]; dec[9] = ref[9]; dec[10] = ref[10]; dec[11] = ref[11];
      dec[12]= ref[12];dec[13]= ref[13];dec[14] = ref[14]; dec[15] = ref[15];
      ref += rb;
      dec += rb;
    }
#else /* not CLASSIC */
    switch((long)ref & 7) {
    case 0:
      /*  Source-block is 8-Byte aligned. Can use double-precision floating-
       *  point registers for 64-Bit memory transfers. This is the fastest
       *  way for copying on Sparc-V8 processors.
       */
      dd=(double*)dec;
      dr=(double*)ref;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dd[1] = dr[1]; dr += rb/8; dd += rb/8;
      break;
    case 4:                                                 /* 32-Bit copies */
      /*  Source is only 4-Byte aligned. Can't use FP-Registers, since
       *  there is no way to use Shift/OR on these for alignment handling
       *  (at least not on V8 processors - on V9 we use VIS).
       *  Use 2nd fastest way for copying: 32-bit copies through integer-regs.
       */
      d=(unsigned long*)dec;
      r=(unsigned long*)ref;
      for (j=16; j; j--) {
	d[0] = r[0]; d[1] = r[1]; d[2] = r[2]; d[3] = r[3];
	r += rb/4; d += rb/4;
      }
      break;
    case 5:
    case 1:
      d=(unsigned long*)dec, r=(unsigned long*)((long)ref&~3);
      for (j=16; j; j--) {
	unsigned long r1 = r[0], r2 = r[1], r3 = r[2], r4 = r[3], r5 = r[4];
	r += rb/4;
	d[0] = (r1<<8)|(r2>>24);
	d[1] = (r2<<8)|(r3>>24);
	d[2] = (r3<<8)|(r4>>24);
	d[3] = (r4<<8)|(r5>>24);
	d += rb/4;
      }
      break;
    case 6:
    case 2:
      d=(unsigned long*)dec, r=(unsigned long*)((long)ref&~3);
      for (j=16; j; j--) {
	unsigned long r1 = r[0], r2 = r[1], r3 = r[2], r4 = r[3], r5 = r[4];
	r += rb/4;
	d[0] = (r1<<16)|(r2>>16);
	d[1] = (r2<<16)|(r3>>16);
	d[2] = (r3<<16)|(r4>>16);
	d[3] = (r4<<16)|(r5>>16);
	d += rb/4;
      }
      break;
    case 7:
    case 3:
      d=(unsigned long*)dec, r=(unsigned long*)((long)ref&~3);
      for (j=16; j; j--) {
	unsigned long r1 = r[0], r2 = r[1], r3 = r[2], r4 = r[3], r5 = r[4];
	r += rb/4;
	d[0] = (r1<<24)|(r2>>8);
	d[1] = (r2<<24)|(r3>>8);
	d[2] = (r3<<24)|(r4>>8);
	d[3] = (r4<<24)|(r5>>8);
	d += rb/4;
      }
      break;
    }
#endif /* CLASSIC */
    break;

    /* ================  INTERPOLATE 16x16 VERTICAL ================== */

  case 1:
#ifdef CLASSIC
    ref2 = ref + rb;
    for (j=0; j<16; j++) {
      dec[0] = (unsigned int)(ref[0]+ref2[0]+1)>>1;
      dec[1] = (unsigned int)(ref[1]+ref2[1]+1)>>1;
      dec[2] = (unsigned int)(ref[2]+ref2[2]+1)>>1;
      dec[3] = (unsigned int)(ref[3]+ref2[3]+1)>>1;
      dec[4] = (unsigned int)(ref[4]+ref2[4]+1)>>1;
      dec[5] = (unsigned int)(ref[5]+ref2[5]+1)>>1;
      dec[6] = (unsigned int)(ref[6]+ref2[6]+1)>>1;
      dec[7] = (unsigned int)(ref[7]+ref2[7]+1)>>1;
      dec[8] = (unsigned int)(ref[8]+ref2[8]+1)>>1;
      dec[9] = (unsigned int)(ref[9]+ref2[9]+1)>>1;
      dec[10] = (unsigned int)(ref[10]+ref2[10]+1)>>1;
      dec[11] = (unsigned int)(ref[11]+ref2[11]+1)>>1;
      dec[12] = (unsigned int)(ref[12]+ref2[12]+1)>>1;
      dec[13] = (unsigned int)(ref[13]+ref2[13]+1)>>1;
      dec[14] = (unsigned int)(ref[14]+ref2[14]+1)>>1;
      dec[15] = (unsigned int)(ref[15]+ref2[15]+1)>>1;
      ref += rb;
      ref2 += rb;
      dec += rb;
    }
#else /* not CLASSIC */
    d=(unsigned long*)dec;
#ifndef SLOW
    /*
     *  Optimized with binary arithmetic: add 2 pixel bytes in parallel in
     *  one 32-bit integer register. One register holds byte 0 and 2, the
     *  other 1 and 3. In between, 8 bits are free for overflow on addition.
     *  To achieve division by two, the higher 2 bytes are shifted right
     *  before addition, the lower ones afterwards. The 2 results are
     *  then masked to their 2x8 bit areas, ORed and written as 32 bit
     *  to memory.
     *
     *  Further speedup is achieved by unrolling the loop once. The values
     *  of the last scanline, already loaded and prepared for addition, are
     *  kept in registers between passes of the loop.
     *
     *  Misalignment requires assembling the 2-byte operands from 2
     *  original 32-bit words. This is done by shifting and ORing.
     *  Each misalignment-offset is implemented separately (case 1,2,3).
     */
    switch((long)ref & 3) {
    case 0:
    {
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l,
		    v30h,v30l,v31h,v31l, v40h,v40l,v41h,v41l;
      r=(unsigned long*)ref;
      v10h = (r[0]>>1) & 0x7f807f80;
      v10l = r[0] & 0xff00ff;
      v20h = (r[1]>>1) & 0x7f807f80;
      v20l = r[1] & 0xff00ff;
      v30h = (r[2]>>1) & 0x7f807f80;
      v30l = r[2] & 0xff00ff;
      v40h = (r[3]>>1) & 0x7f807f80;
      v40l = r[3] & 0xff00ff;
      r += rb/4;
      for (j=8; j; j--) {
	v11h = ((r[0]>>1) & 0x7f807f80)+ 0x800080;
	v11l = (r[0] & 0xff00ff) + 0x10001;
	v21h = ((r[1]>>1) & 0x7f807f80)+ 0x800080;
	v21l = (r[1] & 0xff00ff) + 0x10001;
	v31h = ((r[2]>>1) & 0x7f807f80)+ 0x800080;
	v31l = (r[2] & 0xff00ff) + 0x10001;
	v41h = ((r[3]>>1) & 0x7f807f80)+ 0x800080;
	v41l = (r[3] & 0xff00ff) + 0x10001;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x1fe01fe) >>1);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x1fe01fe) >>1);
	d += rb/4;
	v10h = (r[0]>>1) & 0x7f807f80;
	v10l = r[0] & 0xff00ff;
	v20h = (r[1]>>1) & 0x7f807f80;
	v20l = r[1] & 0xff00ff;
	v30h = (r[2]>>1) & 0x7f807f80;
	v30l = r[2] & 0xff00ff;
	v40h = (r[3]>>1) & 0x7f807f80;
	v40l = r[3] & 0xff00ff;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x1fe01fe) >>1);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x1fe01fe) >>1);
	d += rb/4;
      }
    }
    break;
    case 1:
    {
      /*
       *  Misalignment = 1.
       *  This allows to spare the Shift and OR on the upper sum.
       */
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l,
		    v30h,v30l,v31h,v31l, v40h,v40l,v41h,v41l,
		    r1,r2,r3,r4,r5;
      r=(unsigned long*)((long)ref & ~3);

      r1 = r[0]; r2 = r[1]; r3 = r[2]; r4 = r[3]; r5 = r[4];
      v10h = (r1<<7) & 0x7f807f80;
      v10l = ((r1<<8)|(r2>>24)) & 0xff00ff;
      v20h = (r2<<7) & 0x7f807f80;
      v20l = ((r2<<8)|(r3>>24)) & 0xff00ff;
      v30h = (r3<<7) & 0x7f807f80;
      v30l = ((r3<<8)|(r4>>24)) & 0xff00ff;
      v40h = (r4<<7) & 0x7f807f80;
      v40l = ((r4<<8)|(r5>>24)) & 0xff00ff;
      r += rb/4;
      for (j=8; j; j--) {
	r1 = r[0]; r2 = r[1]; r3 = r[2]; r4 = r[3]; r5 = r[4];
	v11h = ((r1<<7) & 0x7f807f80) + 0x800080;
	v11l = (((r1<<8)|(r2>>24)) & 0xff00ff) + 0x10001;
	v21h = ((r2<<7) & 0x7f807f80) + 0x800080;
	v21l = (((r2<<8)|(r3>>24)) & 0xff00ff) + 0x10001;
	v31h = ((r3<<7) & 0x7f807f80) + 0x800080;
	v31l = (((r3<<8)|(r4>>24)) & 0xff00ff) + 0x10001;
	v41h = ((r4<<7) & 0x7f807f80) + 0x800080;
	v41l = (((r4<<8)|(r5>>24)) & 0xff00ff) + 0x10001;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x1fe01fe) >>1);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x1fe01fe) >>1);
	d += rb/4;
	r1 = r[0]; r2 = r[1]; r3 = r[2]; r4 = r[3]; r5 = r[4];
	v10h = (r1<<7) & 0x7f807f80;
	v10l = ((r1<<8)|(r2>>24)) & 0xff00ff;
	v20h = (r2<<7) & 0x7f807f80;
	v20l = ((r2<<8)|(r3>>24)) & 0xff00ff;
	v30h = (r3<<7) & 0x7f807f80;
	v30l = ((r3<<8)|(r4>>24)) & 0xff00ff;
	v40h = (r4<<7) & 0x7f807f80;
	v40l = ((r4<<8)|(r5>>24)) & 0xff00ff;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x1fe01fe) >>1);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x1fe01fe) >>1);
	d += rb/4;
      }
    }
    break;
    case 2:
    {
      /*
       *  Misalignment = 2.  Have to Shift and OR every operand.
       */
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l,
		    v30h,v30l,v31h,v31l, v40h,v40l,v41h,v41l,
		    r1,r2,r3,r4,r5;
      int le=16,re=16;
      r=(unsigned long*)((long)ref & ~3);

      r1 = r[0]; r2 = r[1]; r3 = r[2]; r4 = r[3]; r5 = r[4];
      v10h = ((r1<<15)|(r2>>17)) & 0x7f807f80;
      v10l = ((r1<<16)|(r2>>16)) & 0xff00ff;
      v20h = ((r2<<15)|(r3>>17)) & 0x7f807f80;
      v20l = ((r2<<16)|(r3>>16)) & 0xff00ff;
      v30h = ((r3<<15)|(r4>>17)) & 0x7f807f80;
      v30l = ((r3<<16)|(r4>>16)) & 0xff00ff;
      v40h = ((r4<<15)|(r5>>17)) & 0x7f807f80;
      v40l = ((r4<<16)|(r5>>16)) & 0xff00ff;
      r += rb/4;
      for (j=8; j; j--) {
	r1 = r[0]; r2 = r[1]; r3 = r[2]; r4 = r[3]; r5 = r[4];
	v11h = ((((r1<<le)|(r2>>re))>>1) & 0x7f807f80)+ 0x800080;
	v11l = (((r1<<le)|(r2>>re)) & 0xff00ff) + 0x10001;
	v21h = ((((r2<<le)|(r3>>re))>>1) & 0x7f807f80)+ 0x800080;
	v21l = (((r2<<le)|(r3>>re)) & 0xff00ff) + 0x10001;
	v31h = ((((r3<<le)|(r4>>re))>>1) & 0x7f807f80)+ 0x800080;
	v31l = (((r3<<le)|(r4>>re)) & 0xff00ff) + 0x10001;
	v41h = ((((r4<<le)|(r5>>re))>>1) & 0x7f807f80)+ 0x800080;
	v41l = (((r4<<le)|(r5>>re)) & 0xff00ff) + 0x10001;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x1fe01fe) >>1);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x1fe01fe) >>1);
	d += rb/4;
	r1 = r[0]; r2 = r[1]; r3 = r[2]; r4 = r[3]; r5 = r[4];
	v10h = (((r1<<le)|(r2>>re))>>1) & 0x7f807f80;
	v10l = ((r1<<le)|(r2>>re)) & 0xff00ff;
	v20h = (((r2<<le)|(r3>>re))>>1) & 0x7f807f80;
	v20l = ((r2<<le)|(r3>>re)) & 0xff00ff;
	v30h = (((r3<<le)|(r4>>re))>>1) & 0x7f807f80;
	v30l = ((r3<<le)|(r4>>re)) & 0xff00ff;
	v40h = (((r4<<le)|(r5>>re))>>1) & 0x7f807f80;
	v40l = ((r4<<le)|(r5>>re)) & 0xff00ff;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x1fe01fe) >>1);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x1fe01fe) >>1);
	d += rb/4;
      }
    }
    break;
    case 3:
    {
      /*
       *  Spare one Shift and OR on lower sum
       */
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l,
		    v30h,v30l,v31h,v31l, v40h,v40l,v41h,v41l,
		    r1,r2,r3,r4,r5;
      r=(unsigned long*)((long)ref & ~3);

      r1 = r[0]; r2 = r[1]; r3 = r[2]; r4 = r[3]; r5 = r[4];
      v10h = ((r1<<23)|(r2>>9)) & 0x7f807f80;
      v10l = (r2>>8) & 0xff00ff;
      v20h = ((r2<<23)|(r3>>9)) & 0x7f807f80;
      v20l = (r3>>8) & 0xff00ff;
      v30h = ((r3<<23)|(r4>>9)) & 0x7f807f80;
      v30l = (r4>>8) & 0xff00ff;
      v40h = ((r4<<23)|(r5>>9)) & 0x7f807f80;
      v40l = (r5>>8) & 0xff00ff;
      r += rb/4;
      for (j=8; j; j--) {
	r1 = r[0]; r2 = r[1]; r3 = r[2]; r4 = r[3]; r5 = r[4];
	v11h = (((r1<<23)|(r2>>9)) & 0x7f807f80) + 0x800080;
	v11l = ((r2>>8) & 0xff00ff) + 0x10001;
	v21h = (((r2<<23)|(r3>>9)) & 0x7f807f80) + 0x800080;
	v21l = ((r3>>8) & 0xff00ff) + 0x10001;
	v31h = (((r3<<23)|(r4>>9)) & 0x7f807f80) + 0x800080;
	v31l = ((r4>>8) & 0xff00ff) + 0x10001;
	v41h = (((r4<<23)|(r5>>9)) & 0x7f807f80) + 0x800080;
	v41l = ((r5>>8) & 0xff00ff) + 0x10001;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x1fe01fe) >>1);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x1fe01fe) >>1);
	d += rb/4;
	r1 = r[0]; r2 = r[1]; r3 = r[2]; r4 = r[3]; r5 = r[4];
	v10h = ((r1<<23)|(r2>>9)) & 0x7f807f80;
	v10l = (r2>>8) & 0xff00ff;
	v20h = ((r2<<23)|(r3>>9)) & 0x7f807f80;
	v20l = (r3>>8) & 0xff00ff;
	v30h = ((r3<<23)|(r4>>9)) & 0x7f807f80;
	v30l = (r4>>8) & 0xff00ff;
	v40h = ((r4<<23)|(r5>>9)) & 0x7f807f80;
	v40l = (r5>>8) & 0xff00ff;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x1fe01fe) >>1);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x1fe01fe) >>1);
	d += rb/4;
      }
    } /* end case */
    } /* end switch */

#else /* SLOW */
    /*
     *  Alternate implementation for vertical interpolation: do bytewise
     *  addition, but collect the results in 32-bit registers. This is slower
     *  than binary arithmetics, but still faster than classic procedures.
     */
    ref2 = ref + rb;
    for (j=16; j; j--) {
      d[0] = ((((ref[0]+ref2[0]+1)<<23) |
	       ((ref[2]+ref2[2]+1)<<7)) & 0xff00ff00) |
	     (((ref[1]+ref2[1]+1)&0x1fe)<<15) |
	     ((ref[3]+ref2[3]+1)>>1);
      d[1] = ((((ref[4]+ref2[4]+1)<<23) |
	       ((ref[6]+ref2[6]+1)<<7)) & 0xff00ff00) |
	     (((ref[5]+ref2[5]+1)&0x1fe)<<15) |
	     ((ref[7]+ref2[7]+1)>>1);
      d[2] = ((((ref[8]+ref2[8]+1)<<23) |
	       ((ref[10]+ref2[10]+1)<<7)) & 0xff00ff00) |
	     (((ref[9]+ref2[9]+1)&0x1fe)<<15) |
	     ((ref[11]+ref2[11]+1)>>1);
      d[3] = ((((ref[12]+ref2[12]+1)<<23) |
	       ((ref[14]+ref2[14]+1)<<7)) & 0xff00ff00) |
	     (((ref[13]+ref2[13]+1)&0x1fe)<<15) |
	     ((ref[15]+ref2[15]+1)>>1);
      ref += rb;
      ref2 += rb;
      d += rb/4;
    }
#endif /* SLOW */
#endif /* not CLASSIC */
    break;

    /* ================  INTERPOLATE 16x16 HORIZONTAL ================== */

  case 2:
#ifdef CLASSIC
    for (j=0; j<16; j++) {
      s1 = ref[0];
      dec[0] = (unsigned int)(s1+(s2=ref[1])+1)>>1;
      dec[1] = (unsigned int)(s2+(s1=ref[2])+1)>>1;
      dec[2] = (unsigned int)(s1+(s2=ref[3])+1)>>1;
      dec[3] = (unsigned int)(s2+(s1=ref[4])+1)>>1;
      dec[4] = (unsigned int)(s1+(s2=ref[5])+1)>>1;
      dec[5] = (unsigned int)(s2+(s1=ref[6])+1)>>1;
      dec[6] = (unsigned int)(s1+(s2=ref[7])+1)>>1;
      dec[7] = (unsigned int)(s2+(s1=ref[8])+1)>>1;
      dec[8] = (unsigned int)(s1+(s2=ref[9])+1)>>1;
      dec[9] = (unsigned int)(s2+(s1=ref[10])+1)>>1;
      dec[10] = (unsigned int)(s1+(s2=ref[11])+1)>>1;
      dec[11] = (unsigned int)(s2+(s1=ref[12])+1)>>1;
      dec[12] = (unsigned int)(s1+(s2=ref[13])+1)>>1;
      dec[13] = (unsigned int)(s2+(s1=ref[14])+1)>>1;
      dec[14] = (unsigned int)(s1+(s2=ref[15])+1)>>1;
      dec[15] = (unsigned int)(s2+ref[16]+1)>>1;
 
      ref += rb;
      dec += rb;
    }
#else /* not CLASSIC */
#ifdef SLOW
    /*
     *  For horizontal interpolation binary arithmetic is actually slower
     *  than just 32 bit stores, due to overhead in shifting the operands
     *  twice. Also, the trick of keeping prepared operands between passes
     *  of the loop doesn't work with horizontal interpolation.
     */
    d=(unsigned long*)dec;
    if(((long)ref & 3) == 0) {
      unsigned long v10h,v10l,v11h,v11l;
      r=(unsigned long*)ref;
      for (j=16; j; j--) {
	v10h = (r[0]>>1) & 0x7f807f80;
	v10l = r[0] & 0xff00ff;
	v11h = (r[0]<<7) & 0x7f807f80;
	v11l = ((r[0] & 0xff00)<<8) | (r[1]>>24);
	d[0] = ((v10h + v11h + 0x800080) & 0xff00ff00) |
	       (((v10l + v11l  + 0x10001) & 0x1fe01fe) >>1);
	v10h = (r[1]>>1) & 0x7f807f80;
	v10l = r[1] & 0xff00ff;
	v11h = (r[1]<<7) & 0x7f807f80;
	v11l = ((r[1] & 0xff00)<<8) | (r[2]>>24);
	d[1] = ((v10h + v11h + 0x800080) & 0xff00ff00) |
	       (((v10l + v11l  + 0x10001) & 0x1fe01fe) >>1);
	v10h = (r[2]>>1) & 0x7f807f80;
	v10l = r[2] & 0xff00ff;
	v11h = (r[2]<<7) & 0x7f807f80;
	v11l = ((r[2] & 0xff00)<<8) | (r[3]>>24);
	d[2] = ((v10h + v11h + 0x800080) & 0xff00ff00) |
	       (((v10l + v11l  + 0x10001) & 0x1fe01fe) >>1);
	v10h = (r[3]>>1) & 0x7f807f80;
	v10l = r[3] & 0xff00ff;
	v11h = (r[3]<<7) & 0x7f807f80;
	v11l = ((r[3] & 0xff00)<<8) | *((Byte*)r+16);
	d[3] = ((v10h + v11h + 0x800080) & 0xff00ff00) |
	       (((v10l + v11l  + 0x10001) & 0x1fe01fe) >>1);
	r += rb/4;
	d += rb/4;
      }
    }
    else {
      unsigned long v10h,v10l,v11h,v11l,r0,r1,r2,r3,r4;
      int le,re;

      r=(unsigned long*)((long)ref & ~3);
      le = ((long)ref & 3)<<3;
      re = 32-le;

      for (j=16; j; j--) {
	r0 = r[0]; r1 = r[1]; r2 = r[2]; r3 = r[3]; r4 = r[4];
	r0 = (r0<<le)|(r1>>re);
	r1 = (r1<<le)|(r2>>re);
	r2 = (r2<<le)|(r3>>re);
	r3 = (r3<<le)|(r4>>re);

	v10h = (r0>>1) & 0x7f807f80;
	v10l = r0 & 0xff00ff;
	v11h = (r0<<7) & 0x7f807f80;
	v11l = ((r0 & 0xff00)<<8) | (r1>>24);
	d[0] = ((v10h + v11h + 0x800080) & 0xff00ff00) |
	       (((v10l + v11l  + 0x10001) & 0x1fe01fe) >>1);
	v10h = (r1>>1) & 0x7f807f80;
	v10l = r1 & 0xff00ff;
	v11h = (r1<<7) & 0x7f807f80;
	v11l = ((r1 & 0xff00)<<8) | (r2>>24);
	d[1] = ((v10h + v11h + 0x800080) & 0xff00ff00) |
	       (((v10l + v11l  + 0x10001) & 0x1fe01fe) >>1);
	v10h = (r2>>1) & 0x7f807f80;
	v10l = r2 & 0xff00ff;
	v11h = (r2<<7) & 0x7f807f80;
	v11l = ((r2 & 0xff00)<<8) | (r3>>24);
	d[2] = ((v10h + v11h + 0x800080) & 0xff00ff00) |
	       (((v10l + v11l  + 0x10001) & 0x1fe01fe) >>1);
	v10h = (r3>>1) & 0x7f807f80;
	v10l = r3 & 0xff00ff;
	v11h = (r3<<7) & 0x7f807f80;
	v11l = ((r3 & 0xff00)<<8) | *((Byte*)r+16+((unsigned long)ref&3));
	d[3] = ((v10h + v11h + 0x800080) & 0xff00ff00) |
	       (((v10l + v11l  + 0x10001) & 0x1fe01fe) >>1);
	r += rb/4;
	d += rb/4;
      }
    }
#else /* not SLOW */
    /*
     *  Do horizontal interpolation by adding byte-wise and assembling the
     *  results in 32-bit integer registers.
     */
    d=(unsigned long*)dec;
    for (j=16; j; j--) {
      unsigned int s0,s1,s2,s3,s4;
      /* must not do assignments inside arithmetik like in classic version,
       * since order of assignments is not defined after cg/optimizer!
       */
      s1 = ref[1]+1; s2 = ref[2]; s3 = ref[3]+1; s4 = ref[4];
      /* can combine masking of 2nd and 4th Byte, spares one AND */
      d[0] = ((((ref[0]+s1)<<23) | ((s2+s3)<<7)) & 0xff00ff00) |
             (((s1+s2)&0x01fe)<<15) | ((s3+s4)>>1);
      s1 = ref[5]+1; s2 = ref[6]; s3 = ref[7]+1; s0 = ref[8];
      d[1] = ((((s4+s1)<<23) | ((s2+s3)<<7)) & 0xff00ff00) |
             (((s1+s2)&0x01fe)<<15) | ((s3+s0)>>1);
      s1 = ref[9]+1; s2 = ref[10]; s3 = ref[11]+1; s4 = ref[12];
      d[2] = ((((s0+s1)<<23) | ((s2+s3)<<7)) & 0xff00ff00) |
             (((s1+s2)&0x01fe)<<15) | ((s3+s4)>>1);
      s1 = ref[13]+1; s2 = ref[14]; s3 = ref[15]+1; s0 = ref[16];
      d[3] = ((((s4+s1)<<23) | ((s2+s3)<<7)) & 0xff00ff00) |
             (((s1+s2)&0x01fe)<<15) | ((s3+s0)>>1);
      ref += rb;
      d += rb/4;
    }
#endif /* SLOW */
#endif
    break;

    /* ============  INTERPOLATE 16x16 HORIZONTAL & VERTICAL ============= */

  case 3:
#ifdef CLASSIC
    ref2 = ref + rb;
    for (j=0; j<16; j++) {
      s1 = ref[0]; s3 = ref2[0];
      dec[0] = (s1+(s2=ref[1])+s3+(s4=ref2[1])+2)>>2;
      dec[1] = (s2+(s1=ref[2])+s4+(s3=ref2[2])+2)>>2;
      dec[2] = (s1+(s2=ref[3])+s3+(s4=ref2[3])+2)>>2;
      dec[3] = (s2+(s1=ref[4])+s4+(s3=ref2[4])+2)>>2;
      dec[4] = (s1+(s2=ref[5])+s3+(s4=ref2[5])+2)>>2;
      dec[5] = (s2+(s1=ref[6])+s4+(s3=ref2[6])+2)>>2;
      dec[6] = (s1+(s2=ref[7])+s3+(s4=ref2[7])+2)>>2;
      dec[7] = (s2+(s1=ref[8])+s4+(s3=ref2[8])+2)>>2;
      dec[8] = (s1+(s2=ref[9])+s3+(s4=ref2[9])+2)>>2;
      dec[9] = (s2+(s1=ref[10])+s4+(s3=ref2[10])+2)>>2;
      dec[10] = (s1+(s2=ref[11])+s3+(s4=ref2[11])+2)>>2;
      dec[11] = (s2+(s1=ref[12])+s4+(s3=ref2[12])+2)>>2;
      dec[12] = (s1+(s2=ref[13])+s3+(s4=ref2[13])+2)>>2;
      dec[13] = (s2+(s1=ref[14])+s4+(s3=ref2[14])+2)>>2;
      dec[14] = (s1+(s2=ref[15])+s3+(s4=ref2[15])+2)>>2;
      dec[15] = (s2+ref[16]+s4+ref2[16]+2)>>2;

      ref += rb;
      ref2 += rb;
      dec += rb;
    }
#else /* not CLASSIC */
    /*
     *  Use binary arithmetics to do horizontal and vertical interpolation.
     *  Everything said for vertical interpolation applies here too.
     *  With misalignment (esp. 1 and 3), some Shift/OR often can be spared,
     *  when all bytes needed to assemble an 2-byte operand fall into one
     *  32-bit loaded value only.
     */
    d=(unsigned long*)dec;
    switch((long)ref & 3) {
    case 0:
    {
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l,
		    v30h,v30l,v31h,v31l, v40h,v40l,v41h,v41l;
      r=(unsigned long*)ref;
      v10h = ((r[0]>>2) & 0x3fc03fc0) + ((r[0]<<6) & 0x3fc03fc0);
      v10l = (r[0] & 0xff00ff) + (((r[0] & 0xff00)<<8) | (r[1]>>24));
      v20h = ((r[1]>>2) & 0x3fc03fc0) + ((r[1]<<6) & 0x3fc03fc0);
      v20l = (r[1] & 0xff00ff) + (((r[1] & 0xff00)<<8) | (r[2]>>24));
      v30h = ((r[2]>>2) & 0x3fc03fc0) + ((r[2]<<6) & 0x3fc03fc0);
      v30l = (r[2] & 0xff00ff) + (((r[2] & 0xff00)<<8) | (r[3]>>24));
      v40h = ((r[3]>>2) & 0x3fc03fc0) + ((r[3]<<6) & 0x3fc03fc0);
      v40l = (r[3] & 0xff00ff) + (((r[3] & 0xff00)<<8) | (r[4]>>24));
      r += rb/4;

      for (j=8; j; j--) {
	v11h = ((r[0]>>2) & 0x3fc03fc0) + ((r[0]<<6) & 0x3fc03fc0) + 0x800080;
	v11l = (r[0] & 0xff00ff) + (((r[0] & 0xff00)<<8) | (r[1]>>24)) +0x20002;
	v21h = ((r[1]>>2) & 0x3fc03fc0) + ((r[1]<<6) & 0x3fc03fc0) + 0x800080;
	v21l = (r[1] & 0xff00ff) + (((r[1] & 0xff00)<<8) | (r[2]>>24)) +0x20002;
	v31h = ((r[2]>>2) & 0x3fc03fc0) + ((r[2]<<6) & 0x3fc03fc0) + 0x800080;
	v31l = (r[2] & 0xff00ff) + (((r[2] & 0xff00)<<8) | (r[3]>>24)) +0x20002;
	v41h = ((r[3]>>2) & 0x3fc03fc0) + ((r[3]<<6) & 0x3fc03fc0) + 0x800080;
	v41l = (r[3] & 0xff00ff) + (((r[3] & 0xff00)<<8) | (r[4]>>24)) +0x20002;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x3fc03fc)>>2);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x3fc03fc)>>2);
	d += rb/4;
	v10h = ((r[0]>>2) & 0x3fc03fc0) + ((r[0]<<6) & 0x3fc03fc0);
	v10l = (r[0] & 0xff00ff) + (((r[0] & 0xff00)<<8) | (r[1]>>24));
	v20h = ((r[1]>>2) & 0x3fc03fc0) + ((r[1]<<6) & 0x3fc03fc0);
	v20l = (r[1] & 0xff00ff) + (((r[1] & 0xff00)<<8) | (r[2]>>24));
	v30h = ((r[2]>>2) & 0x3fc03fc0) + ((r[2]<<6) & 0x3fc03fc0);
	v30l = (r[2] & 0xff00ff) + (((r[2] & 0xff00)<<8) | (r[3]>>24));
	v40h = ((r[3]>>2) & 0x3fc03fc0) + ((r[3]<<6) & 0x3fc03fc0);
	v40l = (r[3] & 0xff00ff) + (((r[3] & 0xff00)<<8) | (r[4]>>24));
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x3fc03fc)>>2);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x3fc03fc)>>2);
	d += rb/4;
      }
    } break;
    case 1: {
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l,
		    v30h,v30l,v31h,v31l, v40h,v40l,v41h,v41l;
      r=(unsigned long*)((long)ref & ~3);
      v10h = ((r[0]<<6) & 0x3fc03fc0) +
             ((((r[0] & 0xff00)<<14) | (r[1]>>18)) & 0x3fc03fc0);
      v10l = (((r[0] & 0xff00)<<8)|r[1]>>24) +
             (((r[0]& 0xff)<<16)|((r[1]>>16)&0xff));
      v20h = ((r[1]<<6) & 0x3fc03fc0) +
             ((((r[1] & 0xff00)<<14) | (r[2]>>18)) & 0x3fc03fc0);
      v20l = (((r[1] & 0xff00)<<8)|r[2]>>24) +
             (((r[1]& 0xff)<<16)|((r[2]>>16)&0xff));
      v30h = ((r[2]<<6) & 0x3fc03fc0) +
             ((((r[2] & 0xff00)<<14) | (r[3]>>18)) & 0x3fc03fc0);
      v30l = (((r[2] & 0xff00)<<8)|r[3]>>24) +
             (((r[2]& 0xff)<<16)|((r[3]>>16)&0xff));
      v40h = ((r[3]<<6) & 0x3fc03fc0) +
             ((((r[3] & 0xff00)<<14) | (r[4]>>18)) & 0x3fc03fc0);
      v40l = (((r[3] & 0xff00)<<8)|r[4]>>24) +
             (((r[3]& 0xff)<<16)|((r[4]>>16)&0xff));
      r += rb/4;

      for (j=8; j; j--) {
	v11h = ((r[0]<<6) & 0x3fc03fc0) +
	       ((((r[0] & 0xff00)<<14) | (r[1]>>18)) & 0x3fc03fc0) + 0x800080;
	v11l = (((r[0] & 0xff00)<<8)|r[1]>>24) +
	       (((r[0]& 0xff)<<16)|((r[1]>>16)&0xff)) + 0x20002;
	v21h = ((r[1]<<6) & 0x3fc03fc0) +
	       ((((r[1] & 0xff00)<<14) | (r[2]>>18)) & 0x3fc03fc0) + 0x800080;
	v21l = (((r[1] & 0xff00)<<8)|r[2]>>24) +
	       (((r[1]& 0xff)<<16)|((r[2]>>16)&0xff)) + 0x20002;
	v31h = ((r[2]<<6) & 0x3fc03fc0) +
	       ((((r[2] & 0xff00)<<14) | (r[3]>>18)) & 0x3fc03fc0) + 0x800080;
	v31l = (((r[2] & 0xff00)<<8)|r[3]>>24) +
	       (((r[2]& 0xff)<<16)|((r[3]>>16)&0xff)) + 0x20002;
	v41h = ((r[3]<<6) & 0x3fc03fc0) +
	       ((((r[3] & 0xff00)<<14) | (r[4]>>18)) & 0x3fc03fc0) + 0x800080;
	v41l = (((r[3] & 0xff00)<<8)|r[4]>>24) +
	       (((r[3]& 0xff)<<16)|((r[4]>>16)&0xff)) + 0x20002;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x3fc03fc)>>2);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x3fc03fc)>>2);
	d += rb/4;
	v10h = ((r[0]<<6) & 0x3fc03fc0) +
	       ((((r[0] & 0xff00)<<14) | (r[1]>>18)) & 0x3fc03fc0);
	v10l = (((r[0] & 0xff00)<<8)|r[1]>>24) +
	       (((r[0]& 0xff)<<16)|((r[1]>>16)&0xff));
	v20h = ((r[1]<<6) & 0x3fc03fc0) +
	       ((((r[1] & 0xff00)<<14) | (r[2]>>18)) & 0x3fc03fc0);
	v20l = (((r[1] & 0xff00)<<8)|r[2]>>24) +
	       (((r[1]& 0xff)<<16)|((r[2]>>16)&0xff));
	v30h = ((r[2]<<6) & 0x3fc03fc0) +
	       ((((r[2] & 0xff00)<<14) | (r[3]>>18)) & 0x3fc03fc0);
	v30l = (((r[2] & 0xff00)<<8)|r[3]>>24) +
	       (((r[2]& 0xff)<<16)|((r[3]>>16)&0xff));
	v40h = ((r[3]<<6) & 0x3fc03fc0) +
	       ((((r[3] & 0xff00)<<14) | (r[4]>>18)) & 0x3fc03fc0);
	v40l = (((r[3] & 0xff00)<<8)|r[4]>>24) +
	       (((r[3]& 0xff)<<16)|((r[4]>>16)&0xff));
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x3fc03fc)>>2);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x3fc03fc)>>2);
	d += rb/4;
      }
    } break;
    case 2:
    {
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l,
		    v30h,v30l,v31h,v31l, v40h,v40l,v41h,v41l;
      r=(unsigned long*)((long)ref & ~3);
      v10h = (((r[0] & 0xff00)<<14) | ((r[1]>>18) & 0x3fc0)) +
             (((r[0] & 0xff)<<22) | ((r[1]>>10) & 0x3fc0));
      v10l = (((r[0] & 0xff)<<16)|((r[1]>>16)&0xff)) + ((r[1]>>8) & 0xff00ff);
      v20h = (((r[1] & 0xff00)<<14) | ((r[2]>>18) & 0x3fc0)) +
             (((r[1] & 0xff)<<22) | ((r[2]>>10) & 0x3fc0));
      v20l = (((r[1] & 0xff)<<16)|((r[2]>>16)&0xff)) + ((r[2]>>8) & 0xff00ff);
      v30h = (((r[2] & 0xff00)<<14) | ((r[3]>>18) & 0x3fc0)) +
             (((r[2] & 0xff)<<22) | ((r[3]>>10) & 0x3fc0));
      v30l = (((r[2] & 0xff)<<16)|((r[3]>>16)&0xff)) + ((r[3]>>8) & 0xff00ff);
      v40h = (((r[3] & 0xff00)<<14) | ((r[4]>>18) & 0x3fc0)) +
             (((r[3] & 0xff)<<22) | ((r[4]>>10) & 0x3fc0));
      v40l = (((r[3] & 0xff)<<16)|((r[4]>>16)&0xff)) + ((r[4]>>8) & 0xff00ff);
      r += rb/4;

      for (j=8; j; j--) {
	v11h = (((r[0] & 0xff00)<<14) | ((r[1]>>18) & 0x3fc0)) +
	       (((r[0] & 0xff)<<22) | ((r[1]>>10) & 0x3fc0)) + 0x800080;
	v11l = (((r[0] & 0xff)<<16) | ((r[1]>>16)&0xff)) +
	       ((r[1]>>8) & 0xff00ff) + 0x20002;
	v21h = (((r[1] & 0xff00)<<14) | ((r[2]>>18) & 0x3fc0)) +
	       (((r[1] & 0xff)<<22) | ((r[2]>>10) & 0x3fc0)) + 0x800080;
	v21l = (((r[1] & 0xff)<<16) | ((r[2]>>16)&0xff)) +
	       ((r[2]>>8) & 0xff00ff) + 0x20002;
	v31h = (((r[2] & 0xff00)<<14) | ((r[3]>>18) & 0x3fc0)) +
	       (((r[2] & 0xff)<<22) | ((r[3]>>10) & 0x3fc0)) + 0x800080;
	v31l = (((r[2] & 0xff)<<16) | ((r[3]>>16)&0xff)) +
	       ((r[3]>>8) & 0xff00ff) + 0x20002;
	v41h = (((r[3] & 0xff00)<<14) | ((r[4]>>18) & 0x3fc0)) +
	       (((r[3] & 0xff)<<22) | ((r[4]>>10) & 0x3fc0)) + 0x800080;
	v41l = (((r[3] & 0xff)<<16) | ((r[4]>>16)&0xff)) +
	       ((r[4]>>8) & 0xff00ff) + 0x20002;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x3fc03fc)>>2);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x3fc03fc)>>2);
	d += rb/4;
	v10h = (((r[0] & 0xff00)<<14) | ((r[1]>>18) & 0x3fc0)) +
	       (((r[0] & 0xff)<<22) | ((r[1]>>10) & 0x3fc0));
	v10l = (((r[0] & 0xff)<<16)|((r[1]>>16)&0xff)) + ((r[1]>>8) & 0xff00ff);
	v20h = (((r[1] & 0xff00)<<14) | ((r[2]>>18) & 0x3fc0)) +
	       (((r[1] & 0xff)<<22) | ((r[2]>>10) & 0x3fc0));
	v20l = (((r[1] & 0xff)<<16)|((r[2]>>16)&0xff)) + ((r[2]>>8) & 0xff00ff);
	v30h = (((r[2] & 0xff00)<<14) | ((r[3]>>18) & 0x3fc0)) +
	       (((r[2] & 0xff)<<22) | ((r[3]>>10) & 0x3fc0));
	v30l = (((r[2] & 0xff)<<16)|((r[3]>>16)&0xff)) + ((r[3]>>8) & 0xff00ff);
	v40h = (((r[3] & 0xff00)<<14) | ((r[4]>>18) & 0x3fc0)) +
	       (((r[3] & 0xff)<<22) | ((r[4]>>10) & 0x3fc0));
	v40l = (((r[3] & 0xff)<<16)|((r[4]>>16)&0xff)) + ((r[4]>>8) & 0xff00ff);
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x3fc03fc)>>2);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x3fc03fc)>>2);
	d += rb/4;
      }
    } break;
    case 3:
    {
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l,
		    v30h,v30l,v31h,v31l, v40h,v40l,v41h,v41l;
      r=(unsigned long*)((long)ref & ~3);
      v10h = (((r[0]<<22)|(r[1]>>10)) & 0x3fc03fc0) + ((r[1]>>2) & 0x3fc03fc0);
      v10l = ((r[1]>>8) & 0xff00ff) + (r[1] & 0xff00ff);
      v20h = (((r[1]<<22)|(r[2]>>10)) & 0x3fc03fc0) + ((r[2]>>2) & 0x3fc03fc0);
      v20l = ((r[2]>>8) & 0xff00ff) + (r[2] & 0xff00ff);
      v30h = (((r[2]<<22)|(r[3]>>10)) & 0x3fc03fc0) + ((r[3]>>2) & 0x3fc03fc0);
      v30l = ((r[3]>>8) & 0xff00ff) + (r[3] & 0xff00ff);
      v40h = (((r[3]<<22)|(r[4]>>10)) & 0x3fc03fc0) + ((r[4]>>2) & 0x3fc03fc0);
      v40l = ((r[4]>>8) & 0xff00ff) + (r[4] & 0xff00ff);
      r += rb/4;

      for (j=8; j; j--) {
	v11h = (((r[0]<<22)|(r[1]>>10)) & 0x3fc03fc0) +
	       ((r[1]>>2) & 0x3fc03fc0) + 0x800080;
	v11l = ((r[1]>>8) & 0xff00ff) + (r[1] & 0xff00ff) + 0x20002;
	v21h = (((r[1]<<22)|(r[2]>>10)) & 0x3fc03fc0) +
	       ((r[2]>>2) & 0x3fc03fc0) + 0x800080;
	v21l = ((r[2]>>8) & 0xff00ff) + (r[2] & 0xff00ff) + 0x20002;
	v31h = (((r[2]<<22)|(r[3]>>10)) & 0x3fc03fc0) +
	       ((r[3]>>2) & 0x3fc03fc0) + 0x800080;
	v31l = ((r[3]>>8) & 0xff00ff) + (r[3] & 0xff00ff) + 0x20002;
	v41h = (((r[3]<<22)|(r[4]>>10)) & 0x3fc03fc0) +
	       ((r[4]>>2) & 0x3fc03fc0) + 0x800080;
	v41l = ((r[4]>>8) & 0xff00ff) + (r[4] & 0xff00ff) + 0x20002;
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x3fc03fc)>>2);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x3fc03fc)>>2);
	d += rb/4;
	v10h = (((r[0]<<22)|(r[1]>>10))& 0x3fc03fc0) + ((r[1]>>2) & 0x3fc03fc0);
	v10l = ((r[1]>>8) & 0xff00ff) + (r[1] & 0xff00ff);
	v20h = (((r[1]<<22)|(r[2]>>10))& 0x3fc03fc0) + ((r[2]>>2) & 0x3fc03fc0);
	v20l = ((r[2]>>8) & 0xff00ff) + (r[2] & 0xff00ff);
	v30h = (((r[2]<<22)|(r[3]>>10))& 0x3fc03fc0) + ((r[3]>>2) & 0x3fc03fc0);
	v30l = ((r[3]>>8) & 0xff00ff) + (r[3] & 0xff00ff);
	v40h = (((r[3]<<22)|(r[4]>>10))& 0x3fc03fc0) + ((r[4]>>2) & 0x3fc03fc0);
	v40l = ((r[4]>>8) & 0xff00ff) + (r[4] & 0xff00ff);
	r += rb/4;
	d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
	d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
	d[2] = ((v30h + v31h) & 0xff00ff00) | (((v30l + v31l) & 0x3fc03fc)>>2);
	d[3] = ((v40h + v41h) & 0xff00ff00) | (((v40l + v41l) & 0x3fc03fc)>>2);
	d += rb/4;
      }
    } /* end case */
    } /* end switch */
#endif
    break;
  }
}

/**
 **  Half-pel motion compensation for one 8x8 block (integer version).
 **
 **  This procedure uses the very same implementation as with 16x16.
 **  The only difference is that only half the amount of bytes have to
 **  be processed both horizontally and vertically.
 **
 **  Parameters:
 **    this  - decoder state; only Options.use_vis is read here. When it
 **            is set (and the file is not built WITHOUT_VIS) the whole
 **            job is handed to CompensateHalfPelMotion8_vis().
 **    dec   - destination block. It is written through double* and
 **            unsigned long* casts, so it has to be suitably aligned
 **            (the file header states the target block is 8-byte-aligned).
 **    ref   - source block in the reference picture; any byte alignment
 **            is handled by switching over the low address bits.
 **    xh,yh - horizontal / vertical interpolation flags. They are combined
 **            as (xh<<1)|yh and the switch handles the values 0..3:
 **            0 = copy, 1 = vertical, 2 = horizontal, 3 = both.
 **            Presumably each flag is 0 or 1 -- confirm against callers.
 **    rb    - row stride in bytes, used for both dec and ref. The word
 **            pointers are advanced by rb/4 (rb/8 for the 64-bit copy).
 **
 **  Packed arithmetic: two bytes of a word are averaged in parallel by
 **  isolating bytes 0/2 ("h" values) and bytes 1/3 ("l" values) into
 **  lanes with spare bits, adding the lanes plus a rounding constant and
 **  masking the result back to byte positions.
 **
 **  NOTE(review): the word code steps by rb/4 per row and uses 32-bit
 **  masks, i.e. it treats unsigned long as a 4-byte word -- confirm for
 **  targets where long is wider.
 **  NOTE(review): the shift/merge patterns take the most significant byte
 **  of a loaded word as the lowest-addressed byte (big-endian layout).
 **  NOTE(review): the CLASSIC branches use ref2 and s1..s4, which are not
 **  declared in this function; CLASSIC is #undef'd at the top of the file.
 **/

void CompensateHalfPelMotion8(Global *this, Byte *dec, Byte *ref, int xh, int yh, int rb) {
  int j;
  double *dd,*dr;
  unsigned long *d,*r;

#ifndef WITHOUT_VIS
  /* Delegate to the VIS implementation when it is enabled at run time. */
  if (this->Options.use_vis)
    {
      CompensateHalfPelMotion8_vis(dec, ref, xh, yh, rb);
      return;
    }
#endif

  /* Select the interpolation mode: bit 1 = horizontal, bit 0 = vertical. */
  switch ((xh<<1)|yh) {

    /* ================  COPY 8x8  ================== */

  case 0:
#ifdef CLASSIC
    for (j=0; j<8; j++) {
      dec[0] = ref[0]; dec[1] = ref[1]; dec[2] = ref[2]; dec[3] = ref[3];
      dec[4] = ref[4]; dec[5] = ref[5]; dec[6] = ref[6]; dec[7] = ref[7];
      ref += rb;
      dec += rb;
    }
#else /* not CLASSIC */
    /*
     *  Pick the widest transfer the source alignment allows. For the
     *  misaligned cases the aligned words around the source are loaded
     *  and merged with shifts; a third word (r[2]) is read per row.
     */
    switch((long)ref & 7) {
    case 0:                                                 /* 64-Bit copies */
      dd=(double*)dec;
      dr=(double*)ref;
      /* 8 rows, one 8-byte transfer each, fully unrolled */
      dd[0] = dr[0]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dr += rb/8; dd += rb/8;
      dd[0] = dr[0]; dr += rb/8; dd += rb/8;
      break;
    case 4:                                                 /* 32-Bit copies */
      d=(unsigned long*)dec;
      r=(unsigned long*)ref;
      /* 8 rows, two word transfers each, fully unrolled */
      d[0] = r[0]; d[1] = r[1]; r += rb/4; d += rb/4;
      d[0] = r[0]; d[1] = r[1]; r += rb/4; d += rb/4;
      d[0] = r[0]; d[1] = r[1]; r += rb/4; d += rb/4;
      d[0] = r[0]; d[1] = r[1]; r += rb/4; d += rb/4;
      d[0] = r[0]; d[1] = r[1]; r += rb/4; d += rb/4;
      d[0] = r[0]; d[1] = r[1]; r += rb/4; d += rb/4;
      d[0] = r[0]; d[1] = r[1]; r += rb/4; d += rb/4;
      d[0] = r[0]; d[1] = r[1]; r += rb/4; d += rb/4;
      break;
    case 5:
    case 1:
      /* source starts 1 byte past a word boundary: drop 1 byte, borrow 1 */
      d=(unsigned long*)dec, r=(unsigned long*)((long)ref&~3);
      for (j=8; j; j--) {
        unsigned long r1 = r[0], r2 = r[1], r3 = r[2];
        r += rb/4;
        d[0] = (r1<<8)|(r2>>24);
        d[1] = (r2<<8)|(r3>>24);
        d += rb/4;
      }
      break;
    case 6:
    case 2:
      /* source starts 2 bytes past a word boundary */
      d=(unsigned long*)dec, r=(unsigned long*)((long)ref&~3);
      for (j=8; j; j--) {
        unsigned long r1 = r[0], r2 = r[1], r3 = r[2];
        r += rb/4;
        d[0] = (r1<<16)|(r2>>16);
        d[1] = (r2<<16)|(r3>>16);
        d += rb/4;
      }
      break;
    case 7:
    case 3:
      /* source starts 3 bytes past a word boundary */
      d=(unsigned long*)dec, r=(unsigned long*)((long)ref&~3);
      for (j=8; j; j--) {
        unsigned long r1 = r[0], r2 = r[1], r3 = r[2];
        r += rb/4;
        d[0] = (r1<<24)|(r2>>8);
        d[1] = (r2<<24)|(r3>>8);
        d += rb/4;
      }
      break;
    }
#endif
    break;

    /* ================  INTERPOLATE 8x8 VERTICAL  ================== */

  case 1:
#ifdef CLASSIC
    ref2 = ref + rb;
    for (j=0; j<8; j++) {
      dec[0] = (unsigned int)(ref[0]+ref2[0]+1)>>1;
      dec[1] = (unsigned int)(ref[1]+ref2[1]+1)>>1;
      dec[2] = (unsigned int)(ref[2]+ref2[2]+1)>>1;
      dec[3] = (unsigned int)(ref[3]+ref2[3]+1)>>1;
      dec[4] = (unsigned int)(ref[4]+ref2[4]+1)>>1;
      dec[5] = (unsigned int)(ref[5]+ref2[5]+1)>>1;
      dec[6] = (unsigned int)(ref[6]+ref2[6]+1)>>1;
      dec[7] = (unsigned int)(ref[7]+ref2[7]+1)>>1;

      ref += rb;
      ref2 += rb;
      dec += rb;
    }
#else /* not CLASSIC */
    /*
     *  Each output row is the rounded average of two successive source
     *  rows. Per word, bytes 0/2 are kept pre-shifted right by one
     *  (mask 0x7f807f80) and bytes 1/3 unshifted (mask 0xff00ff), so the
     *  sum of two rows fits into the lanes without carrying into the
     *  neighbouring byte.
     *
     *  The rows are pipelined: v?0? and v?1? alternately hold the older
     *  and the newer row, so every source row is loaded once. Only the
     *  v?1? values carry the rounding constant, and every output sums
     *  exactly one v?0? and one v?1?. One row is loaded before the loop
     *  and two per iteration, i.e. 9 source rows for 8 output rows.
     */
    d=(unsigned long*)dec;

    switch((long)ref & 3) {
    case 0:
    {
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l;
      r=(unsigned long*)ref;
      /* prime the pipeline with source row 0 */
      v10h = (r[0]>>1) & 0x7f807f80;
      v10l = r[0] & 0xff00ff;
      v20h = (r[1]>>1) & 0x7f807f80;
      v20l = r[1] & 0xff00ff;
      r += rb/4;
      for (j=4; j; j--) {
        /* odd source row, rounding constants included */
        v11h = ((r[0]>>1) & 0x7f807f80)+ 0x800080;
        v11l = (r[0] & 0xff00ff) + 0x10001;
        v21h = ((r[1]>>1) & 0x7f807f80)+ 0x800080;
        v21l = (r[1] & 0xff00ff) + 0x10001;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
        d += rb/4;
        /* next even source row; reuses the odd row still held in v?1? */
        v10h = (r[0]>>1) & 0x7f807f80;
        v10l = r[0] & 0xff00ff;
        v20h = (r[1]>>1) & 0x7f807f80;
        v20l = r[1] & 0xff00ff;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
        d += rb/4;
      }
    }
    break;
    case 1:
    {
      /*
       *  Misalignment = 1.
       *  This allows to spare the Shift and OR on the upper sum.
       */
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l, r1,r2,r3;
      r=(unsigned long*)((long)ref & ~3);

      /* prime the pipeline with source row 0 */
      r1 = r[0]; r2 = r[1]; r3 = r[2];
      v10h = (r1<<7) & 0x7f807f80;
      v10l = ((r1<<8)|(r2>>24)) & 0xff00ff;
      v20h = (r2<<7) & 0x7f807f80;
      v20l = ((r2<<8)|(r3>>24)) & 0xff00ff;
      r += rb/4;
      for (j=4; j; j--) {
        r1 = r[0]; r2 = r[1]; r3 = r[2];
        v11h = ((r1<<7) & 0x7f807f80) + 0x800080;
        v11l = (((r1<<8)|(r2>>24)) & 0xff00ff) + 0x10001;
        v21h = ((r2<<7) & 0x7f807f80) + 0x800080;
        v21l = (((r2<<8)|(r3>>24)) & 0xff00ff) + 0x10001;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
        d += rb/4;
        r1 = r[0]; r2 = r[1]; r3 = r[2];
        v10h = (r1<<7) & 0x7f807f80;
        v10l = ((r1<<8)|(r2>>24)) & 0xff00ff;
        v20h = (r2<<7) & 0x7f807f80;
        v20l = ((r2<<8)|(r3>>24)) & 0xff00ff;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
        d += rb/4;
      }
    }
    break;
    case 2:
    {
      /*
       *  Misalignment = 2.  Have to Shift and OR every operand.
       */
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l, r1,r2,r3;
      int le=16,re=16;
      r=(unsigned long*)((long)ref & ~3);

      /*
       *  Priming row: the extra >>1 for the upper sum is folded into the
       *  literal shift counts (15/17); inside the loop the same operands
       *  are formed with le/re followed by >>1.
       */
      r1 = r[0]; r2 = r[1]; r3 = r[2];
      v10h = ((r1<<15)|(r2>>17)) & 0x7f807f80;
      v10l = ((r1<<16)|(r2>>16)) & 0xff00ff;
      v20h = ((r2<<15)|(r3>>17)) & 0x7f807f80;
      v20l = ((r2<<16)|(r3>>16)) & 0xff00ff;
      r += rb/4;
      for (j=4; j; j--) {
        r1 = r[0]; r2 = r[1]; r3 = r[2];
        v11h = ((((r1<<le)|(r2>>re))>>1) & 0x7f807f80)+ 0x800080;
        v11l = (((r1<<le)|(r2>>re)) & 0xff00ff) + 0x10001;
        v21h = ((((r2<<le)|(r3>>re))>>1) & 0x7f807f80)+ 0x800080;
        v21l = (((r2<<le)|(r3>>re)) & 0xff00ff) + 0x10001;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
        d += rb/4;
        r1 = r[0]; r2 = r[1]; r3 = r[2];
        v10h = (((r1<<le)|(r2>>re))>>1) & 0x7f807f80;
        v10l = ((r1<<le)|(r2>>re)) & 0xff00ff;
        v20h = (((r2<<le)|(r3>>re))>>1) & 0x7f807f80;
        v20l = ((r2<<le)|(r3>>re)) & 0xff00ff;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
        d += rb/4;
      }
    }
    break;
    case 3:
    {
      /*
       *  Spare one Shift and OR on lower sum
       */
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l, r1,r2,r3;
      r=(unsigned long*)((long)ref & ~3);

      /* prime the pipeline with source row 0 */
      r1 = r[0]; r2 = r[1]; r3 = r[2];
      v10h = ((r1<<23)|(r2>>9)) & 0x7f807f80;
      v10l = (r2>>8) & 0xff00ff;
      v20h = ((r2<<23)|(r3>>9)) & 0x7f807f80;
      v20l = (r3>>8) & 0xff00ff;
      r += rb/4;
      for (j=4; j; j--) {
        r1 = r[0]; r2 = r[1]; r3 = r[2];
        v11h = (((r1<<23)|(r2>>9)) & 0x7f807f80) + 0x800080;
        v11l = ((r2>>8) & 0xff00ff) + 0x10001;
        v21h = (((r2<<23)|(r3>>9)) & 0x7f807f80) + 0x800080;
        v21l = ((r3>>8) & 0xff00ff) + 0x10001;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
        d += rb/4;
        r1 = r[0]; r2 = r[1]; r3 = r[2];
        v10h = ((r1<<23)|(r2>>9)) & 0x7f807f80;
        v10l = (r2>>8) & 0xff00ff;
        v20h = ((r2<<23)|(r3>>9)) & 0x7f807f80;
        v20l = (r3>>8) & 0xff00ff;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x1fe01fe) >>1);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x1fe01fe) >>1);
        d += rb/4;
      }
    } /* end case */
    } /* end switch */
#endif /* CLASSIC */
    break;

    /* ================  INTERPOLATE 8x8 HORIZONTAL  ================== */

  case 2:
#ifdef CLASSIC
    for (j=0; j<8; j++) {
      s1 = ref[0];
      dec[0] = (unsigned int)(s1+(s2=ref[1])+1)>>1;
      dec[1] = (unsigned int)(s2+(s1=ref[2])+1)>>1;
      dec[2] = (unsigned int)(s1+(s2=ref[3])+1)>>1;
      dec[3] = (unsigned int)(s2+(s1=ref[4])+1)>>1;
      dec[4] = (unsigned int)(s1+(s2=ref[5])+1)>>1;
      dec[5] = (unsigned int)(s2+(s1=ref[6])+1)>>1;
      dec[6] = (unsigned int)(s1+(s2=ref[7])+1)>>1;
      dec[7] = (unsigned int)(s2+ref[8]+1)>>1;

      ref += rb;
      dec += rb;
    }
#else /* not CLASSIC */
    /* Binary arithmetic would be slower here (see 16x16),
     * so just 32 bit stores are being used.
     *
     * The source is read byte-wise (ref[0]..ref[8] per row), so no
     * alignment switch is needed. The rounding +1 is folded into the
     * odd-indexed samples; every neighbouring pair contains exactly one
     * of them.
     */
    d=(unsigned long*)dec;
    for (j=8; j; j--) {
      unsigned int s0,s1,s2,s3,s4;
      /* must not do assignments inside arithmetic like in classic version,
       * since order of assignments is not defined after cg/optimizer!
       */
      s1 = ref[1]+1; s2 = ref[2]; s3 = ref[3]+1; s4 = ref[4];
      /* can combine masking of 2nd and 4th Byte, spares one AND */
      d[0] = ((((ref[0]+s1)<<23) | ((s2+s3)<<7)) & 0xff00ff00) |
             (((s1+s2)&0x01fe)<<15) | ((s3+s4)>>1);
      /* second word: s4 (= ref[4]) carries over as its left neighbour */
      s1 = ref[5]+1; s2 = ref[6]; s3 = ref[7]+1; s0 = ref[8];
      d[1] = ((((s4+s1)<<23) | ((s2+s3)<<7)) & 0xff00ff00) |
             (((s1+s2)&0x01fe)<<15) | ((s3+s0)>>1);
      ref += rb;
      d += rb/4;
    }
#endif /* CLASSIC */
    break;

    /* ============  INTERPOLATE 8x8 HORIZONTAL & VERTICAL  ============= */

  case 3:
#ifdef CLASSIC
    ref2 = ref + rb;
    for (j=0; j<8; j++) {
      s1 = ref[0]; s3 = ref2[0];
      dec[0] = (unsigned int)(s1+(s2=ref[1])+s3+(s4=ref2[1])+2)>>2;
      dec[1] = (unsigned int)(s2+(s1=ref[2])+s4+(s3=ref2[2])+2)>>2;
      dec[2] = (unsigned int)(s1+(s2=ref[3])+s3+(s4=ref2[3])+2)>>2;
      dec[3] = (unsigned int)(s2+(s1=ref[4])+s4+(s3=ref2[4])+2)>>2;
      dec[4] = (unsigned int)(s1+(s2=ref[5])+s3+(s4=ref2[5])+2)>>2;
      dec[5] = (unsigned int)(s2+(s1=ref[6])+s4+(s3=ref2[6])+2)>>2;
      dec[6] = (unsigned int)(s1+(s2=ref[7])+s3+(s4=ref2[7])+2)>>2;
      dec[7] = (unsigned int)(s2+ref[8]+s4+ref2[8]+2)>>2;

      ref += rb;
      ref2 += rb;
      dec += rb;
    }
#else /* not CLASSIC */
    /*
     *  Each output byte is (a+b+c+d+2)>>2 over a 2x2 neighbourhood.
     *  Per source row the horizontal pair sums are formed first:
     *  the "h" value holds the sums for output bytes 0/2 pre-shifted
     *  (mask 0x3fc03fc0), the "l" value the sums for output bytes 1/3
     *  unshifted. Two successive rows are then added, using the same
     *  alternating v?0?/v?1? row pipeline as in the vertical case; the
     *  rounding constants (0x800080 / 0x20002) live in the v?1? values.
     */
    d=(unsigned long*)dec;
    switch((long)ref & 3) {
    case 0:
    {
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l;
      r=(unsigned long*)ref;
      /* prime the pipeline with the pair sums of source row 0 */
      v10h = ((r[0]>>2) & 0x3fc03fc0) + ((r[0]<<6) & 0x3fc03fc0);
      v10l = (r[0] & 0xff00ff) + (((r[0] & 0xff00)<<8) | (r[1]>>24));
      v20h = ((r[1]>>2) & 0x3fc03fc0) + ((r[1]<<6) & 0x3fc03fc0);
      v20l = (r[1] & 0xff00ff) + (((r[1] & 0xff00)<<8) | (r[2]>>24));
      r += rb/4;

      for (j=4; j; j--) {
        v11h = ((r[0]>>2) & 0x3fc03fc0) + ((r[0]<<6) & 0x3fc03fc0) + 0x800080;
        v11l = (r[0] & 0xff00ff) + (((r[0] & 0xff00)<<8) | (r[1]>>24)) +0x20002;
        v21h = ((r[1]>>2) & 0x3fc03fc0) + ((r[1]<<6) & 0x3fc03fc0) + 0x800080;
        v21l = (r[1] & 0xff00ff) + (((r[1] & 0xff00)<<8) | (r[2]>>24)) +0x20002;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
        d += rb/4;
        v10h = ((r[0]>>2) & 0x3fc03fc0) + ((r[0]<<6) & 0x3fc03fc0);
        v10l = (r[0] & 0xff00ff) + (((r[0] & 0xff00)<<8) | (r[1]>>24));
        v20h = ((r[1]>>2) & 0x3fc03fc0) + ((r[1]<<6) & 0x3fc03fc0);
        v20l = (r[1] & 0xff00ff) + (((r[1] & 0xff00)<<8) | (r[2]>>24));
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
        d += rb/4;
      }
    } break;
    case 1:
    {
      /* Misalignment = 1: source bytes start at byte 1 of r[0]. */
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l;
      r=(unsigned long*)((long)ref & ~3);
      v10h = ((r[0]<<6) & 0x3fc03fc0) +
             ((((r[0] & 0xff00)<<14) | (r[1]>>18)) & 0x3fc03fc0);
      v10l = (((r[0] & 0xff00)<<8)|r[1]>>24) +
             (((r[0]& 0xff)<<16)|((r[1]>>16)&0xff));
      v20h = ((r[1]<<6) & 0x3fc03fc0) +
             ((((r[1] & 0xff00)<<14) | (r[2]>>18)) & 0x3fc03fc0);
      v20l = (((r[1] & 0xff00)<<8)|r[2]>>24) +
             (((r[1]& 0xff)<<16)|((r[2]>>16)&0xff));
      r += rb/4;

      for (j=4; j; j--) {
        v11h = ((r[0]<<6) & 0x3fc03fc0) +
               ((((r[0] & 0xff00)<<14) | (r[1]>>18)) & 0x3fc03fc0) + 0x800080;
        v11l = (((r[0] & 0xff00)<<8)|r[1]>>24) +
               (((r[0]& 0xff)<<16)|((r[1]>>16)&0xff)) + 0x20002;
        v21h = ((r[1]<<6) & 0x3fc03fc0) +
               ((((r[1] & 0xff00)<<14) | (r[2]>>18)) & 0x3fc03fc0) + 0x800080;
        v21l = (((r[1] & 0xff00)<<8)|r[2]>>24) +
               (((r[1]& 0xff)<<16)|((r[2]>>16)&0xff)) + 0x20002;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
        d += rb/4;
        v10h = ((r[0]<<6) & 0x3fc03fc0) +
               ((((r[0] & 0xff00)<<14) | (r[1]>>18)) & 0x3fc03fc0);
        v10l = (((r[0] & 0xff00)<<8)|r[1]>>24) +
               (((r[0]& 0xff)<<16)|((r[1]>>16)&0xff));
        v20h = ((r[1]<<6) & 0x3fc03fc0) +
               ((((r[1] & 0xff00)<<14) | (r[2]>>18)) & 0x3fc03fc0);
        v20l = (((r[1] & 0xff00)<<8)|r[2]>>24) +
               (((r[1]& 0xff)<<16)|((r[2]>>16)&0xff));
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
        d += rb/4;
      }
    } break;
    case 2:
    {
      /* Misalignment = 2: source bytes start at byte 2 of r[0]. */
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l;
      r=(unsigned long*)((long)ref & ~3);
      v10h = (((r[0] & 0xff00)<<14) | ((r[1]>>18) & 0x3fc0)) +
             (((r[0] & 0xff)<<22) | ((r[1]>>10) & 0x3fc0));
      v10l = (((r[0] & 0xff)<<16)|((r[1]>>16)&0xff)) + ((r[1]>>8) & 0xff00ff);
      v20h = (((r[1] & 0xff00)<<14) | ((r[2]>>18) & 0x3fc0)) +
             (((r[1] & 0xff)<<22) | ((r[2]>>10) & 0x3fc0));
      v20l = (((r[1] & 0xff)<<16)|((r[2]>>16)&0xff)) + ((r[2]>>8) & 0xff00ff);
      r += rb/4;

      for (j=4; j; j--) {
        v11h = (((r[0] & 0xff00)<<14) | ((r[1]>>18) & 0x3fc0)) +
               (((r[0] & 0xff)<<22) | ((r[1]>>10) & 0x3fc0)) + 0x800080;
        v11l = (((r[0] & 0xff)<<16) | ((r[1]>>16)&0xff)) +
               ((r[1]>>8) & 0xff00ff) + 0x20002;
        v21h = (((r[1] & 0xff00)<<14) | ((r[2]>>18) & 0x3fc0)) +
               (((r[1] & 0xff)<<22) | ((r[2]>>10) & 0x3fc0)) + 0x800080;
        v21l = (((r[1] & 0xff)<<16) | ((r[2]>>16)&0xff)) +
               ((r[2]>>8) & 0xff00ff) + 0x20002;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
        d += rb/4;
        v10h = (((r[0] & 0xff00)<<14) | ((r[1]>>18) & 0x3fc0)) +
               (((r[0] & 0xff)<<22) | ((r[1]>>10) & 0x3fc0));
        v10l = (((r[0] & 0xff)<<16)|((r[1]>>16)&0xff)) + ((r[1]>>8) & 0xff00ff);
        v20h = (((r[1] & 0xff00)<<14) | ((r[2]>>18) & 0x3fc0)) +
               (((r[1] & 0xff)<<22) | ((r[2]>>10) & 0x3fc0));
        v20l = (((r[1] & 0xff)<<16)|((r[2]>>16)&0xff)) + ((r[2]>>8) & 0xff00ff);
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
        d += rb/4;
      }
    } break;
    case 3:
    {
      /*
       *  Misalignment = 3: source bytes start at byte 3 of r[0], so the
       *  lower sums can be formed from a single loaded word.
       */
      unsigned long v10h,v10l,v11h,v11l, v20h,v20l,v21h,v21l;
      r=(unsigned long*)((long)ref & ~3);
      v10h = (((r[0]<<22)|(r[1]>>10)) & 0x3fc03fc0) + ((r[1]>>2) & 0x3fc03fc0);
      v10l = ((r[1]>>8) & 0xff00ff) + (r[1] & 0xff00ff);
      v20h = (((r[1]<<22)|(r[2]>>10)) & 0x3fc03fc0) + ((r[2]>>2) & 0x3fc03fc0);
      v20l = ((r[2]>>8) & 0xff00ff) + (r[2] & 0xff00ff);
      r += rb/4;

      for (j=4; j; j--) {
        v11h = (((r[0]<<22)|(r[1]>>10)) & 0x3fc03fc0) +
               ((r[1]>>2) & 0x3fc03fc0) + 0x800080;
        v11l = ((r[1]>>8) & 0xff00ff) + (r[1] & 0xff00ff) + 0x20002;
        v21h = (((r[1]<<22)|(r[2]>>10)) & 0x3fc03fc0) +
               ((r[2]>>2) & 0x3fc03fc0) + 0x800080;
        v21l = ((r[2]>>8) & 0xff00ff) + (r[2] & 0xff00ff) + 0x20002;
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
        d += rb/4;
        v10h = (((r[0]<<22)|(r[1]>>10))& 0x3fc03fc0) + ((r[1]>>2) & 0x3fc03fc0);
        v10l = ((r[1]>>8) & 0xff00ff) + (r[1] & 0xff00ff);
        v20h = (((r[1]<<22)|(r[2]>>10))& 0x3fc03fc0) + ((r[2]>>2) & 0x3fc03fc0);
        v20l = ((r[2]>>8) & 0xff00ff) + (r[2] & 0xff00ff);
        r += rb/4;
        d[0] = ((v10h + v11h) & 0xff00ff00) | (((v10l + v11l) & 0x3fc03fc)>>2);
        d[1] = ((v20h + v21h) & 0xff00ff00) | (((v20l + v21l) & 0x3fc03fc)>>2);
        d += rb/4;
      }
    } /* end case */
    } /* end switch */
#endif /* CLASSIC */
    break;
  }
}

/*
 *  Do motion compensation for a motion of (0,0) for Y,U and V blocks
 *  together in one procedure.
 */
#ifdef CLASSIC
/*
 *  CopyMB16 (CLASSIC variant)
 *
 *  Copies one macroblock unchanged from the reference buffers to the
 *  decoded buffers: 16 rows of 16 bytes from yref to ydec, and 8 rows of
 *  8 bytes each from uref to udec and from vref to vdec.  Consecutive rows
 *  are Yrowbytes (Y) respectively Crowbytes (U,V) bytes apart.
 *
 *  this        - decoder state; only Options.use_vis is read, and only when
 *                the file is built without WITHOUT_VIS.  In that case the
 *                whole copy is delegated to CopyMB16_vis().
 *  ydec/udec/vdec - destination rows (top-left byte of each block)
 *  yref/uref/vref - source rows (top-left byte of each block)
 *
 *  The rows used to be copied as four (Y) or two (U,V) "unsigned long"
 *  words.  That is only 16 / 8 bytes where unsigned long is 32 bits wide;
 *  with a 64-bit unsigned long it moved 32 / 16 bytes per row and ran past
 *  the block.  Fixed-size memmove() calls copy exactly the intended number
 *  of bytes on every platform, have no alignment requirement and do not
 *  access the pixel bytes through an incompatible pointer type.
 */
void CopyMB16(Global *this, Byte *ydec, Byte *udec, Byte *vdec, 
	      Byte *yref, Byte *uref, Byte *vref) {

  int j;

#ifndef WITHOUT_VIS
  if (this->Options.use_vis)
    {
      CopyMB16_vis(ydec, udec, vdec, yref, uref, vref);
      return;
    }
#endif

  /* Y block: 16 rows, 16 bytes per row */
  for (j=16; j; j--) {
    /* memmove (not memcpy) stays well defined if source and destination
       happen to be the same buffer */
    memmove(ydec, yref, 16);

    ydec += Yrowbytes;
    yref += Yrowbytes;
  }

  /* U and V blocks: 8 rows each, 8 bytes per row */
  for (j=8; j; j--) {

    memmove(udec, uref, 8);
    memmove(vdec, vref, 8);

    udec += Crowbytes;
    uref += Crowbytes;
    vdec += Crowbytes;
    vref += Crowbytes;
  }
}
#else /* not CLASSIC */
/*
 *  CopyMB16 (non-CLASSIC variant)
 *
 *  Copies one macroblock unchanged from the source buffers to the
 *  destination buffers: 16 rows of 16 bytes from ysrc to ydst, and 8 rows
 *  of 8 bytes each from usrc to udst and from vsrc to vdst.
 *
 *  this        - decoder state; only Options.use_vis is read, and only when
 *                the file is built without WITHOUT_VIS.  In that case the
 *                whole copy is delegated to CopyMB16_vis().
 *  ydst/udst/vdst - destination rows (top-left byte of each block)
 *  ysrc/usrc/vsrc - source rows (top-left byte of each block)
 *
 *  The previous implementation moved the pixels through "double" variables
 *  to obtain 64-bit loads and stores.  Reading pixel bytes as double is not
 *  guaranteed to be bit-exact (FPUs that quiet signalling NaNs on load,
 *  e.g. x87, may alter the data), it requires 8-byte aligned pointers, and
 *  it accesses the bytes through an incompatible type.  Fixed-size
 *  memmove() calls copy the same bytes without any of these restrictions;
 *  compilers typically expand them into wide loads and stores.
 *
 *  Rows are advanced by Yrowbytes / Crowbytes bytes.  The old code stepped
 *  by rowbytes/8 doubles, which is the same distance whenever rowbytes is
 *  a multiple of 8 (the assumption stated in the file header).
 */
void
CopyMB16(Global *this, Byte *ydst, Byte *udst, Byte *vdst, Byte *ysrc, Byte *usrc, Byte *vsrc)
{
  int row;

#ifndef WITHOUT_VIS
  if (this->Options.use_vis)
    {
      CopyMB16_vis(ydst, udst, vdst, ysrc, usrc, vsrc);
      return;
    }
#endif

  /* Y block: 16 rows, 16 bytes per row */
  for (row = 0; row < 16; row++) {
    /* memmove (not memcpy) stays well defined if source and destination
       happen to be the same buffer */
    memmove(ydst, ysrc, 16);
    ydst += Yrowbytes;
    ysrc += Yrowbytes;
  }

  /* U block: 8 rows, 8 bytes per row */
  for (row = 0; row < 8; row++) {
    memmove(udst, usrc, 8);
    udst += Crowbytes;
    usrc += Crowbytes;
  }

  /* V block: 8 rows, 8 bytes per row */
  for (row = 0; row < 8; row++) {
    memmove(vdst, vsrc, 8);
    vdst += Crowbytes;
    vsrc += Crowbytes;
  }
}
#endif /* CLASSIC */

