#include <values.h>
#include "Util.h"
#include "DataTypes.h"
#include "config.h"

#define PREF_16x16NULL_VECT 100
#define PREF_8x8NULL_VECT 25

#ifdef USE_VIS

#include "vis.h"


static int FastBlockSAD (Byte *b1, Byte *b2, int w, int bsize, int Min) {

  int j;
  Byte *b2_a;
  vis_d64 low, high;
  vis_d64 akku1 = vis_fzero();
  vis_d64 akku2 = vis_fzero();

  b2_a = vis_alignaddr(b2,0);

  for (j=0; j<16; j++) {

    low = vis_ld64(b2_a);     high = vis_ld64(b2_a+8);
    akku1 = vis_pdist(vis_ld64(b1), vis_faligndata(low, high), akku1); 

    low = vis_ld64(b2_a+16);
    akku2 = vis_pdist(vis_ld64(b1+8), vis_faligndata(high, low), akku2);

    b1 += w;
    b2_a += w;
  }

  return((int) vis_from_float(vis_fpadd32s( vis_read_lo(akku1), vis_read_lo(akku2) ) ) );
}

#else

static int FastBlockSAD (Byte *b1, Byte *b2, int w, int bsize, int Min) {

  int i, j;
  int sad = 0;

  for (j=0; j<bsize; j++) {
    
    for (i=0; i<bsize; i++) {
      sad += ABS(b1[i] - b2[i]);
    }
    
    if (sad > Min) return (MAXINT);

    b1 += w;
    b2 += w;
  }
  return(sad);
}
#endif

static int FastHPShiftedBlockSAD (Byte *act, Byte *ref, int w, int hpx, int hpy, int bsize, int Min) {

  Byte rb;
  int i, j;
  int sad = 0;

  if ((hpx==0)&&(hpy==0)) {
    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	sad += ABS(act[i] - ref[i]);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ref += w;
    }
  } else if ((hpx!=0)&&(hpy==0)) {

    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	rb = (ref[i] + ref[i+hpx] + 1) >> 1;
	sad += ABS(act[i] - rb);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ref += w;
    }
  } else if ((hpx==0)&&(hpy!=0)) {

    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	rb = (ref[i] + ref[i+(hpy*w)] + 1) >> 1;
	sad += ABS(act[i] - rb);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ref += w;
    }
  } else if ((hpx!=0)&&(hpy!=0)) {

    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	rb = (ref[i] + ref[i+hpx] + ref[i+(hpy*w)] + ref[i+(hpy*w)+hpx] + 2) >> 2;
	sad += ABS(act[i] - rb);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ref += w;
    }
  }
  return(sad);
}

static int FastHPShiftedBlockBandpassSAD (Byte *act, Byte *ups, Byte *ref, Byte *refups, int w, int hpx, int hpy, int bsize, int Min) {

  int rb1, rb2;
  int i, j;
  int diff;
  int sad = 0;

  if ((hpx==0)&&(hpy==0)) {
    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	diff = act[i] - ups[i];
	sad += ABS(diff - ref[i] + refups[i]);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ups += w;
      ref += w; refups += w;
    }
  } else if ((hpx!=0)&&(hpy==0)) {

    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	diff = act[i] - ups[i];
	rb1 = (ref[i] + ref[i+hpx] + 1) >> 1;
/*	rb2 = (refups[i] + refups[i+hpx] + 1) >> 1; */
	rb2 = refups[i]; 
	sad += ABS(diff - rb1 + rb2);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ups += w;
      ref += w; refups += w;
    }
  } else if ((hpx==0)&&(hpy!=0)) {

    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	diff = act[i] - ups[i];
	rb1 = (ref[i] + ref[i+(hpy*w)] + 1) >> 1;
/*	rb2 = (refups[i] + refups[i+(hpy*w)] + 1) >> 1;  */
	rb2 = refups[i]; 
	sad += ABS(diff - rb1 + rb2);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ups += w;
      ref += w; refups += w;
    }
  } else if ((hpx!=0)&&(hpy!=0)) {

    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	diff = act[i] - ups[i];
	rb1 = (ref[i] + ref[i+hpx] + ref[i+(hpy*w)] + ref[i+(hpy*w)+hpx] + 2) >> 2;
/*	rb2 = (refups[i] + refups[i+hpx] + refups[i+(hpy*w)] + refups[i+(hpy*w)+hpx] + 2) >> 2; */
	rb2 = refups[i]; 
	sad += ABS(diff - rb1 + rb2);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ups += w;
      ref += w; refups += w;
    }
  }
  return(sad);
}

static int FastHPShiftedBlockBandpassSAD2 (Byte *act, Byte *ups, Byte *ref, Byte *refups, int w, int hpx, int hpy, int bsize, int Min) {

  int rb1, rb2;
  int i, j;
  int diff;
  int sad = 0;

  if ((hpx==0)&&(hpy==0)) {
    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	diff = act[i] - ups[i];
	sad += ABS(diff - ref[i] + refups[i]);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ups += w;
      ref += w; refups += w;
    }
  } else if ((hpx!=0)&&(hpy==0)) {

    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	diff = act[i] - ups[i];
	rb1 = (ref[i] + ref[i+hpx] + 1) >> 1;
	rb2 = (refups[i] + refups[i+hpx] + 1) >> 1;
	sad += ABS(diff - rb1 + rb2);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ups += w;
      ref += w; refups += w;
    }
  } else if ((hpx==0)&&(hpy!=0)) {

    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	diff = act[i] - ups[i];
	rb1 = (ref[i] + ref[i+(hpy*w)] + 1) >> 1;
	rb2 = (refups[i] + refups[i+(hpy*w)] + 1) >> 1; 
	sad += ABS(diff - rb1 + rb2);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ups += w;
      ref += w; refups += w;
    }
  } else if ((hpx!=0)&&(hpy!=0)) {

    for (j=0; j<bsize; j++) {
      for (i=0; i<bsize; i++) {
	diff = act[i] - ups[i];
	rb1 = (ref[i] + ref[i+hpx] + ref[i+(hpy*w)] + ref[i+(hpy*w)+hpx] + 2) >> 2;
	rb2 = (refups[i] + refups[i+hpx] + refups[i+(hpy*w)] + refups[i+(hpy*w)+hpx] + 2) >> 2;
	sad += ABS(diff - rb1 + rb2);
      }
      if (sad > Min)  return (MAXINT);
      act += w; ups += w;
      ref += w; refups += w;
    }
  }
  return(sad);
}
   

static int x_hp[8] = {-1,  0,  1,-1, 1,-1, 0, 1};
static int y_hp[8] = {-1, -1, -1, 0, 0, 1, 1, 1};

int HPRefine (Byte *act, Byte *ref, int w, int h, int x, int y, int bsize, int mvx, int mvy, int *hpx, int *hpy, int *sad0) {

  Byte *act_blk, *ref_blk;
  int left, right, top, bott;
  int xh, yh, best_hx, best_hy;
  int err, min_err;
  int n;

  /* Position of actual block */
  act_blk = act + (y*w) + x;

  /* Position of reference block (shifted by [mvx, mvy]) */
  ref_blk = ref + ((y + mvy)*w) + x + mvx;

  switch (bsize) {
  case 8:
    min_err = FastHPShiftedBlockSAD(act_blk, ref_blk, w, 0, 0, bsize, MAXINT); /*  - PREF_8x8NULL_VECT; */
    break;
  case 16:
    min_err = FastHPShiftedBlockSAD(act_blk, ref_blk, w, 0, 0, bsize, MAXINT); /* - PREF_16x16NULL_VECT; */
    break;
  }

  best_hx = 0; best_hy = 0; *sad0 = min_err;

  left = x + mvx; right = x + mvx + bsize;
  top  = y + mvy; bott = y + mvy + bsize;

  for (n=0; n<8; n++) {

    xh = x_hp[n]; yh = y_hp[n];

    if ( (left+xh>=0) && (right+xh<=w) && ((top+yh)>=0) && ((bott+yh)<=h) ) {
      err = FastHPShiftedBlockSAD(act_blk, ref_blk, w, xh, yh, bsize, min_err);

      if ( err<min_err ) {
	min_err = err;
	best_hx = xh  ; best_hy = yh;
      }
    }
  }
  *hpx = best_hx; *hpy = best_hy;
  return(min_err);
}

int HPBandpassRefine (Byte *act, Byte *ups, Byte *ref, Byte *refups, int w, int h, int x, int y, int bsize, int mvx, int mvy, int *hpx, int *hpy) {

  Byte *act_blk, *ups_blk, *ref_blk, *refups_blk;
  int left, right, top, bott;
  int xh, yh, best_hx, best_hy;
  int err, min_err;
  int n;

  act_blk = act + (y*w) + x;
  ups_blk = ups + (y*w) + x;

  refups_blk = refups + (y*w) + x; 
/*  refups_blk = refups + ((y + mvy)*w) + x + mvx; */
  ref_blk = ref + ((y + mvy)*w) + x + mvx;

  switch (bsize) {
  case 8:
    min_err = FastHPShiftedBlockBandpassSAD(act_blk, ups_blk, ref_blk, refups_blk, w, 0, 0, bsize, MAXINT); /* - PREF_8x8NULL_VECT; */
    break;
  case 16:
    min_err = FastHPShiftedBlockBandpassSAD(act_blk, ups_blk, ref_blk, refups_blk, w, 0, 0, bsize, MAXINT); /*  - PREF_16x16NULL_VECT; */
    break;
  }

  best_hx = 0; best_hy = 0;
  left = x + mvx; right = x + mvx + bsize;
  top  = y + mvy; bott = y + mvy + bsize;
  
  for (n=0; n<8; n++) {

    xh = x_hp[n]; yh = y_hp[n];

    if ( (left+xh>=0) && (right+xh<=w) && ((top+yh)>=0) && ((bott+yh)<=h) ) {
      err = FastHPShiftedBlockBandpassSAD(act_blk, ups_blk, ref_blk, refups_blk, w, xh, yh, bsize, min_err);

      if ( err<min_err ) {
	min_err = err;
	best_hx = xh  ; best_hy = yh;
      }
    }
  }
  *hpx = best_hx; *hpy = best_hy;
  return(min_err);
}

int HPBandpassRefine2 (Byte *act, Byte *ups, Byte *ref, Byte *refups, int w, int h, int x, int y, int bsize, int mvx, int mvy, int *hpx, int *hpy, int pref) {

  Byte *act_blk, *ups_blk, *ref_blk, *refups_blk;
  int left, right, top, bott;
  int xh, yh, best_hx, best_hy;
  int err, min_err;
  int n;

  act_blk = act + (y*w) + x;
  ups_blk = ups + (y*w) + x;

  refups_blk = refups + ((y + mvy)*w) + x + mvx; 
  ref_blk = ref + ((y + mvy)*w) + x + mvx;

  switch (bsize) {
  case 8:
    min_err = FastHPShiftedBlockBandpassSAD2(act_blk, ups_blk, ref_blk, refups_blk, w, 0, 0, bsize, MAXINT) - pref; /* PREF_8x8NULL_VECT; */
    break;
  case 16:
    min_err = FastHPShiftedBlockBandpassSAD2(act_blk, ups_blk, ref_blk, refups_blk, w, 0, 0, bsize, MAXINT) - pref; /* PREF_16x16NULL_VECT; */
    break;
  }

  best_hx = 0; best_hy = 0;
  left = x + mvx; right = x + mvx + bsize;
  top  = y + mvy; bott = y + mvy + bsize;

  for (n=0; n<8; n++) {

    xh = x_hp[n]; yh = y_hp[n];

    if ( ((x+xh)>=0) && ((x+bsize+xh)<=w) && ((y+yh)>=0) && ((y+bsize+yh)<=h) ) {
      err = FastHPShiftedBlockBandpassSAD2(act_blk, ups_blk, ref_blk, refups_blk, w, xh, yh, bsize, min_err);

      if ( err<min_err ) {
	min_err = err;
	best_hx = xh  ; best_hy = yh;
      }
    }
  }
  *hpx = best_hx; *hpy = best_hy;
  return(min_err);
}


int FPEstimationSlow (Byte *act, Byte *ref, int w, int h, int x, int y, int bsize, int range, int *mvx, int *mvy, int *sad0) {

  int k, l;
  int i, j;
  int lef, rig, top, bot;
  int err, min_err;
  Byte *act_blk;


  lef = ((x - range) < 0) ? 0 : (x - range);
  rig = ((x + range) < (w - bsize)) ? (x + range) : (w - bsize);

  top = ((y - range) < 0) ? 0 : (y - range);
  bot = ((y + range) < (h - bsize)) ? (y + range) : (h - bsize);

  act_blk = act + (y*w) + x;

  switch (bsize) {
  case 8:
    min_err = FastBlockSAD(act_blk, ref+(y*w)+x, w, bsize, MAXINT); /* - PREF_8x8NULL_VECT; */
    break;
  case 16:
    min_err = FastBlockSAD(act_blk, ref+(y*w)+x, w, bsize, MAXINT); /* - PREF_16x16NULL_VECT; */
    break;
  }

  *mvx = *mvy = 0; *sad0 = min_err;

  for (l=1; l<=range; l++) {

    i = x - l;
    j = y - l;
    
    for (k=0; k < 8*l; k++) {
      if ( i>=lef && i<=rig && j>=top && j<=bot ) {
	err = FastBlockSAD(act_blk, ref+(j*w)+i, w, bsize, min_err);
	if ( err < min_err ) {
	  *mvx = i - x; *mvy = j - y;
	  min_err = err;
	}
      }
      if      (k<2*l) { i++; }
      else if (k<4*l) { j++; }
      else if (k<6*l) { i--; }
      else            { j--; }
    }
  }
  return(min_err);
}

int FPRefineEstimationSlow (Byte *act, Byte *ref, int w, int h, int x, int y, int bsize, int range, int x0, int y0, int *mvx, int *mvy, int *sad0) {

  int k, l;
  int i, j;
  int lef, rig, top, bot;
  int err, min_err;
  Byte *act_blk;


  lef = (((x+x0) - range) < 0) ? 0 : ((x+x0) - range);
  rig = (((x+x0) + range) < (w - bsize)) ? ((x+x0) + range) : (w - bsize);

  top = (((y+y0) - range) < 0) ? 0 : ((y+y0) - range);
  bot = (((y+y0) + range) < (h - bsize)) ? ((y+y0) + range) : (h - bsize);

  act_blk = act + (y*w) + x;

  switch (bsize) {
  case 8:
    min_err = FastBlockSAD(act_blk, ref+((y+y0)*w)+x+x0, w, bsize, MAXINT); /* - PREF_8x8NULL_VECT;  */
    break;
  case 16:
    min_err = FastBlockSAD(act_blk, ref+((y+y0)*w)+x+x0, w, bsize, MAXINT); /* - PREF_16x16NULL_VECT; */
    break;
  }
  *mvx = x0; *mvy = y0; *sad0 = min_err;

  for (l=1; l<=range; l++) {

    i = -l + x0 + x;
    j = -l + y0 + y;
    
    for (k=0; k < 8*l; k++) {
      if ( i>=lef && i<=rig && j>=top && j<=bot ) {
	err = FastBlockSAD(act_blk, ref+(j*w)+i, w, bsize, min_err);
	if ( err < min_err ) {
	  *mvx = i - x; *mvy = j - y;
	  min_err = err;
	}
      }
      if      (k<2*l) { i++; }
      else if (k<4*l) { j++; }
      else if (k<6*l) { i--; }
      else            { j--; }
    }
  }
  return(min_err);
}
      
