/*
 *  yuv.c
 *
 *  $Id: yuv.c,v 1.5 1998/07/16 10:17:39 stuhl Exp $
 *
 *  This file contains all procedures for colorspace conversion and display.
 *
 *    yuv2rgb_naive	- colorspace conversion from YUV to 24-bit aRGB
 *    yuv2rgb_table	- same with tables to speed up range clipping and mult.
 *    yuv2rgb_table_dga	- same, but use DGA to access framebuffer directly
 *    yuv2rgb_vis	- colorspace conversion in VIS
 *    yuv4rgb		- combine YUV conversion and double-sized display
 *    yuv4rgb_vis	- same in VIS
 *    expand_gray    	- for GrayScale mode with double-sized display
 *    expand_gray_vis	- same in VIS
 *    yuv2rgb		- shell to call the different routines, depending on
 *                        mode and compile-time configuration parameters.
 *
 *  For various configuration options see config.h
 */

#define INTPOL_WEIGHTED   /* switch between 3/4 and 1/2 upscale filters 
			   * Note that 1 3 3 1 gives much more pleasant results, 
			   * so whenever possible leave this flag set on
			   */

#undef BUGGY             /* do not use compact yuv4rgb procedure
                          * until bug in C-compiler is fixed */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

#include "defs.h"
#include "structs.h"

#include "dispconf.h"

#include "options.h"
#include "display.h"
#include "display.p"

#include "vis.h"

#ifndef WITHOUT_DGA
#include <dga/dga.h>
#include <dga/dga_externaldefs.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/fbio.h>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/cg14io.h>

extern Dga_drawable dga_id;
extern unsigned long *dga_map;
extern int dga_fb_width;
extern int dga_fb_height;
#endif /* !WITHOUT_DGA */

#ifndef WITHOUT_THREADS
#include <thread.h>
#include <sched.h>
#include <synch.h>
#endif /* !WITHOUT_THREADS */

extern unsigned char *rgb_image; /* defined in display.c */
extern int visdepth;

/* ------------------------------------------------------------------------- */
/*
 *  naive: compute YUV -> xRGB conversion on the fly
 *
 *  Very similar to yuv2rgb_table, but does multiplications and range clipping
 *  discretely. This is very slow - use for performance comparison only.
 *  See init_yuv2rgb for conversion formula.
 */
#ifdef NAIVE

static void yuv2rgb_naive(Picture *p, unsigned long *dst)
{
  int r1,g1,b1,r2,g2,b2;
  int y1c,y2c,y1n,y2n,u,v,ub,vr,uvg;
  int i,j,yw;
  Byte *py,*pu,*pv;

  yw = p->w;
  pu = p->u;
  pv = p->v;
  py = p->y;

  for (j = p->h/2; j; j--)
  {
    for (i = yw/2; i; i--)
    {
      y1c = *py;
      y2c = *(py+1);
      y1n = *(py+yw);
      y2n = *(py+yw+1);

      /*  U and V addends can be used for 4 Y pixels due to 1:2 subsampling
       */
      u = *pu - 128;
      v = *pv - 128;
      vr = (22970 * v + 8192) >> 14;
      uvg = (-5638 * u + -11700 * v + 8192) >> 14;
      ub = (29032 * u + 8192) >> 14;

      r1 = y1c + vr;
      g1 = y1c + uvg;
      b1 = y1c + ub;
      r2 = y2c + vr;
      g2 = y2c + uvg;
      b2 = y2c + ub;
#ifndef HALF_NAIVE
      /*
       *  range clipping ensures that all computed values stay inside 0..255
       */
      if(r1<0) {r1=0;} else if(r1>255) {r1=255;}
      if(g1<0) {g1=0;} else if(g1>255) {g1=255;}
      if(b1<0) {b1=0;} else if(b1>255) {b1=255;}
      if(r2<0) {r2=0;} else if(r2>255) {r2=255;}
      if(g2<0) {g2=0;} else if(g2>255) {g2=255;}
      if(b2<0) {b2=0;} else if(b2>255) {b2=255;}
#else
      /*
       *  half-naive acclerates greatly by use of rang-clipping tables.
       */
      r1 = range_clip[r1+256];
      g1 = range_clip[g1+256];
      b1 = range_clip[b1+256];
      r2 = range_clip[r2+256];
      g2 = range_clip[g2+256];
      b2 = range_clip[b2+256];
#endif
      /*  Store xRGB as 32 bit, where x is constant 0 (ignored by X11)
       */
      *dst = (b1<<16)|(g1<<8)|r1;
      *(dst+1) = (b2<<16)|(g2<<8)|r2;

      r1 = y1n + vr;
      g1 = y1n + uvg;
      b1 = y1n + ub;
      r2 = y2n + vr;
      g2 = y2n + uvg;
      b2 = y2n + ub;
#ifndef HALF_NAIVE
      if(r1<0) {r1=0;} else if(r1>255) {r1=255;}
      if(g1<0) {g1=0;} else if(g1>255) {g1=255;}
      if(b1<0) {b1=0;} else if(b1>255) {b1=255;}
      if(r2<0) {r2=0;} else if(r2>255) {r2=255;}
      if(g2<0) {g2=0;} else if(g2>255) {g2=255;}
      if(b2<0) {b2=0;} else if(b2>255) {b2=255;}
#else
      r1 = range_clip[r1+256];
      g1 = range_clip[g1+256];
      b1 = range_clip[b1+256];
      r2 = range_clip[r2+256];
      g2 = range_clip[g2+256];
      b2 = range_clip[b2+256];
#endif
      *(dst+yw) = (b1<<16)|(g1<<8)|r1;
      *(dst+yw+1) = (b2<<16)|(g2<<8)|r2;

      py+=2;
      pv++; pu++;
      dst+=2;
    }
    dst += yw;
    py += yw;
  }
}
#endif /* NAIVE */

/* ------------------------------------------------------------------------- */
/*
 *  Precompute results of YUV matrix conversion,
 *  range clipping and interpolation weights tables.
 *  Called once upon start of program.
 *
 *  YUV (YCbCr) -> RGB-24bit conversion as defined per CCIR 601-1:
 *  (for values scaled between -0.5 and +0.5)
 *
 *    R = Y                + 1.40200 * Cr
 *    G = Y - 0.34414 * Cb - 0.71414 * Cr
 *    B = Y + 1.77200 * Cb
 */
#if 1 /* !defined(USE_VIS) || defined(USE_DGA) */

static short Cr_r_tab[256];
static short Cb_b_tab[256];
static int Cr_g_tab[256];
static int Cb_g_tab[256];
static unsigned char range_clip[256 * 3];
static unsigned short I3Tab[256], I98Tab[256];

void init_yuv2rgb(int use_vis)
{
  int i;

#ifndef WITHOUT_VIS
  if (use_vis)
    return;
#endif

  for (i = 0; i <= 255; i++) {
    Cr_r_tab[i] = ((22970 * (i - 128) + 8192) >> 14) + 256;
    Cb_b_tab[i] = ((29032 * (i - 128) + 8192) >> 14) + 256;
    Cr_g_tab[i] = -5638 * (i - 128);
    Cb_g_tab[i] = -11700 * (i - 128) + 8192 + (256 << 14);
  }
  /*  Range clipping table
   */
  for (i = 0; i < 256; i++) range_clip[i] = 0;
  for (i = 256; i < 256+256; i++) range_clip[i] = i;
  for (i = 256+256; i < 256*3; i++) range_clip[i] = 255;

  /*  Memory-lookup is faster than integer multiplication,
   *  hence precompute results for interpolation weighting.
   */
  for (i=0; i<256; i++) {
    I3Tab[i] = i * 3;
    I98Tab[i] = i * 9 + 8;   /* +8 for rouding */
  }
}
#else /* !WITHOUT_DGA */

void init_yuv2rgb(int use_vis) {}  /* no initialization needed for VIS */

#endif /* !WITHOUT_DGA */

static void yuv2rgb_table_resample(Picture *p, unsigned long *dst)
{
  unsigned int r1,g1,b1,r2,g2,b2;
  unsigned int r1a,g1a,b1a,r2a,g2a,b2a,r1b,g1b,b1b,r2b,g2b,b2b/*,r,g,b*/;
  unsigned char *py,*pu,*pv;
  unsigned int i,j,sj,si;
  unsigned int yc,yn, yw=p->w,yws=p->ws,yhs=p->hs,vr,ub,uvg;
  unsigned int sw_tmp = coded_picture_width / yws,
    sh_tmp = coded_picture_height / yhs;
  unsigned int s = MAX(sw_tmp, sh_tmp);
  unsigned int w0 = MIN(coded_picture_width / s, yws),
    h0 = MIN(coded_picture_height / s, yhs);
  int dwd = s * coded_picture_width, dwp = 2 * yw - w0;
  int dwpc = yw / 2 - (w0 / 2);
  int skip_dst = 2 * dwd - s * w0;


  pu = p->u;
  pv = p->v;
  py = p->y;

  for (j = h0/2; j; j--) {
    for (i = w0/2; i; i--) {
      yc = *py;
      yn = *(py+yw);
      vr = Cr_r_tab[*pv];
      uvg = (Cr_g_tab[*pu] + Cb_g_tab[*pv]) >> 14;
      ub = Cb_b_tab[*pu];

      r1a = r1; g1a = g1; b1a = b1;
      r2a = r2; g2a = g2; b2a = b2;

      r1 = range_clip[yc + vr];
      g1 = range_clip[yc + uvg];
      b1 = range_clip[yc + ub];
      r2 = range_clip[yn + vr];
      g2 = range_clip[yn + uvg];
      b2 = range_clip[yn + ub];
      py++;
      for (si = s; si; si--) {
	for (sj = s; sj; sj--) {
	  *(dst+sj*coded_picture_width) = (long)(b1<<16)|(long)(g1<<8)|r1;
	  *(dst+dwd+sj*coded_picture_width) = (long)(b2<<16)|(long)(g2<<8)|r2;
	}
	dst++;
      }

      r1b = r1; g1b = g1; b1b = b1;
      r2b = r2; g2b = g2; b2b = b2;

      yc = *py;
      yn = *(py+yw);
      r1 = range_clip[yc + vr];
      g1 = range_clip[yc + uvg];
      b1 = range_clip[yc + ub];
      r2 = range_clip[yn + vr];
      g2 = range_clip[yn + uvg];
      b2 = range_clip[yn + ub];
      py++;
      for (si = s; si; si--) {
	for (sj = s; sj; sj--) {
	  *(dst+sj*coded_picture_width) = (long)(b1<<16)|(long)(g1<<8)|r1;
	  *(dst+dwd+sj*coded_picture_width) = (long)(b2<<16)|(long)(g2<<8)|r2;
	}
	dst++;
      }

      pv++; pu++;
    }
    dst += skip_dst;
    py += dwp;
    pu += dwpc;
    pv += dwpc;
  }
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/*
 *  YUV -> xRGB, optimized with table
 *
 *  (closely modeled after jdcolor.c, release 5a of 7-Dec-94,
 *   from the Independent JPEG Group's JPEG software)
 */

static void yuv2rgb_table(Picture *p, unsigned long *dst)
{
  unsigned int r1,g1,b1,r2,g2,b2;
  unsigned char *py,*pu,*pv;
  unsigned int i,j;
  unsigned int yc,yn,yw,vr,ub,uvg;


  yw = p->w;
  pu = p->u;
  pv = p->v;
  py = p->y;

  if (yw == coded_picture_width && p->h == coded_picture_height) {
    for (j=p->h/2; j; j--) {
      for (i=yw/2; i; i--) {
	yc = *py;
	yn = *(py+yw);
	vr = Cr_r_tab[*pv];
	uvg = (Cr_g_tab[*pu] + Cb_g_tab[*pv]) >> 14;
	ub = Cb_b_tab[*pu];

	r1 = range_clip[yc + vr];
	g1 = range_clip[yc + uvg];
	b1 = range_clip[yc + ub];
	r2 = range_clip[yn + vr];
	g2 = range_clip[yn + uvg];
	b2 = range_clip[yn + ub];
	py++;
	*dst = (long)(b1<<16)|(long)(g1<<8)|r1;
	*(dst+yw) = (long)(b2<<16)|(long)(g2<<8)|r2;
	dst++;

	yc = *py;
	yn = *(py+yw);
	r1 = range_clip[yc + vr];
	g1 = range_clip[yc + uvg];
	b1 = range_clip[yc + ub];
	r2 = range_clip[yn + vr];
	g2 = range_clip[yn + uvg];
	b2 = range_clip[yn + ub];
	py++;
	*dst = (long)(b1<<16)|(long)(g1<<8)|r1;
	*(dst+yw) = (long)(b2<<16)|(long)(g2<<8)|r2;
	dst++;

	pv++; pu++;
      }
      dst += yw;
      py += yw;
    }
  } else {
    yuv2rgb_table_resample(p, dst);
  }
}

#ifndef WITHOUT_VIS
/* ------------------------------------------------------------------------- */
/*
 *  YUV -> xRGB colorspace conversion (same as above, but using VIS)
 *
 *  algorithm is same as in yuv2rgb_table
 */

static void yuv2rgb_vis(Picture *p, unsigned long *dst)
{
  vis_f32 *pu,*pv,up,vp;
  vis_d64 *py,y1,y2;
  vis_d64 u,v,ufh,ufl,vfh,vfl,uvfh,uvfl,
	  y1h,y2h,y1l,y2l,
          r1,g1,b1,ag1,br1,agbr1h,agbr1l, r2,g2,b2,ag2,br2,agbr2h,agbr2l,
          r3,g3,b3,ag3,br3,agbr3h,agbr3l, r4,g4,b4,ag4,br4,agbr4h,agbr4l;
  /* constants */
  vis_f32 Crb_facs = vis_to_float(((22970/4)<<16)|(29032/4)),
	  Crbg_facs = vis_to_float(((11700/4)<<16)|(5638/4)),
	  scale = vis_to_float(0x1000),
	  nulls = vis_fzeros();
  vis_d64 poff_r = vis_to_double_quad(128 * (22970>>2) / 256),
          poff_b = vis_to_double_quad(128 * (29032>>2) / 256),
          poff_g = vis_to_double_quad(128 * ((11700+5638)>>2) / 256);
  int ow = p->w*4;
  int yw = p->w/8;
  Byte *out = (Byte*)dst;
  int i,j;

  /* set scale factor for fpack 16.
   * precision in multiplications is 12 after comma, but fmul8x16
   * already scales down by 8. (7 is added in fpack16)
   */
  vis_write_gsr((7-(12-8)) << 3);

  py = (vis_d64 *)p->y;
  pu = (vis_f32 *)p->u;
  pv = (vis_f32 *)p->v;

  up = *pu;
  vp = *pv;
  y1 = *py;
  y2 = *(py+yw);
  pu++; pv++; py++;

  for (j=p->h/2; j; j--)
  {
    for (i=yw; i; i--)
    {
      /* duplicate U and V pixels (due to subsampling) */
      u = vis_fpmerge(up,up);
      v = vis_fpmerge(vp,vp);
      ufh = vis_fmul8x16al_hi(u, Crb_facs);
      vfh = vis_fmul8x16au_hi(v, Crb_facs);
      uvfh = vis_fpadd16(vis_fmul8x16au_hi(v, Crbg_facs),
                         vis_fmul8x16al_hi(u, Crbg_facs));
      /* do subtraction after multiplication to avoid 16-bit muls, which
       * require 2 seperate mul instructions and addition afterwords
       * poff_g is handled later due to range limits (16 bit, 4 right of comma)
       */
      ufh = vis_fpsub16(ufh, poff_b);
      vfh = vis_fpsub16(vfh, poff_r);

      y1h = vis_fmul8x16al_hi(y1,scale);
      r1 = vis_fpadd16(y1h, vfh);
      b1 = vis_fpadd16(y1h, ufh);
      g1 = vis_fpsub16(y1h, uvfh);
      g1 = vis_fpadd16(g1, poff_g);
      /* merge separate r,g,b bands to 32-bit aRGB values (a is zero) */
      ag1 = vis_fpmerge(nulls, vis_fpack16(g1));
      br1 = vis_fpmerge(vis_fpack16(b1), vis_fpack16(r1));
      agbr1h = vis_fpmerge(vis_read_hi(ag1),vis_read_hi(br1));
      agbr1l = vis_fpmerge(vis_read_lo(ag1),vis_read_lo(br1));
      vis_st64(agbr1h, out);
      vis_st64(agbr1l, out+8);

      /* same for pixel in scanline below */
      y2h = vis_fmul8x16al_hi(y2,scale);
      r2 = vis_fpadd16(y2h, vfh);
      b2 = vis_fpadd16(y2h, ufh);
      g2 = vis_fpsub16(y2h, uvfh);
      g2 = vis_fpadd16(g2, poff_g);
      ag2 = vis_fpmerge(nulls, vis_fpack16(g2));
      br2 = vis_fpmerge(vis_fpack16(b2), vis_fpack16(r2));
      agbr2h = vis_fpmerge(vis_read_hi(ag2), vis_read_hi(br2));
      agbr2l = vis_fpmerge(vis_read_lo(ag2), vis_read_lo(br2));
      vis_st64(agbr2h, out+ow);
      vis_st64(agbr2l, out+ow+8);

      y1l = vis_fmul8x16al_lo(y1,scale);
      y2l = vis_fmul8x16al_lo(y2,scale);
      y1 = *py;
      y2 = *(py+yw);

      ufl = vis_fmul8x16al(vis_read_lo(u), Crb_facs);
      vfl = vis_fmul8x16au(vis_read_lo(v), Crb_facs);
      uvfl = vis_fpadd16(vis_fmul8x16au_lo(v, Crbg_facs),
                         vis_fmul8x16al_lo(u, Crbg_facs));
      ufl = vis_fpsub16(ufl, poff_b);
      vfl = vis_fpsub16(vfl, poff_r);
      up = *pu;
      vp = *pv;
      r3 = vis_fpadd16(y1l, vfl);
      b3 = vis_fpadd16(y1l, ufl);
      g3 = vis_fpsub16(y1l, uvfl);
      g3 = vis_fpadd16(g3, poff_g);
      ag3 = vis_fpmerge(nulls, vis_fpack16(g3));
      br3 = vis_fpmerge(vis_fpack16(b3), vis_fpack16(r3));
      r4 = vis_fpadd16(y2l, vfl);
      b4 = vis_fpadd16(y2l, ufl);
      g4 = vis_fpsub16(y2l, uvfl);
      g4 = vis_fpadd16(g4, poff_g);
      ag4 = vis_fpmerge(nulls, vis_fpack16(g4));
      br4 = vis_fpmerge(vis_fpack16(b4), vis_fpack16(r4));
      vis_st64(vis_fpmerge(vis_read_hi(ag3),vis_read_hi(br3)), out+16);
      vis_st64(vis_fpmerge(vis_read_lo(ag3),vis_read_lo(br3)), out+24);
      vis_st64(vis_fpmerge(vis_read_hi(ag4), vis_read_hi(br4)), out+ow+16);
      vis_st64(vis_fpmerge(vis_read_lo(ag4), vis_read_lo(br4)), out+ow+24);

      out += 32;
      pu++; pv++; py++;
    }
    out += ow;
    py += yw;
  }
}

#endif /* !WITHOUT_VIS */

/* ------------------------------------------------------------------------- */
/*
 *  Optimized version with DGA
 *  - write results of YUV conversion to framebuffer directly
 *  - for 24bit XBGR framebuffer only (e.g. cg14,s24/tcx)
 *  - slower than neccessary since all x- and y-offsets are allowed,
 *    i.e. computation might start on uneven address.
 */
#ifndef WITHOUT_DGA

static void yuv2rgb_table_dga(Picture *p, int fbx, int fby,  int xoff, int yoff,  int ww, int wh)
{
  register int r1,g1,b1,r2,g2,b2;
  unsigned char *py,*pu,*pv;
  unsigned int i,j;
  unsigned long *dst;

  if((ww += xoff) >= p->w) ww = (p->w & ~1);
  if((wh += yoff) >= p->h) wh = (p->h & ~1);

  for (j=yoff; j<wh; j++)
  {
    py = &p->y[p->w * j] + xoff;
    pu = &p->u[p->w/2 * (j >> 1)] + (xoff >> 1);
    pv = &p->v[p->w/2 * (j >> 1)] + (xoff >> 1);
    dst = (unsigned long *)((long)dga_map +
                            ((fbx+xoff) << 2) +
			    ((fby+j) * (dga_fb_width << 2)));

    for (i=xoff; i<ww; i+=2)
    {
      r1 = range_clip[*py + Cr_r_tab[*pv]];
      g1 = range_clip[*py + ((Cr_g_tab[*pu] + Cb_g_tab[*pv]) >> 14)];
      b1 = range_clip[*py + Cb_b_tab[*pu]];
      py++;
      r2 = range_clip[*py + Cr_r_tab[*pv]];
      g2 = range_clip[*py + ((Cr_g_tab[*pu] + Cb_g_tab[*pv]) >> 14)];
      b2 = range_clip[*py + Cb_b_tab[*pu]];
      py++;
      pv++; pu++;

      *dst++ = (b1<<16)|(g1<<8)|r1;
      *dst++ = (b2<<16)|(g2<<8)|r2;
    }
  }
}
#endif /* !WITHOUT_DGA */

/* ------------------------------------------------------------------------- */
/*
 *  YUV-Conversion and Display-Expand (size-doubling)
 *
 *  Combine colorspace conversion and display upscale (expand mode).
 *  Upscaling uses bilinear interpolation. INTPOL_WEIGHTED switches
 *  between CPU intensive and simple upscale filters.
 */

#ifdef INTPOL_WEIGHTED
/*
 *  Use [1/4,3/4,3/4,1/4] weights for filtering.
 */
#define INTPOL(pp,w,r11,r12,r21,r22) \
    { int ns1,ns2; \
      ns1 = I3Tab[*((pp)-1)] + I98Tab[*(pp)]; \
      ns2 = I98Tab[*(pp)] + I3Tab[*((pp)+1)]; \
      r11 = (*((pp)-(w)-1) + I3Tab[*((pp)-(w))] + ns1)>>4; \
      r12 = (I3Tab[*((pp)-(w))] + *((pp)-(w)+1) + ns2)>>4; \
      r21 = (ns1 + *((pp)+(w)-1) + I3Tab[*((pp)+(w))])>>4; \
      r22 = (ns2 + I3Tab[*((pp)+(w))] + *((pp)+(w)+1))>>4; \
    }
#else
/*
 *  Use [1/2,1/2] weights for filtering.
 */
#define INTPOL(pp,w,r11,r12,r21,r22) \
    { \
      r11 = *(pp); \
      r12 = (r11 + *((pp)+1) + 1) / 2; \
      r21 = (r11 + *((pp)+(w)) + 1) / 2; \
      r22 = (r11 + *((pp)+1) + *((pp)+(w)) + *((pp)+(w)+1) + 2) / 4; \
    }
#endif
/*
 *  Colorspace conversion as in yuv2rgb_table
 */
#define CONV(dst,y11,y12,y21,y22,u,v,wid) \
    { int vr,uvg,ub, r1,r2,g1,g2,b1,b2; \
      vr = Cr_r_tab[v]; \
      uvg = (Cr_g_tab[u] + Cb_g_tab[v]) >> 14; \
      ub = Cb_b_tab[u]; \
      r1 = range_clip[(y11) + vr]; \
      g1 = range_clip[(y11) + uvg]; \
      b1 = range_clip[(y11) + ub]; \
      r2 = range_clip[(y21) + vr]; \
      g2 = range_clip[(y21) + uvg]; \
      b2 = range_clip[(y21) + ub]; \
      *(dst) = (b1<<16)|(g1<<8)|r1; \
      *((dst)+(wid)) = (b2<<16)|(g2<<8)|r2; \
      r1 = range_clip[(y12) + vr]; \
      g1 = range_clip[(y12) + uvg]; \
      b1 = range_clip[(y12) + ub]; \
      r2 = range_clip[(y22) + vr]; \
      g2 = range_clip[(y22) + uvg]; \
      b2 = range_clip[(y22) + ub]; \
      *(dst+1) = (b1<<16)|(g1<<8)|r1; \
      *((dst)+(wid)+1) = (b2<<16)|(g2<<8)|r2; \
    }

void yuv4rgb (Picture *p, unsigned long *dst)
{
  unsigned char *py,*pu,*pv;
  unsigned int yw, uvw;
  unsigned int u11,u12,u21,u22,v11,v12,v21,v22,y11,y12,y21,y22,y31,y32,y41,y42;
  int i,j;
 
  yw = p->w;
  uvw = p->w/2;
  pu = p->u;
  pv = p->v;
  py = p->y;
 
  for (j=(p->h/2); j; j--) {
    for (i=uvw; i; i--) {
      INTPOL(pu,uvw,u11,u12,u21,u22); pu++;
      INTPOL(pv,uvw,v11,v12,v21,v22); pv++;
 
      INTPOL(py,yw,y11,y12,y21,y22);
      CONV(dst,y11,y12,y21,y22,u11,v11,2*yw);
      INTPOL(py+yw,yw,y31,y32,y41,y42); py++;
      CONV(dst+4*yw,y31,y32,y41,y42,u21,v21,2*yw);
 
      INTPOL(py,yw,y11,y12,y21,y22);
      CONV(dst+2,y11,y12,y21,y22,u12,v12,2*yw);
      INTPOL(py+yw,yw,y31,y32,y41,y42); py++;
      CONV(dst+2+4*yw,y31,y32,y41,y42,u22,v22,2*yw);
 
      dst += 4;
    }
    py += yw;
    dst += 6*yw;
  }
}

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#ifndef WITHOUT_VIS

# ifdef BUGGY

/*
 *  Same as above, but using VIS.
 *  Implemented for simple [1/2,1/2] filter only.
 */

#define PREP_UV(merge,u_in,v_in) \
    { \
      u = merge ((u_in),(u_in)); \
      v = merge ((v_in),(v_in)); \
      ufh = vis_fmul8x16al_hi(u, Crb_facs); \
      vfh = vis_fmul8x16au_hi(v, Crb_facs); \
      uvfh = vis_fpadd16(vis_fmul8x16au_hi(v, Crbg_facs), \
                         vis_fmul8x16al_hi(u, Crbg_facs)); \
      ufh = vis_fpsub16(ufh, poff_b); \
      vfh = vis_fpsub16(vfh, poff_r); \
      ufl = vis_fmul8x16al(vis_read_lo(u), Crb_facs); \
      vfl = vis_fmul8x16au(vis_read_lo(v), Crb_facs); \
      uvfl = vis_fpadd16(vis_fmul8x16au_lo(v, Crbg_facs), \
                         vis_fmul8x16al_lo(u, Crbg_facs)); \
      ufl = vis_fpsub16(ufl, poff_b); \
      vfl = vis_fpsub16(vfl, poff_r); \
    };
 
#define VISCONV(expand,y1_in,y2_in,vf,uf,uvf,dp,dwid) \
    { \
      y = expand (y1_in); \
      r4 = vis_fpadd16(y, (vf)); \
      b4 = vis_fpadd16(y, (uf)); \
      g4 = vis_fpsub16(y, (uvf)); \
      g4 = vis_fpadd16(g4, poff_g); \
      ag = vis_fpmerge(vis_fzeros(), vis_fpack16(g4)); \
      br = vis_fpmerge(vis_fpack16(b4), vis_fpack16(r4)); \
      vis_st64(vis_fpmerge_hi(ag,br), (dp)); \
      vis_st64(vis_fpmerge_lo(ag,br), (dp)+8); \
      y = expand (y2_in); \
      r4 = vis_fpadd16(y, (vf)); \
      b4 = vis_fpadd16(y, (uf)); \
      g4 = vis_fpsub16(y, (uvf)); \
      g4 = vis_fpadd16(g4, poff_g); \
      ag = vis_fpmerge(vis_fzeros(), vis_fpack16(g4)); \
      br = vis_fpmerge(vis_fpack16(b4), vis_fpack16(r4)); \
      vis_st64(vis_fpmerge_hi(ag, br), (dp)+(dwid)); \
      vis_st64(vis_fpmerge_lo(ag, br), (dp)+(dwid)+8); \
    };


static void yuv4rgb_vis (Picture *p, unsigned char *dst)
{
  vis_d64 *py;
  vis_f32 *pu,*pv;
  int yw, uvw;
  int i,j;
  vis_d64 y1a,y1b,y2a,y2b,y3a,y3b,y1lh,y1ll,y2lh,y2ll,y3lh,y3ll,y1r,y2r,y3r,
          y1rh,y1rl,y2rh,y2rl,y3rh,y3rl,yp1h,yp1l,yp2h,yp2l,yp3h,yp3l,
          ypw1lh,ypw1ll,ypw2lh,ypw2ll,ypw1rh,ypw1rl,ypw2rh,ypw2rl,
          y1h,y1l,y2h,y2l,y3h,y3l,y4h,y4l,
          u1l,v1l,u2l,v2l,u1r,u2r,v1r,v2r,up1,up2,upw1,upw2,
          vp1,vp2,vpw1,vpw2,u1,v1,u2,v2;
  vis_f32 u1a,v1a,u2a,v2a,u1b,v1b,u2b,v2b;
  vis_f32 Crb_facs = vis_to_float(((22970/4)<<16)|(29032/4)),
          Crbg_facs = vis_to_float(((11700/4)<<16)|(5638/4));
  vis_d64 poff_r = vis_to_double_quad(128 * (22970/4) / 256),
          poff_b = vis_to_double_quad(128 * (29032/4) / 256),
          poff_g = vis_to_double_quad(128 * ((11700+5638)/4) / 256);
  vis_d64 y,u,v,ufh,ufl,vfh,vfl,uvfh,uvfl,r4,g4,b4,ag,br;
 
  pu = (vis_f32 *)p->u;
  pv = (vis_f32 *)p->v;
  py = (vis_d64 *)p->y;
  yw = p->w;
  uvw = p->w/2;
 
  vis_write_gsr(((7-6)<<3) | 1);    /* divide by 4*16, shift by 1 */
 
  for (j=p->h/2; j; j--) {
    for (i=uvw/sizeof(u1a); i; i--) {
      /*
       *  Load 4 U & V pels, interpolate to 2 x 8 each
       */
      u1a = *pu;
      v1a = *pv;
      u2a = *(pu+uvw/sizeof(u2a)); pu++;
      v2a = *(pv+uvw/sizeof(v2a)); pv++;
      u1b = *pu;
      v1b = *pv;
      u2b = *(pu+uvw/sizeof(u2b));
      v2b = *(pv+uvw/sizeof(v2b));
      u1l = vis_fexpand(u1a);
      v1l = vis_fexpand(v1a);
      u2l = vis_fexpand(u2a);
      v2l = vis_fexpand(v2a);
      u1r = vis_fexpand(vis_read_hi(
            vis_faligndata(vis_freg_pair(u1a,u1b),vis_fzero())));
      u2r = vis_fexpand(vis_read_hi(
            vis_faligndata(vis_freg_pair(u2a,u2b),vis_fzero())));
      v1r = vis_fexpand(vis_read_hi(
            vis_faligndata(vis_freg_pair(v1a,v1b),vis_fzero())));
      v2r = vis_fexpand(vis_read_hi(
            vis_faligndata(vis_freg_pair(v2a,v2b),vis_fzero())));
      up1 = vis_fpadd16(u1l,u1r);  /* hor intpol current line */
      up2 = vis_fpadd16(u2l,u2r);  /* hor intpol next line */
      upw1 = vis_fpadd16(u1l,u2l); /* vert intpol left pel */
      upw2 = vis_fpadd16(up1,up2); /* vert intpol right pel */
      vp1 = vis_fpadd16(v1l,v1r);  /* hor intpol current line */
      vp2 = vis_fpadd16(v2l,v2r);  /* hor intpol next line */
      vpw1 = vis_fpadd16(v1l,v2l); /* vert intpol left pel */
      vpw2 = vis_fpadd16(vp1,vp2); /* vert intpol right pel */
      /* multiply values that needed to be divided by 2 instead of 4 with 2 */
      up1 = vis_fpadd16(up1,up1);
      vp1 = vis_fpadd16(vp1,vp1);
      upw1 = vis_fpadd16(upw1,upw1);
      vpw1 = vis_fpadd16(vpw1,vpw1);
      u1 = vis_fpmerge(u1a,vis_fpack16(up1));
      v1 = vis_fpmerge(v1a,vis_fpack16(vp1));
      u2 = vis_fpmerge(vis_fpack16(upw1),vis_fpack16(upw2));
      v2 = vis_fpmerge(vis_fpack16(vpw1),vis_fpack16(vpw2));
 
      /*
       *  Load 2 scanlines of 8 Y pels, interpolate to 4 x 16
       */
      y1a = *py;
      y2a = *(py+yw/sizeof(y2a));
      y3a = *(py+yw/sizeof(y2a)*2); py++;
      y1b = *py;
      y2b = *(py+yw/sizeof(y2b));
      y3b = *(py+yw/sizeof(y2b)*2);
      y1lh = vis_fexpand_hi(y1a);  /* original values */
      y1ll = vis_fexpand_lo(y1a);
      y2lh = vis_fexpand_hi(y2a);
      y2ll = vis_fexpand_lo(y2a);
      y3lh = vis_fexpand_hi(y3a);
      y3ll = vis_fexpand_lo(y3a);
      y1r = vis_faligndata(y1a,y1b);
      y2r = vis_faligndata(y2a,y2b);
      y3r = vis_faligndata(y3a,y3b);
      y1rh = vis_fexpand_hi(y1r);  /* shifted one left */
      y1rl = vis_fexpand_lo(y1r);
      y2rh = vis_fexpand_hi(y2r);
      y2rl = vis_fexpand_lo(y2r);
      y3rh = vis_fexpand_hi(y3r);
      y3rl = vis_fexpand_lo(y3r);
      yp1h = vis_fpadd16(y1lh,y1rh);  /* add orig & shift for hor. int.pol. */
      yp1l = vis_fpadd16(y1ll,y1rl);
      yp2h = vis_fpadd16(y2lh,y2rh);
      yp2l = vis_fpadd16(y2ll,y2rl);
      yp3h = vis_fpadd16(y3lh,y3rh);
      yp3l = vis_fpadd16(y3ll,y3rl);
      ypw1lh = vis_fpadd16(y1lh,y2lh);  /* 2x vert. int.pol. left */
      ypw1ll = vis_fpadd16(y1ll,y2ll);
      ypw2lh = vis_fpadd16(y2lh,y3lh);
      ypw2ll = vis_fpadd16(y2ll,y3ll);
      ypw1rh = vis_fpadd16(yp1h,yp2h);  /* 2x vert. int.pol. right */
      ypw1rl = vis_fpadd16(yp1l,yp2l);
      ypw2rh = vis_fpadd16(yp2h,yp3h);
      ypw2rl = vis_fpadd16(yp2l,yp3l);
      yp1h = vis_fpadd16(yp1h,yp1h);   /* multiply by two to spare GSR change */
      yp1l = vis_fpadd16(yp1l,yp1l);
      yp2h = vis_fpadd16(yp2h,yp2h);
      yp2l = vis_fpadd16(yp2l,yp2l);
      ypw1lh = vis_fpadd16(ypw1lh,ypw1lh);
      ypw1ll = vis_fpadd16(ypw1ll,ypw1ll);
      ypw2lh = vis_fpadd16(ypw2lh,ypw2lh);
      ypw2ll = vis_fpadd16(ypw2ll,ypw2ll);
      y1h = vis_fpmerge(vis_read_hi(y1a),vis_fpack16(yp1h));
      y1l = vis_fpmerge(vis_read_lo(y1a),vis_fpack16(yp1l));
      y2h = vis_fpmerge(vis_fpack16(ypw1lh),vis_fpack16(ypw1rh));
      y2l = vis_fpmerge(vis_fpack16(ypw1ll),vis_fpack16(ypw1rl));
      y3h = vis_fpmerge(vis_read_hi(y2a),vis_fpack16(yp2h));
      y3l = vis_fpmerge(vis_read_lo(y2a),vis_fpack16(yp2l));
      y4h = vis_fpmerge(vis_fpack16(ypw2lh),vis_fpack16(ypw2rh));
      y4l = vis_fpmerge(vis_fpack16(ypw2ll),vis_fpack16(ypw2rl));
      vis_write_gsr(((7-4)<<3) | 1);
 
      /* Prepare u11-u14 & v11-v14 for y11-y18 & y21-y28 */
      /* Convert and write y11-y14 and y21-y24 */
      /* Convert and write y15-y18 and y25-y28 */
      PREP_UV(vis_fpmerge_hi,u1,v1);
      VISCONV(vis_fexpand_hi,y1h,y2h,vfh,ufh,uvfh,dst,yw*8);
      VISCONV(vis_fexpand_lo,y1h,y2h,vfl,ufl,uvfl,dst+16,yw*8);
 
      /* Prepare u21-u24 & v21-v24 for y31-y38 & y41-y48 */
      /* Convert and write y31-y34 and y41-y44 */
      /* Convert and write y35-y38 and y45-y48 */
      PREP_UV(vis_fpmerge_hi,u2,v2);
      VISCONV(vis_fexpand_hi,y3h,y4h,vfh,ufh,uvfh,dst+yw*8*2,yw*8);
      VISCONV(vis_fexpand_lo,y3h,y4h,vfl,ufl,uvfl,dst+yw*8*2+16,yw*8);
      dst+=32;
 
      PREP_UV(vis_fpmerge_lo,u1,v1);
      VISCONV(vis_fexpand_hi,y1l,y2l,vfh,ufh,uvfh,dst,yw*8);
      VISCONV(vis_fexpand_lo,y1l,y2l,vfl,ufl,uvfl,dst+16,yw*8);
      PREP_UV(vis_fpmerge_lo,u2,v2);
      VISCONV(vis_fexpand_hi,y3l,y4l,vfh,ufh,uvfh,dst+yw*8*2,yw*8);
      VISCONV(vis_fexpand_lo,y3l,y4l,vfl,ufl,uvfl,dst+yw*8*2+16,yw*8);
      dst+=32;
 
      vis_write_gsr(((7-6)<<3) | 1);
    }
    /* dst window-width is yw*4 (due to RGB) * 2 (due to int.pol) */
    dst += yw*8 * 3;
    py += yw/8;
  }
}

# else /* not BUGGY */

/*
 *  Same as above, but use different inner loops for interpolation (upscale)
 *  and YUV-conversion. Even conversion is done in seperate loops for different
 *  scanlines. Use small buffer to store intermediate results.
 *  This is due to a compiler bug.
 */

#define PREP_UV(u_in,v_in) \
    { \
      u = vis_fpmerge((u_in),(u_in)); \
      v = vis_fpmerge((v_in),(v_in)); \
      ufh = vis_fmul8x16al_hi(u, Crb_facs); \
      vfh = vis_fmul8x16au_hi(v, Crb_facs); \
      uvfh = vis_fpadd16(vis_fmul8x16au_hi(v, Crbg_facs), \
                         vis_fmul8x16al_hi(u, Crbg_facs)); \
      ufh = vis_fpsub16(ufh, poff_b); \
      vfh = vis_fpsub16(vfh, poff_r); \
      ufl = vis_fmul8x16al(vis_read_lo(u), Crb_facs); \
      vfl = vis_fmul8x16au(vis_read_lo(v), Crb_facs); \
      uvfl = vis_fpadd16(vis_fmul8x16au_lo(v, Crbg_facs), \
                         vis_fmul8x16al_lo(u, Crbg_facs)); \
      ufl = vis_fpsub16(ufl, poff_b); \
      vfl = vis_fpsub16(vfl, poff_r); \
    };
 
#define VISCONV(half,y1_in,y2_in,vf,uf,uvf,dp,dwid) \
    { \
      y = vis_fexpand(half(y1_in)); \
      r4 = vis_fpadd16(y, (vf)); \
      b4 = vis_fpadd16(y, (uf)); \
      g4 = vis_fpsub16(y, (uvf)); \
      g4 = vis_fpadd16(g4, poff_g); \
      ag = vis_fpmerge(vis_fzeros(), vis_fpack16(g4)); \
      br = vis_fpmerge(vis_fpack16(b4), vis_fpack16(r4)); \
      vis_st64(vis_fpmerge(vis_read_hi(ag),vis_read_hi(br)), (dp)); \
      vis_st64(vis_fpmerge(vis_read_lo(ag),vis_read_lo(br)), (dp)+8); \
      y = vis_fexpand(half(y2_in)); \
      r4 = vis_fpadd16(y, (vf)); \
      b4 = vis_fpadd16(y, (uf)); \
      g4 = vis_fpsub16(y, (uvf)); \
      g4 = vis_fpadd16(g4, poff_g); \
      ag = vis_fpmerge(vis_fzeros(), vis_fpack16(g4)); \
      br = vis_fpmerge(vis_fpack16(b4), vis_fpack16(r4)); \
      vis_st64(vis_fpmerge(vis_read_hi(ag), vis_read_hi(br)), (dp)+(dwid)); \
      vis_st64(vis_fpmerge(vis_read_lo(ag), vis_read_lo(br)), (dp)+(dwid)+8); \
    };
 
static void yuv4rgb_vis (Picture *p, unsigned char *dst)
{
  vis_d64 *py;
  vis_f32 *pu,*pv;
  int yw, uvw;
  int i,j;
  vis_d64 y1a,y1b,y2a,y2b,y3a,y3b,y1lh,y1ll,y2lh,y2ll,y3lh,y3ll,y1r,y2r,y3r,
          y1rh,y1rl,y2rh,y2rl,y3rh,y3rl,yp1h,yp1l,yp2h,yp2l,yp3h,yp3l,
          ypw1lh,ypw1ll,ypw2lh,ypw2ll,ypw1rh,ypw1rl,ypw2rh,ypw2rl,
          y1h,y1l,y2h,y2l,y3h,y3l,y4h,y4l,
          u1l,v1l,u2l,v2l,u1r,u2r,v1r,v2r,up1,up2,upw1,upw2,
          vp1,vp2,vpw1,vpw2,u1,v1,u2,v2;
  vis_f32 u1a,v1a,u2a,v2a,u1b,v1b,u2b,v2b;
  vis_f32 Crb_facs = vis_to_float(((22970/4)<<16)|(29032/4)),
          Crbg_facs = vis_to_float(((11700/4)<<16)|(5638/4));
  vis_d64 poff_r = vis_to_double_quad(128 * (22970/4) / 256),
          poff_b = vis_to_double_quad(128 * (29032/4) / 256),
          poff_g = vis_to_double_quad(128 * ((11700+5638)/4) / 256),
          exscale = vis_to_double_quad(1<<(4+8)),
          null = vis_fzero();
  vis_d64 y,u,v,ufh,ufl,vfh,vfl,uvfh,uvfl,r4,g4,b4,ag,br;
 
  static vis_d64 uar[512],var[512],yar[2048];
  vis_d64 *au,*av,*ay;
  vis_f32 *fu,*fv;
 
  pu = (vis_f32 *)p->u;
  pv = (vis_f32 *)p->v;
  py = (vis_d64 *)p->y;
  yw = p->w;
  uvw = p->w/2;
 
  vis_write_gsr(((7-6)<<3) | 1);    /* divide by 4*16, shift by 1 */
 
  for (j=p->h/2; j; j--) {
    au = uar;
    av = var;
    ay = yar;
    vis_write_gsr(((7-6)<<3) | 1);
 
    for (i=uvw/4; i; i--) {
      /*
       *  Load 4 U & V pels, interpolate to 2 x 8 each
       */
      u1a = *pu;
      v1a = *pv;
      u2a = *(pu+uvw/sizeof(u2a)); pu++;
      v2a = *(pv+uvw/sizeof(v2a)); pv++;
      u1b = *pu;
      v1b = *pv;
      u2b = *(pu+uvw/sizeof(u2b));
      v2b = *(pv+uvw/sizeof(v2b));
 
      y1a = *py;
      y2a = *(py+yw/sizeof(y2a));
      y3a = *(py+yw/sizeof(y2a)*2); py++;
      y1b = *py;
      y2b = *(py+yw/sizeof(y2b));
      y3b = *(py+yw/sizeof(y2b)*2);
 
      null = vis_fzero();
      u1l = vis_fexpand(u1a);
      v1l = vis_fexpand(v1a);
      u2l = vis_fexpand(u2a);
      v2l = vis_fexpand(v2a);
      u1r = vis_fexpand_hi(vis_faligndata(vis_freg_pair(u1a,u1b),null));
      u2r = vis_fexpand_hi(vis_faligndata(vis_freg_pair(u2a,u2b),null));
      v1r = vis_fexpand_hi(vis_faligndata(vis_freg_pair(v1a,v1b),null));
      v2r = vis_fexpand_hi(vis_faligndata(vis_freg_pair(v2a,v2b),null));
      up1 = vis_fpadd16(u1l,u1r);  /* hor intpol current line */
      up2 = vis_fpadd16(u2l,u2r);  /* hor intpol next line */
      upw1 = vis_fpadd16(u1l,u2l); /* vert intpol left pel */
      upw2 = vis_fpadd16(up1,up2); /* vert intpol right pel */
      vp1 = vis_fpadd16(v1l,v1r);  /* hor intpol current line */
      vp2 = vis_fpadd16(v2l,v2r);  /* hor intpol next line */
      vpw1 = vis_fpadd16(v1l,v2l); /* vert intpol left pel */
      vpw2 = vis_fpadd16(vp1,vp2); /* vert intpol right pel */
      /* multiply values that needed to be divided by 2 instead of 4 with 2 */
      up1 = vis_fpadd16(up1,up1);
      vp1 = vis_fpadd16(vp1,vp1);
      upw1 = vis_fpadd16(upw1,upw1);
      vpw1 = vis_fpadd16(vpw1,vpw1);
      u1 = vis_fpmerge(u1a,vis_fpack16(up1));
      v1 = vis_fpmerge(v1a,vis_fpack16(vp1));
      u2 = vis_fpmerge(vis_fpack16(upw1),vis_fpack16(upw2));
      v2 = vis_fpmerge(vis_fpack16(vpw1),vis_fpack16(vpw2));
      *au = u1; *(au+(uvw/4)) = u2; au++;
      *av = v1; *(av+(uvw/4)) = v2; av++;
 
      /*
       *  Load 2 scanlines of 8 Y pels, interpolate to 4 x 16
       */
      y1lh = vis_fexpand_hi(y1a);  /* original values */
      y1ll = vis_fexpand_lo(y1a);
      y2lh = vis_fexpand_hi(y2a);
      y2ll = vis_fexpand_lo(y2a);
      y3lh = vis_fexpand_hi(y3a);
      y3ll = vis_fexpand_lo(y3a);
      y1r = vis_faligndata(y1a,y1b);
      y2r = vis_faligndata(y2a,y2b);
      y3r = vis_faligndata(y3a,y3b);
      y1rh = vis_fexpand_hi(y1r);  /* shifted one left */
      y1rl = vis_fexpand_lo(y1r);
      y2rh = vis_fexpand_hi(y2r);
      y2rl = vis_fexpand_lo(y2r);
      y3rh = vis_fexpand_hi(y3r);
      y3rl = vis_fexpand_lo(y3r);
      yp1h = vis_fpadd16(y1lh,y1rh);  /* add orig & shift for hor. int.pol. */
      yp1l = vis_fpadd16(y1ll,y1rl);
      yp2h = vis_fpadd16(y2lh,y2rh);
      yp2l = vis_fpadd16(y2ll,y2rl);
      yp3h = vis_fpadd16(y3lh,y3rh);
      yp3l = vis_fpadd16(y3ll,y3rl);
      ypw1lh = vis_fpadd16(y1lh,y2lh);  /* 2x vert. int.pol. left */
      ypw1ll = vis_fpadd16(y1ll,y2ll);
      ypw2lh = vis_fpadd16(y2lh,y3lh);
      ypw2ll = vis_fpadd16(y2ll,y3ll);
      ypw1rh = vis_fpadd16(yp1h,yp2h);  /* 2x vert. int.pol. right */
      ypw1rl = vis_fpadd16(yp1l,yp2l);
      ypw2rh = vis_fpadd16(yp2h,yp3h);
      ypw2rl = vis_fpadd16(yp2l,yp3l);
      yp1h = vis_fpadd16(yp1h,yp1h);   /* multiply by two to spare GSR change */
      yp1l = vis_fpadd16(yp1l,yp1l);
      yp2h = vis_fpadd16(yp2h,yp2h);
      yp2l = vis_fpadd16(yp2l,yp2l);
      ypw1lh = vis_fpadd16(ypw1lh,ypw1lh);
      ypw1ll = vis_fpadd16(ypw1ll,ypw1ll);
      ypw2lh = vis_fpadd16(ypw2lh,ypw2lh);
      ypw2ll = vis_fpadd16(ypw2ll,ypw2ll);
      *ay = vis_fpmerge(vis_read_hi(y1a),vis_fpack16(yp1h));      /* y1h */
      *(ay+uvw) =
            vis_fpmerge(vis_read_hi(y2a),vis_fpack16(yp2h));      /* y3h */
      ay++;
      *ay = vis_fpmerge(vis_fpack16(ypw1lh),vis_fpack16(ypw1rh)); /* y2h */
      *(ay+uvw) =
            vis_fpmerge(vis_fpack16(ypw2lh),vis_fpack16(ypw2rh)); /* y4h */
      ay++;
      *ay = vis_fpmerge(vis_read_lo(y1a),vis_fpack16(yp1l));      /* y1l */
      *(ay+uvw) =
            vis_fpmerge(vis_read_lo(y2a),vis_fpack16(yp2l));      /* y3l */
      ay++;
      *ay = vis_fpmerge(vis_fpack16(ypw1ll),vis_fpack16(ypw1rl)); /* y2l */
      *(ay+uvw) =
            vis_fpmerge(vis_fpack16(ypw2ll),vis_fpack16(ypw2rl)); /* y4l */
      ay++;
    }
    vis_write_gsr(((7-4)<<3) | 1);
    fu = (vis_f32 *)uar;
    fv = (vis_f32 *)var;
    ay = yar;
 
    /* first two scanlines */
    for (i=uvw/2; i; i--) {
      vis_f32 up = *fu++,
              vp = *fv++;
      y1h = *ay++;
      y2h = *ay++;
      PREP_UV(up,vp);
      VISCONV(vis_read_hi,y1h,y2h,vfh,ufh,uvfh,dst,yw*8);
      VISCONV(vis_read_lo,y1h,y2h,vfl,ufl,uvfl,dst+16,yw*8);
      dst +=32;
    }
    dst += yw*8;
 
    /* next two scanlines */
    for (i=uvw/2; i; i--) {
      vis_f32 up = *fu++,
              vp = *fv++;
      y1h = *ay++;
      y2h = *ay++;
      PREP_UV(up,vp);
      VISCONV(vis_read_hi,y1h,y2h,vfh,ufh,uvfh,dst,yw*8);
      VISCONV(vis_read_lo,y1h,y2h,vfl,ufl,uvfl,dst+16,yw*8);
      dst +=32;
    }
    dst += yw*8;
    /* dst window-width is yw*4 (due to aRGB) * 2 (due to int.pol) */
    py += yw/8;
  }
}
# endif /* not BUGGY */
#endif /* !WITHOUT_VIS */

/* ------------------------------------------------------------------------- */
/*
 *  Interpolate complete Y frame with simple [1/2,1/2] weighting
 *  for combined grayscale & expand mode
 *  (hence YUV conversion not needed)
 *
 *  Each pixel *ii is up-scaled to 4 pixels in *oi as follows:
 *
 *    *oi          = *ii;
 *    *(oi+1)      = (*ii + *(ii+1) + 1) / 2;
 *    *(oi+iw*2)   = (*ii + *(ii+iw) + 1) / 2;
 *    *(oi+iw*2+1) = (*ii + *(ii+1) + *(ii+iw) + *(ii+iw+1) + 2) / 4;
 */

static void expand_gray(Byte *ii, Byte *oi, int iw, int ih)
{
  int ow = iw*2/4;
  int i,j;
  unsigned int yc = 0;
  unsigned int ycp1,ycp2,ycp3,ycp4,yn,ynp1,ynp2,ynp3,ynp4;
  unsigned long *out = (unsigned long*)oi;

  for (j=ih-1; j; j--) {
    yc = *ii;
    yn = *(ii+iw);
    ii += 1;
    for (i=iw/4-1; i; i--) {
      ycp1 = *ii;
      ycp2 = *(ii+1);
      ycp3 = *(ii+2);
      ycp4 = *(ii+3);
      ynp1 = *(ii+iw);
      ynp2 = *(ii+iw+1);
      ynp3 = *(ii+iw+2);
      ynp4 = *(ii+iw+3);
      ii += 4;

      *out =
        (yc<<24) | (ycp1<<8) |
	((((((yc + ycp1)<<16)|(ycp1 + ycp2)) + 0x10001) >> 1) & 0xff00ff);
      *(out+1) =
        (ycp2<<24) | (ycp3<<8) |
        ((((((ycp2 + ycp3)<<16)|(ycp3 + ycp4)) + 0x10001) >> 1) & 0xff00ff);
      *(out+ow) =
        (((((yc + yn)<<23)|((ycp1 + ynp1)<<7)) + 0x800080) & 0xff00ff00) |
        ((((((yc + ycp1 + yn + ynp1)<<16) | (ycp1 + ycp2 + ynp1 + ynp2))
	  + 0x20002) >> 2) & 0xff00ff);
      *(out+ow+1) =
	(((((ycp2 + ynp2)<<23)|((ycp3 + ynp3)<<7)) + 0x800080) & 0xff00ff00) |
	((((((ycp2 + ycp3 + ynp2 + ynp3)<<16) | (ycp3 + ycp4 + ynp3 + ynp4))
	  + 0x20002) >> 2) & 0xff00ff);
      out += 2;

      yc = ycp4;
      yn = ynp4;
    }
    /* Rightmost pixel of scanline
     */
    ycp1 = *ii;
    ycp2 = *(ii+1);
    ycp3 = *(ii+2);
    ynp1 = *(ii+iw);
    ynp2 = *(ii+iw+1);
    ynp3 = *(ii+iw+2);
    ii += 3;

    *out =
      (yc<<24) | (ycp1<<8) |
      ((((((yc + ycp1)<<16)|(ycp1 + ycp2)) + 0x10001) >> 1) & 0xff00ff);
    *(out+1) =
      (ycp2<<24) | (((ycp2 + ycp3 + 1)&0x1fe)<<15) | (ycp3<<8) | ycp3;
    *(out+ow) =
      (((((yc + yn)<<23)|((ycp1 + ynp1)<<7)) + 0x800080) & 0xff00ff00) |
      ((((((yc + ycp1 + yn + ynp1)<<16) | (ycp1 + ycp2 + ynp1 + ynp2))
	+ 0x20002) >> 2) & 0xff00ff);
    *(out+ow+1) =
      (((((ycp2 + ynp2)<<23)|((ycp3 + ynp3)<<7)) + 0x800080) & 0xff00ff00) |
      ((((((ycp2 + ycp3 + ynp2 + ynp3)<<16) | (ycp3 + ycp3 + ynp3 + ynp3))
	+ 0x20002) >> 2) & 0xff00ff);
    out += 2 + ow;
  }
  /*  Last scanline: must not read scanline below for interpolation
   */
  for (i=iw/4-1; i; i--) {
    ycp1 = *ii;
    ycp2 = *(ii+1);
    ycp3 = *(ii+2);
    ycp4 = *(ii+3);
    ii += 4;

    *out =
    *(out+ow) =
      (yc<<24) | (ycp1<<8) |
      ((((((yc + ycp1)<<16)|(ycp1 + ycp2)) + 0x10001) >> 1) & 0xff00ff);
    *(out+1) =
    *(out+ow+1) =
      (ycp2<<24) | (ycp3<<8) |
      ((((((ycp2 + ycp3)<<16)|(ycp3 + ycp4)) + 0x10001) >> 1) & 0xff00ff);
    out += 2;

    yc = ycp4;
  }
  /*  Last pixel of last scanline
   */
  ycp1 = *ii;
  ycp2 = *(ii+1);
  ycp3 = *(ii+2);

  *out =
  *(out+ow) =
    (yc<<24) | (ycp1<<8) |
    ((((((yc + ycp1)<<16)|(ycp1 + ycp2)) + 0x10001) >> 1) & 0xff00ff);
  *(out+1) =
  *(out+ow+1) =
    (ycp2<<24) | (((ycp2 + ycp3 + 1)&0x1fe)<<15) | (ycp3<<8) | ycp3;
}

#ifndef WITHOUT_VIS

/* ------------------------------------------------------------------------- */
/*
 *  Same as above, but usig VIS
 */

static void expand_gray_vis(Byte *ii, Byte *oi, int iw, int ih)
{
  vis_d64 y1a,y2a,y3a,y1b,y2b,y3b,y1lh,y1ll,y2lh,y2ll,y3lh,y3ll,y1r,y2r,y3r,
          y1rh,y1rl,y2rh,y2rl,y3rh,y3rl,yp1h,yp1l,yp2h,yp2l,yp3h,yp3l,
          ypw1lh,ypw1ll,ypw2lh,ypw2ll,ypw1rh,ypw1rl,ypw2rh,ypw2rl,
          y1h,y1l,y2h,y2l,y3h,y3l,y4h,y4l;
  Byte *inp,*outp;
  int ow;
  int i,j;
 
  vis_write_gsr(((7-6)<<3) | 1);    /* divide by 4*16, shift by 1 */
  ow = 2*iw;
 
  for (j=ih/2; j; j--) {
    outp = oi; oi += 4*ow;
    inp = ii; ii += 2*iw;
    for (i=iw/8; i; i--) {
      y1a = vis_ld64(inp);
      y2a = vis_ld64(inp+iw);
      y3a = vis_ld64(inp+2*iw); inp+=8;
      y1b = vis_ld64(inp);
      y2b = vis_ld64(inp+iw);
      y3b = vis_ld64(inp+2*iw);
      y1lh = vis_fexpand_hi(y1a);  /* original values */
      y1ll = vis_fexpand_lo(y1a);
      y2lh = vis_fexpand_hi(y2a);
      y2ll = vis_fexpand_lo(y2a);
      y3lh = vis_fexpand_hi(y3a);
      y3ll = vis_fexpand_lo(y3a);
      y1r = vis_faligndata(y1a,y1b);
      y2r = vis_faligndata(y2a,y2b);
      y3r = vis_faligndata(y3a,y3b);
      y1rh = vis_fexpand_hi(y1r);  /* shifted one left */
      y1rl = vis_fexpand_lo(y1r);
      y2rh = vis_fexpand_hi(y2r);
      y2rl = vis_fexpand_lo(y2r);
      y3rh = vis_fexpand_hi(y3r);
      y3rl = vis_fexpand_lo(y3r);
      yp1h = vis_fpadd16(y1lh,y1rh);  /* add orig & shift for hor. int.pol. */
      yp1l = vis_fpadd16(y1ll,y1rl);
      yp2h = vis_fpadd16(y2lh,y2rh);
      yp2l = vis_fpadd16(y2ll,y2rl);
      yp3h = vis_fpadd16(y3lh,y3rh);
      yp3l = vis_fpadd16(y3ll,y3rl);
      ypw1lh = vis_fpadd16(y1lh,y2lh);  /* 2x vert. int.pol. left */
      ypw1ll = vis_fpadd16(y1ll,y2ll);
      ypw2lh = vis_fpadd16(y2lh,y3lh);
      ypw2ll = vis_fpadd16(y2ll,y3ll);
      ypw1rh = vis_fpadd16(yp1h,yp2h);  /* 2x vert. int.pol. right */
      ypw1rl = vis_fpadd16(yp1l,yp2l);
      ypw2rh = vis_fpadd16(yp2h,yp3h);
      ypw2rl = vis_fpadd16(yp2l,yp3l);
      yp1h = vis_fpadd16(yp1h,yp1h);   /* multiply by two to spare GSR change */
      yp1l = vis_fpadd16(yp1l,yp1l);
      yp2h = vis_fpadd16(yp2h,yp2h);
      yp2l = vis_fpadd16(yp2l,yp2l);
      ypw1lh = vis_fpadd16(ypw1lh,ypw1lh);
      ypw1ll = vis_fpadd16(ypw1ll,ypw1ll);
      ypw2lh = vis_fpadd16(ypw2lh,ypw2lh);
      ypw2ll = vis_fpadd16(ypw2ll,ypw2ll);
      y1h = vis_fpmerge(vis_read_hi(y1a),vis_fpack16(yp1h));
      y1l = vis_fpmerge(vis_read_lo(y1a),vis_fpack16(yp1l));
      y2h = vis_fpmerge(vis_fpack16(ypw1lh),vis_fpack16(ypw1rh));
      y2l = vis_fpmerge(vis_fpack16(ypw1ll),vis_fpack16(ypw1rl));
      y3h = vis_fpmerge(vis_read_hi(y2a),vis_fpack16(yp2h));
      y3l = vis_fpmerge(vis_read_lo(y2a),vis_fpack16(yp2l));
      y4h = vis_fpmerge(vis_fpack16(ypw2lh),vis_fpack16(ypw2rh));
      y4l = vis_fpmerge(vis_fpack16(ypw2ll),vis_fpack16(ypw2rl));
      vis_st64(y1h, outp);      vis_st64(y1l, outp+8);
      vis_st64(y2h, outp+ow);   vis_st64(y2l, outp+ow+8);
      vis_st64(y3h, outp+2*ow); vis_st64(y3l, outp+2*ow+8);
      vis_st64(y4h, outp+3*ow); vis_st64(y4l, outp+3*ow+8);
      outp+=16;
    }
  }
}
#endif /* !WITHOUT_VIS */

/* ------------------------------------------------------------------------- */
/*
 *  Dispatcher
 */

#ifndef WITHOUT_THREADS
static void yuv2rgb_thread(Picture *p)
{
#else
void yuv2rgb(Picture *p)
{
#endif
  /*  return; */
#ifndef WITHOUT_DGA
  int fbx, fby, fbw, fbh;

  if(Options.use_dga && dga_id) {
    DGA_DRAW_LOCK(dga_id, (short)-1);
    dga_draw_bbox(dga_id, &fbx,&fby,&fbw,&fbh);

    switch(dga_draw_visibility(dga_id)) {
      case DGA_VIS_PARTIALLY_OBSCURED:
      {
	/* get clipping rectangles */
	int x0,y0,x1,y1;
	short *ptr;

	ptr = dga_draw_clipinfo(dga_id);
	while((y0 = *ptr++) != DGA_Y_EOL) {
	  y1 = *ptr++;
	  while((x0 = *ptr++) != DGA_X_EOL) {
	    x1 = *ptr++;
	    /* printf("rec from (%d,%d) to (%d,%d)\n", x0,y0,x1,y1); */
	    yuv2rgb_table_dga(p,
	      fbx,fby, x0-fbx, y0-fby, x1-x0-1, y1-y0);
	  }
	}
      }; break;
      case DGA_VIS_UNOBSCURED:
	/* printf("%d,%d,%d,%d\n", fbx,fby,fbw,fbh); */
	yuv2rgb_table_dga(p, fbx,fby, 0,0, fbw, fbh);
	break;
      case DGA_VIS_FULLY_OBSCURED:
        /* printf("window invisible\n"); */
	break;
      default:
        fprintf(stderr, "DGA visability error\n");
    }
    DGA_DRAW_UNLOCK(dga_id);
  }
  else
#endif
  if(visdepth >= 24) {
#ifndef WITHOUT_VIS
    if (Options.use_vis)
      {
	if(Options.visexpand) yuv4rgb_vis(p, (unsigned char *)rgb_image);
	else yuv2rgb_vis(p, (unsigned long *)rgb_image);
      }
    else
#endif /* !WITHOUT_VIS */
      {
	yuv2rgb_table(p, (unsigned long *)rgb_image);
	/* BUG yuv4rgb is buggy!!! Produces artefacts at the bottom of the frame */
	/*
	if(Options.visexpand) yuv4rgb(p, (unsigned long *)rgb_image);
	else yuv2rgb_table(p, (unsigned long *)rgb_image);
	*/
	/*
	yuv2rgb_naive(p, (unsigned long *)rgb_image);
	*/
      }
    display_image();
  }
  else if(Options.visgray) {
    if(Options.visexpand) {
#ifndef WITHOUT_VIS
      if (Options.use_vis)
	expand_gray_vis(p->y, (Byte *)rgb_image,
			p->w, p->h);
      else
#endif
	expand_gray(p->y, (Byte *)rgb_image,
		    p->w, p->h);
    }
    else rgb_image = p->y;  /* no conversion needed */
    display_image();
  }
  else {
    ditherframe(p, rgb_image);
    display_image();
  }
}

/*
 *  Thread management
 */

#ifndef WITHOUT_THREADS
void yuv2rgb(Picture *p)
{
  static int running = 0;	/* XXX: Fixme: Die! Die! Die! */
  thread_t ptid;

  if (Options.thread_yuv)
    {
      if(running) thr_join(0, NULL, NULL);	/* XXX: no globals please! */

      if(thr_create(NULL, 0, (void *(*)(void *))yuv2rgb_thread, (void *)p, THR_NEW_LWP, &ptid) != 0){
	perror("thr_create");
      }
      else running = 1;		/* XXX: no globals please! */
    }
  else
    yuv2rgb_thread(p);
}
#endif /* !WITHOUT_THREADS */
