dct.cpp 一个可以在局域网进行视频聊天的源代码 - 语音压缩源码

www.gusucode.com > 一个可以在局域网进行视频聊天的源代码 > 一个可以在局域网进行视频聊天的源代码/VoIP/encoder/dct.cpp
    ////////////////////////////////////////////////////////////////////////////
//
//
//    Project     : VideoNet version 1.1.
//    Description : Peer to Peer Video Conferencing over the LAN.
//	  Author      :	Nagareshwar Y Talekar ( nsry2002@yahoo.co.in)
//    Date        : 15-6-2004.
//
//    I have converted origional fast h.263 encoder library from C to C++ 
//	  so that it can be integrated into any windows application easily.
//	  I have removed some of unnecessary codes/files from the
//	  fast h263 library.Also moved definitions and declarations
//	  in their proper .h and .cpp files.
//
//    File description : 
//    Name    : dct.cpp
//
//
/////////////////////////////////////////////////////////////////////////////

/*************************************************
 * libr263: fast H.263 encoder library
 *
 * Copyright (C) 1996, Roalt Aalmoes, Twente University
 * SPA multimedia group
 *
 * Based on Telenor TMN 1.6 encoder (Copyright (C) 1995, Telenor R&D)
 * created by Karl Lillevold 
 *
 * Author encoder: Roalt Aalmoes, <aalmoes@huygens.nl>
 * 
 * Date: 31-07-96
 **************************************************/

/*****************************************************************
 * Some routines are translated from Gisle Bj鴑tegaards's FORTRAN
 * routines by Robert.Danielsen@nta.no
 *
 *****************************************************************/

/*****************************************************************
 * This source includes sources from Berkeley's MPEG-1 encoder
 * which are copyright of Berkeley University, California, USA
 *****************************************************************/


#include "dct.h"


#include <math.h>

#ifndef PI
# ifdef M_PI
#  define PI M_PI
# else
#  define PI 3.14159265358979323846
# endif
#endif

int	zigzag[8][8] = {
  {0, 1, 5, 6,14,15,27,28},
  {2, 4, 7,13,16,26,29,42},
  {3, 8,12,17,25,30,41,43},
  {9,11,18,24,31,40,44,53},
  {10,19,23,32,39,45,52,54},
  {20,22,33,38,46,51,55,60},
  {21,34,37,47,50,56,59,61},
  {35,36,48,49,57,58,62,63},
};

#ifndef FASTDCT

/**********************************************************************
 *
 *	Name:		Dct
 *	Description:	Does dct on an 8x8 block, does zigzag-scanning of
 *			coefficients
 *
 *	Input:		64 pixels in a 1D array
 *	Returns:	64 coefficients in a 1D array
 *	Side effects:	
 *
 *	Date: 930128	Author: Robert.Danielsen@nta.no
 *
 **********************************************************************/

int Dct( int *block, int *coeff)
{
  int        j1, i, j, k;
  float	b[8];
  float 	b1[8];
  float 	d[8][8];
  float 	f0=.7071068,f1=.4903926,f2=.4619398,f3=.4157348,f4=.3535534;
  float 	f5=.2777851,f6=.1913417,f7=.0975452;

  for (i = 0, k = 0; i < 8; i++, k += 8) {
    for (j = 0; j < 8; j++) {
      b[j] = block[k+j];
    }
    /* Horizontal transform */
    for (j = 0; j < 4; j++) {
      j1 = 7 - j;
      b1[j] = b[j] + b[j1];
      b1[j1] = b[j] - b[j1];
    }
    b[0] = b1[0] + b1[3];
    b[1] = b1[1] + b1[2];
    b[2] = b1[1] - b1[2];
    b[3] = b1[0] - b1[3];
    b[4] = b1[4];
    b[5] = (b1[6] - b1[5]) * f0;
    b[6] = (b1[6] + b1[5]) * f0;
    b[7] = b1[7];
    d[i][0] = (b[0] + b[1]) * f4;
    d[i][4] = (b[0] - b[1]) * f4;
    d[i][2] = b[2] * f6 + b[3] * f2;
    d[i][6] = b[3] * f6 - b[2] * f2;
    b1[4] = b[4] + b[5];
    b1[7] = b[7] + b[6];
    b1[5] = b[4] - b[5];
    b1[6] = b[7] - b[6];
    d[i][1] = b1[4] * f7 + b1[7] * f1;
    d[i][5] = b1[5] * f3 + b1[6] * f5;
    d[i][7] = b1[7] * f7 - b1[4] * f1;
    d[i][3] = b1[6] * f3 - b1[5] * f5;
  }
  /* Vertical transform */
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 4; j++) {
      j1 = 7 - j;
      b1[j] = d[j][i] + d[j1][i];
      b1[j1] = d[j][i] - d[j1][i];
    }
    b[0] = b1[0] + b1[3];
    b[1] = b1[1] + b1[2];
    b[2] = b1[1] - b1[2];
    b[3] = b1[0] - b1[3];
    b[4] = b1[4];
    b[5] = (b1[6] - b1[5]) * f0;
    b[6] = (b1[6] + b1[5]) * f0;
    b[7] = b1[7];
    d[0][i] = (b[0] + b[1]) * f4;
    d[4][i] = (b[0] - b[1]) * f4;
    d[2][i] = b[2] * f6 + b[3] * f2;
    d[6][i] = b[3] * f6 - b[2] * f2;
    b1[4] = b[4] + b[5];
    b1[7] = b[7] + b[6];
    b1[5] = b[4] - b[5];
    b1[6] = b[7] - b[6];
    d[1][i] = b1[4] * f7 + b1[7] * f1;
    d[5][i] = b1[5] * f3 + b1[6] * f5;
    d[7][i] = b1[7] * f7 - b1[4] * f1;
    d[3][i] = b1[6] * f3 - b1[5] * f5;
  }
  /* Zigzag - scanning */
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++) {
      *(coeff + zigzag[i][j]) = (int)(d[i][j]);
    }
  }
  return 0;
}

#else
				/* Start of MPEG DCT operation */

typedef unsigned char uint8;
typedef char int8;
typedef unsigned short uint16;
typedef short int16;

#ifdef LONG_32		
typedef unsigned long uint32;
typedef long int32;
#else
typedef unsigned int uint32;
typedef int int32;
#endif

/* this is ansi.h */


#undef _ANSI_ARGS_
#undef const

#ifdef NON_ANSI_COMPILER
#define _ANSI_ARGS_(x)       ()
#define CONST

#else

#define _ANSI_ARGS_(x)   x
#define CONST const

#ifdef __cplusplus
#define VARARGS (...)
#else
#define VARARGS ()
#endif

#endif


#define DCTSIZE     8   /* you really don't want to change this */
#define DCTSIZE_SQ 64   /* you really don't want to change this */

#define DCTSIZE2    DCTSIZE*DCTSIZE
typedef short DCTELEM;
typedef DCTELEM DCTBLOCK[DCTSIZE2];
typedef DCTELEM DCTBLOCK_2D[DCTSIZE][DCTSIZE];

/* We assume that right shift corresponds to signed division by 2 with
 * rounding towards minus infinity.  This is correct for typical "arithmetic
 * shift" instructions that shift in copies of the sign bit.  But some
 * C compilers implement >> with an unsigned shift.  For these machines you
 * must define RIGHT_SHIFT_IS_UNSIGNED.
 * RIGHT_SHIFT provides a proper signed right shift of an int32 quantity.
 * It is only applied with constant shift counts.  SHIFT_TEMPS must be
 * included in the variables of any routine using RIGHT_SHIFT.
 */

#ifdef RIGHT_SHIFT_IS_UNSIGNED
#define SHIFT_TEMPS     int32 shift_temp

/*
#define RIGHT_SHIFT(x,shft)  ((shift_temp = (x)) < 0 ? (shift_temp >> (shft)) |((~((int32) 0)) << (32-(shft))) : (shift_temp >> (shft)))
     
	 */   

#else
#define SHIFT_TEMPS
#define RIGHT_SHIFT(x,shft)     ((x) >> (shft))
#endif
          
#define LG2_DCT_SCALE 16

#define ONE	((int32) 1)

#define DCT_SCALE (ONE << LG2_DCT_SCALE)

/* In some places we shift the inputs left by a couple more bits, */
/* so that they can be added to fractional results without too much */
     /* loss of precision. */
#define LG2_OVERSCALE 2
#define OVERSCALE  (ONE << LG2_OVERSCALE)
#define OVERSHIFT(x)  ((x) <<= LG2_OVERSCALE)

/* Scale a fractional constant by DCT_SCALE */
#define FIX(x)	((int32) ((x) * DCT_SCALE + 0.5))

/* Scale a fractional constant by DCT_SCALE/OVERSCALE */
/* Such a constant can be multiplied with an overscaled input */
/* to produce something that's scaled by DCT_SCALE */
#define FIXO(x)  ((int32) ((x) * DCT_SCALE / OVERSCALE + 0.5))

/* Descale and correctly round a value that's scaled by DCT_SCALE */
#define UNFIX(x)   RIGHT_SHIFT((x) + (ONE << (LG2_DCT_SCALE-1)), LG2_DCT_SCALE)

/* Same with an additional division by 2, ie, correctly rounded UNFIX(x/2) */
#define UNFIXH(x)  RIGHT_SHIFT((x) + (ONE << LG2_DCT_SCALE), LG2_DCT_SCALE+1)

/* Take a value scaled by DCT_SCALE and round to integer scaled by OVERSCALE */
#define UNFIXO(x)  RIGHT_SHIFT((x) + (ONE << (LG2_DCT_SCALE-1-LG2_OVERSCALE)),LG2_DCT_SCALE-LG2_OVERSCALE)


/* Here are the constants we need */
/* SIN_i_j is sine of i*pi/j, scaled by DCT_SCALE */
/* COS_i_j is cosine of i*pi/j, scaled by DCT_SCALE */

#define SIN_1_4 FIX(0.707106781)
#define COS_1_4 SIN_1_4

#define SIN_1_8 FIX(0.382683432)
#define COS_1_8 FIX(0.923879533)
#define SIN_3_8 COS_1_8
#define COS_3_8 SIN_1_8

#define SIN_1_16 FIX(0.195090322)
#define COS_1_16 FIX(0.980785280)
#define SIN_7_16 COS_1_16
#define COS_7_16 SIN_1_16

#define SIN_3_16 FIX(0.555570233)
#define COS_3_16 FIX(0.831469612)
#define SIN_5_16 COS_3_16
#define COS_5_16 SIN_3_16

/* OSIN_i_j is sine of i*pi/j, scaled by DCT_SCALE/OVERSCALE */
/* OCOS_i_j is cosine of i*pi/j, scaled by DCT_SCALE/OVERSCALE */

#define OSIN_1_4 FIXO(0.707106781)
#define OCOS_1_4 OSIN_1_4

#define OSIN_1_8 FIXO(0.382683432)
#define OCOS_1_8 FIXO(0.923879533)
#define OSIN_3_8 OCOS_1_8
#define OCOS_3_8 OSIN_1_8

#define OSIN_1_16 FIXO(0.195090322)
#define OCOS_1_16 FIXO(0.980785280)
#define OSIN_7_16 OCOS_1_16
#define OCOS_7_16 OSIN_1_16

#define OSIN_3_16 FIXO(0.555570233)
#define OCOS_3_16 FIXO(0.831469612)
#define OSIN_5_16 OCOS_3_16
#define OCOS_5_16 OSIN_3_16
     
/*==================*
 * TYPE DEFINITIONS *
 *==================*/

/*  
 *  your basic Block type
 */
typedef int32 Block[DCTSIZE][DCTSIZE];
typedef int32 FlatBlock[DCTSIZE_SQ];
typedef	    int32   LumBlock[2*DCTSIZE][2*DCTSIZE];
typedef	    int32   ChromBlock[DCTSIZE][DCTSIZE];



/* Prototypes */
void reference_fwd_dct(Block block, Block dest);

void mp_fwd_dct_fast(Block data2d, Block dest2d);

/*void mp_fwd_dct_fast _ANSI_ARGS_((int16 *data2d, int16 *dest2d));
 */
void init_fdct(void);

/*
 * --------------------------------------------------------------
 *
 * mp_fwd_dct_fast --
 *
 * Perform the forward DCT on one block of samples.
 *
 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT on each
 * column.
 *
 * Results: None
 *
 * Side effects: Overwrites the input data
 *
 * --------------------------------------------------------------
 */
/*
__inline__ voidmp_fwd_dct_fast(data2d, dest2d)
    Block data2d, dest2d;
    */
__inline__ void mp_fwd_dct_fast(Block data2d, Block dest2d)
{
    int32 *data = (int32 *) data2d;	/* this algorithm wants
					 * a 1-d array */
    int32 *dest = (int32 *) dest2d;
    int rowctr, columncounter;
    register int32 *inptr, *outptr;
    int32 workspace[DCTSIZE_SQ];
    int32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int32 tmp10, tmp11, tmp12, tmp13;
    int32 tmp14, tmp15, tmp16, tmp17;
    int32 tmp25, tmp26;

    SHIFT_TEMPS;

    /*
     * Each iteration of the inner loop performs one 8-point 1-D DCT. It
     * reads from a *row* of the input matrix and stores into a *column*
     * of the output matrix.  In the first pass, we read from the data[]
     * array and store into the local workspace[].  In the second pass,
     * we read from the workspace[] array and store into data[], thus
     * performing the equivalent of a columnar DCT pass with no variable
     * array indexing.
     */

    inptr = data;		/* initialize pointers for first pass */
    outptr = workspace;
   
    /* PASS ONE */
	
    for (rowctr = DCTSIZE - 1; rowctr >= 0; rowctr--) {
      /*
       * many tmps have nonoverlapping lifetime -- flashy
       * register colourers should be able to do this lot
       * very well
       */
      /* SHIFT_TEMPS */
      
      /* temp0 through tmp7:  -512 to +512 */
      /* if I-block, then -256 to +256 */
      tmp0 = inptr[7] + inptr[0];
      tmp1 = inptr[6] + inptr[1];
      tmp2 = inptr[5] + inptr[2];
      tmp3 = inptr[4] + inptr[3];
      tmp4 = inptr[3] - inptr[4];
      tmp5 = inptr[2] - inptr[5];
      tmp6 = inptr[1] - inptr[6];
      tmp7 = inptr[0] - inptr[7];
      
      /* tmp10 through tmp13:  -1024 to +1024 */
      /* if I-block, then -512 to +512 */
      tmp10 = tmp3 + tmp0;
      tmp11 = tmp2 + tmp1;
      tmp12 = tmp1 - tmp2;
      
      tmp13 = tmp0 - tmp3;
      
      outptr[0] = (int32) UNFIXH((tmp10 + tmp11) * SIN_1_4);
      outptr[DCTSIZE * 4] = (int32) UNFIXH((tmp10 - tmp11) * COS_1_4);
      
      outptr[DCTSIZE * 2] = (int32) UNFIXH(tmp13 * COS_1_8 + tmp12 * SIN_1_8);
      outptr[DCTSIZE * 6] = (int32) UNFIXH(tmp13 * SIN_1_8 - tmp12 * COS_1_8);
      
      tmp16 = UNFIXO((tmp6 + tmp5) * SIN_1_4);
      tmp15 = UNFIXO((tmp6 - tmp5) * COS_1_4);
      
      OVERSHIFT(tmp4);
      OVERSHIFT(tmp7);
      
      /*
       * tmp4, tmp7, tmp15, tmp16 are overscaled by
       * OVERSCALE
       */
      
      tmp14 = tmp4 + tmp15;
      tmp25 = tmp4 - tmp15;
      tmp26 = tmp7 - tmp16;
      tmp17 = tmp7 + tmp16;
      
      outptr[DCTSIZE] = (int32) UNFIXH(tmp17 * OCOS_1_16 + tmp14 * OSIN_1_16);
      outptr[DCTSIZE * 7] = (int32) UNFIXH(tmp17 * OCOS_7_16 - tmp14 * OSIN_7_16);
      outptr[DCTSIZE * 5] = (int32) UNFIXH(tmp26 * OCOS_5_16 + tmp25 * OSIN_5_16);
      outptr[DCTSIZE * 3] = (int32) UNFIXH(tmp26 * OCOS_3_16 - tmp25 * OSIN_3_16);
      
      inptr += DCTSIZE;	/* advance inptr to next row */
      outptr++;		/* advance outptr to next column */
    
    }

    /* end of pass; in case it was pass 1, set up for pass 2 */
    inptr = workspace;
    outptr = dest;

    columncounter = 0;
    /* PASS TWO */

    for (rowctr = DCTSIZE - 1; rowctr >= 0; rowctr--) {
      /*
       * many tmps have nonoverlapping lifetime -- flashy
       * register colourers should be able to do this lot
       * very well
       */
      /* SHIFT_TEMPS */
      
      /* temp0 through tmp7:  -512 to +512 */
      /* if I-block, then -256 to +256 */
      tmp0 = inptr[7] + inptr[0];
      tmp1 = inptr[6] + inptr[1];
      tmp2 = inptr[5] + inptr[2];
      tmp3 = inptr[4] + inptr[3];
      tmp4 = inptr[3] - inptr[4];
      tmp5 = inptr[2] - inptr[5];
      tmp6 = inptr[1] - inptr[6];
      tmp7 = inptr[0] - inptr[7];
      
      /* tmp10 through tmp13:  -1024 to +1024 */
      /* if I-block, then -512 to +512 */
      tmp10 = tmp3 + tmp0;
      tmp11 = tmp2 + tmp1;
      tmp12 = tmp1 - tmp2;
      
      tmp13 = tmp0 - tmp3;
      
      outptr[ zigzag[0][columncounter] ] = (int32) UNFIXH((tmp10 + tmp11) * SIN_1_4);
      outptr[ zigzag[4][columncounter] ] = (int32) UNFIXH((tmp10 - tmp11) * COS_1_4);
      
      outptr[ zigzag[2][columncounter] ] = (int32) UNFIXH(tmp13 * COS_1_8 + tmp12 * SIN_1_8);
      outptr[ zigzag[6][columncounter] ] = (int32) UNFIXH(tmp13 * SIN_1_8 - tmp12 * COS_1_8);
      
      tmp16 = UNFIXO((tmp6 + tmp5) * SIN_1_4);
      tmp15 = UNFIXO((tmp6 - tmp5) * COS_1_4);
      
      OVERSHIFT(tmp4);
      OVERSHIFT(tmp7);
      
      /*
       * tmp4, tmp7, tmp15, tmp16 are overscaled by
       * OVERSCALE
       */
      
      tmp14 = tmp4 + tmp15;
      tmp25 = tmp4 - tmp15;
      tmp26 = tmp7 - tmp16;
      tmp17 = tmp7 + tmp16;
      
      outptr[ zigzag[1][columncounter] ] = (int32) UNFIXH(tmp17 * OCOS_1_16 + tmp14 * OSIN_1_16);
      outptr[ zigzag[7][columncounter] ] = (int32) UNFIXH(tmp17 * OCOS_7_16 - tmp14 * OSIN_7_16);
      outptr[ zigzag[5][columncounter] ] = (int32) UNFIXH(tmp26 * OCOS_5_16 + tmp25 * OSIN_5_16);
      outptr[ zigzag[3][columncounter] ] = (int32) UNFIXH(tmp26 * OCOS_3_16 - tmp25 * OSIN_3_16);
      
      inptr += DCTSIZE;	/* advance inptr to next row */	
                   /*      outptr++;*/		/* advance outptr to next column */
      columncounter++;

    }
				/* END OF PASS TWO */


}

int Dct( int *block, int *coeff)
{

  mp_fwd_dct_fast( *(Block *) &block[0], *(Block *) &coeff[0]);

  return 0; 
}      

#endif /* End of ifnotdef FASTDCT */

#define FASTIDCT 10
#ifdef FASTIDCT

/**********************************************************************
 *
 *	Name:		idct
 *	Description:	Descans zigzag-scanned coefficients and does
 *			inverse dct on 64 coefficients
 *                      single precision floats
 *
 *	Input:		64 coefficients, block for 64 pixels
 *	Returns:    	0
 *	Side effects:	
 *
 *	Date: 930128	Author: Robert.Danielsen@nta.no
 *
 **********************************************************************/

int idct(int *coeff,int *block)
{
  int 	        j1, i, j;
  double 	b[8], b1[8], d[8][8];
  double 	f0=.7071068, f1=.4903926, f2=.4619398, f3=.4157348;
  double	        f4=.3535534;
  double 	f5=.2777851, f6=.1913417, f7=.0975452;
  double 	e, f, g, h;

  /* Horizontal */

  /* Descan coefficients first */

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++) {
      b[j] = *( coeff + zigzag[i][j]);
    }
    e = b[1] * f7 - b[7] * f1;
    h = b[7] * f7 + b[1] * f1;
    f = b[5] * f3 - b[3] * f5;
    g = b[3] * f3 + b[5] * f5;

    b1[0] = (b[0] + b[4]) * f4;
    b1[1] = (b[0] - b[4]) * f4;
    b1[2] = b[2] * f6 - b[6] * f2;
    b1[3] = b[6] * f6 + b[2] * f2;
    b[4] = e + f;
    b1[5] = e - f;
    b1[6] = h - g;
    b[7] = h + g;
    
    b[5] = (b1[6] - b1[5]) * f0;
    b[6] = (b1[6] + b1[5]) * f0;
    b[0] = b1[0] + b1[3];
    b[1] = b1[1] + b1[2];
    b[2] = b1[1] - b1[2];
    b[3] = b1[0] - b1[3];

    for (j = 0; j < 4; j++) {
      j1 = 7 - j;
      d[i][j] = b[j] + b[j1];
      d[i][j1] = b[j] - b[j1];
    }
  }

  /* Vertical */
  
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++) {
      b[j] = d[j][i];
    }
    e = b[1] * f7 - b[7] * f1;
    h = b[7] * f7 + b[1] * f1;
    f = b[5] * f3 - b[3] * f5;
    g = b[3] * f3 + b[5] * f5;

    b1[0] = (b[0] + b[4]) * f4;
    b1[1] = (b[0] - b[4]) * f4;
    b1[2] = b[2] * f6 - b[6] * f2;
    b1[3] = b[6] * f6 + b[2] * f2;
    b[4] = e + f;
    b1[5] = e - f;
    b1[6] = h - g;
    b[7] = h + g;

    b[5] = (b1[6] - b1[5]) * f0;
    b[6] = (b1[6] + b1[5]) * f0;
    b[0] = b1[0] + b1[3];
    b[1] = b1[1] + b1[2];
    b[2] = b1[1] - b1[2];
    b[3] = b1[0] - b1[3];

    for (j = 0; j < 4; j++) {
      j1 = 7 - j;
      d[j][i] = b[j] + b[j1];
      d[j1][i] = b[j] - b[j1];
    }
  }

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++) {
      *(block + i * 8 + j) = mnint(d[i][j]);
    }
  }
  return 0;
}

#elif VERYFASTIDCT
/*
 * tmndecode 
 * Copyright (C) 1995 Telenor R&D
 *                    Karl Olav Lillevold <kol@nta.no>                    
 *
 * based on mpeg2decode, (C) 1994, MPEG Software Simulation Group
 * and mpeg2play, (C) 1994 Stefan Eckart
 *                         <stefan@lis.e-technik.tu-muenchen.de>
 *
 */


/**********************************************************/
/* inverse two dimensional DCT, Chen-Wang algorithm       */
/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984)             */
/* 32-bit integer arithmetic (8 bit coefficients)         */
/* 11 mults, 29 adds per DCT                              */
/*                                      sE, 18.8.91       */
/**********************************************************/
/* coefficients extended to 12 bit for IEEE1180-1990      */
/* compliance                           sE,  2.1.94       */
/**********************************************************/

/* this code assumes >> to be a two's-complement arithmetic */
/* right shift: (-2)>>1 == -1 , (-3)>>1 == -2               */


#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
#define W7 565  /* 2048*sqrt(2)*cos(7*pi/16) */

/* private data */
static int iclip[1024]; /* clipping table */
static int *iclp;

/* private prototypes */
static void idctrow(int *blk);
static void idctcol(int *blk);

/* row (horizontal) IDCT
 *
 *           7                       pi         1
 * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
 *          l=0                      8          2
 *
 * where: c[0]    = 128
 *        c[1..7] = 128*sqrt(2)
 */

static void idctrow(int *blk)
{
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  /* shortcut */
  if (!((x1 = blk[4]<<11) | (x2 = blk[6]) | (x3 = blk[2]) |
        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3])))
  {
    blk[0]=blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=blk[0]<<3;
    return;
  }

  x0 = (blk[0]<<11) + 128; /* for proper rounding in the fourth stage */

  /* first stage */
  x8 = W7*(x4+x5);
  x4 = x8 + (W1-W7)*x4;
  x5 = x8 - (W1+W7)*x5;
  x8 = W3*(x6+x7);
  x6 = x8 - (W3-W5)*x6;
  x7 = x8 - (W3+W5)*x7;
  
  /* second stage */
  x8 = x0 + x1;
  x0 -= x1;
  x1 = W6*(x3+x2);
  x2 = x1 - (W2+W6)*x2;
  x3 = x1 + (W2-W6)*x3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;
  
  /* third stage */
  x7 = x8 + x3;
  x8 -= x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181*(x4+x5)+128)>>8;
  x4 = (181*(x4-x5)+128)>>8;
  
  /* fourth stage */
  blk[0] = (x7+x1)>>8;
  blk[1] = (x3+x2)>>8;
  blk[2] = (x0+x4)>>8;
  blk[3] = (x8+x6)>>8;
  blk[4] = (x8-x6)>>8;
  blk[5] = (x0-x4)>>8;
  blk[6] = (x3-x2)>>8;
  blk[7] = (x7-x1)>>8;
}

/* column (vertical) IDCT
 *
 *             7                         pi         1
 * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
 *            l=0                        8          2
 *
 * where: c[0]    = 1/1024
 *        c[1..7] = (1/1024)*sqrt(2)
 */
__inline__ static void idctcol(int *blk)
{
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  /* shortcut */
  if (!((x1 = (blk[32]<<8)) | (x2 = blk[48]) | (x3 = blk[16]) |
        (x4 = blk[8]) | (x5 = blk[56]) | (x6 = blk[40]) | (x7 = blk[24])))
  {
    blk[0]=blk[8]=blk[16]=blk[24]=blk[32]=blk[40]=blk[48]=blk[56]=
      iclp[(blk[0]+32)>>6];
    return;
  }

  x0 = (blk[8*0]<<8) + 8192;

  /* first stage */
  x8 = W7*(x4+x5) + 4;
  x4 = (x8+(W1-W7)*x4)>>3;
  x5 = (x8-(W1+W7)*x5)>>3;
  x8 = W3*(x6+x7) + 4;
  x6 = (x8-(W3-W5)*x6)>>3;
  x7 = (x8-(W3+W5)*x7)>>3;
  
  /* second stage */
  x8 = x0 + x1;
  x0 -= x1;
  x1 = W6*(x3+x2) + 4;
  x2 = (x1-(W2+W6)*x2)>>3;
  x3 = (x1+(W2-W6)*x3)>>3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;
  
  /* third stage */
  x7 = x8 + x3;
  x8 -= x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181*(x4+x5)+128)>>8;
  x4 = (181*(x4-x5)+128)>>8;
  
  /* fourth stage */
  blk[0] = iclp[(x7+x1)>>14];
  blk[8] = iclp[(x3+x2)>>14];
  blk[16] = iclp[(x0+x4)>>14];
  blk[24] = iclp[(x8+x6)>>14];
  blk[32] = iclp[(x8-x6)>>14];
  blk[40] = iclp[(x0-x4)>>14];
  blk[48] = iclp[(x3-x2)>>14];
  blk[56] = iclp[(x7-x1)>>14];
}

/* two dimensional inverse discrete cosine transform */
int idct(int *coeff, int *block)
{
  int i;
  extern int zigzag[8][8];
  int *block_ptr, *zigzag_ptr;

  block_ptr = block; zigzag_ptr = &(zigzag[0][0]);

  for (i = 0; i < 8; i++) {
    *(block_ptr++) = *(coeff + *(zigzag_ptr++));
    *(block_ptr++) = *(coeff + *(zigzag_ptr++));
    *(block_ptr++) = *(coeff + *(zigzag_ptr++));
    *(block_ptr++) = *(coeff + *(zigzag_ptr++));
    *(block_ptr++) = *(coeff + *(zigzag_ptr++));
    *(block_ptr++) = *(coeff + *(zigzag_ptr++));
    *(block_ptr++) = *(coeff + *(zigzag_ptr++));
    *(block_ptr++) = *(coeff + *(zigzag_ptr++));
    /* WAS:     block[j + i*8] = (int) *( coeff + zigzag[i][j]); */
  }

  for (i=0; i<8; i++)
    idctrow(block+8*i);
  
  for (i=0; i<8; i++)
    idctcol(block+i);

  
  return 0;
}

void init_idct()
{
  int i;

  iclp = iclip+512;
  for (i= -512; i<512; i++)
    iclp[i] = (i<-256) ? -256 : ((i>255) ? 255 : i);
}


#else

/*  Perform IEEE 1180 reference (64-bit floating point, separable 8x1
 *  direct matrix multiply) Inverse Discrete Cosine Transform
*/


/* Here we use math.h to generate constants.  Compiler results may
   vary a little */


/* private data */

/* cosine transform matrix for 8x1 IDCT */
static double c[8][8];

/* initialize DCT coefficient matrix */

void init_idctref()
{
  int freq, time;
  double scale;

  for (freq=0; freq < 8; freq++)
  {
    scale = (freq == 0) ? sqrt(0.125) : 0.5;
    for (time=0; time<8; time++)
      c[freq][time] = scale*cos((PI/8.0)*freq*(time + 0.5));
  }
}

/* perform IDCT matrix multiply for 8x8 coefficient block */

void idctref(int *coeff, int *block)
{
  int i, j, k, v;
  double partial_product;
  double tmp[64];
  int tmp2[64];
  extern int zigzag[8][8];

  for (i=0; i<8; i++)
    for (j=0; j<8; j++)
      tmp2[j+i*8] = *(coeff + zigzag[i][j]);

  for (i=0; i<8; i++)
    for (j=0; j<8; j++)
    {
      partial_product = 0.0;

      for (k=0; k<8; k++)
        partial_product+= c[k][j]*tmp2[8*i+k];

      tmp[8*i+j] = partial_product;
    }

  /* Transpose operation is integrated into address mapping by switching 
     loop order of i and j */

  for (j=0; j<8; j++)
    for (i=0; i<8; i++)
    {
      partial_product = 0.0;

      for (k=0; k<8; k++)
        partial_product+= c[k][i]*tmp[8*k+j];

      v = floor(partial_product+0.5);
      block[8*i+j] = (v<-256) ? -256 : ((v>255) ? 255 : v);
    }


}
#endif