//------------------------------------------------------------------------------
// h2v2.cpp
// Upsampling/colorspace conversion (H2V2, YCbCr)
// Last updated: June 27, 2000 v0.91
// Copyright (C) 1994-2000 Rich Geldreich
// richgel@voicenet.com
//
// This is a popular case, so it's worth separating out and optimizing a bit.
// If you compile this module with the Intel Compiler, the MMX version will
// automatically be compiled in.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//------------------------------------------------------------------------------
#include "jpegdecoder.h"
#ifdef __ICL
#include "mmintrin.h"
#endif
//------------------------------------------------------------------------------
// YCbCr H2V2 (2x2:1:1, 6 blocks per MCU) to 24-bit RGB
// This case is very popular, so it's important that it's fast.
// If this module is compiled with the Intel Compiler the faster
// MMX specific version will also be available.
// FIXME: Create all-asm version, so Intel Compiler isn't needed.
// Converts two adjacent rows of every MCU in the current MCU row from
// YCbCr to packed 3-byte pixels: the first row is written to scan_line_0,
// the row below it to scan_line_1. One Cb/Cr pair is applied to a 2x2 group
// of luma samples (two horizontally adjacent samples on each of the rows).
// Takes no parameters and returns nothing; all inputs and outputs are
// decoder members.
void jpeg_decoder::H2V2Convert(void)
{
  // Index, within the MCU row, of the first of the two rows to convert.
  const int row = max_mcu_y_size - mcu_lines_left;
  uchar *d0 = scan_line_0;
  uchar *d1 = scan_line_1;
  int16 *y;
  int16 *c;

  // Rows 0-7 are read starting at block_seg[0], later rows starting at
  // block_seg[2]; each block row holds 8 samples.
  if (row < 8)
    y = block_seg[0] + row * 8;
  else
    y = block_seg[2] + (row & 7) * 8;

  // Chroma has half the vertical resolution, so two luma rows share one
  // chroma row. Cb is read at c[0] and Cr 64 samples further on (c[64]).
  c = block_seg[4] + (row >> 1) * 8;

  // NOTE(review): the pointer stepping below (y += 64, c[64], += 64*6) assumes
  // the six 64-sample blocks of an MCU, and successive MCUs, are laid out
  // contiguously behind block_seg[] -- confirm at the allocation site.

#ifdef __ICL
  if (use_mmx)
  {
    // I make no claims about this code being the fastest
    // possible way to implement this case.
    // It should be possible to use MMX multiplies instead
    // of table lookups to perform the YCbCr->RGB conversion.
    // Reorder the calculations to more effectively take advantage of MMX?

    for (int i = max_mcus_per_row; i > 0; i--)
    {
      // l selects the left (0) or right (1) luma block of the pair.
      for (int l = 0; l < 2; l++)
      {
        // Each iteration emits two pixels on each of the two output rows.
        for (int j = 0; j < 8; j += 2)
        {
          int cb = c[0];
          int cr = c[64];

          // Per-channel offsets added to the luma value. The green offset is
          // the sum of two table entries shifted right by 16.
          int rc = crr[cr];
          int gc = ((crg[cr] + cbg[cb]) >> 16);
          int bc = cbb[cb];

          __m64 m0,m1,m2,m3;

          // Build m0 = four 16-bit lanes { rc, gc, bc, (don't care) }.
          m0 = _m_from_int(bc);
          m0 = _m_psllqi(m0, 32);
          // rc must be masked to its low 16 bits: if it is negative, its
          // sign extension would otherwise overwrite the gc lane.
          m1 = _m_from_int((rc & 0xFFFF) | (gc << 16));
          m0 = _m_por(m0, m1);

          // First output row: load two luma samples, broadcast the first
          // one to all four lanes, add the offsets and pack with unsigned
          // saturation to bytes.
          m2 = _m_from_int(*(int *)(y + j));
          m3 = _m_punpcklwd(m2, m2);
          m3 = _m_punpckldq(m3, m3);

          m3 = _m_paddsw(m3, m0);
          m3 = _m_packuswb(m3, m3);

          // NOTE(review): every store below writes 4 bytes for a 3-byte
          // pixel. The extra byte is overwritten by the following pixel,
          // except after the last pixel of a row, where it lands one byte
          // past the pixel data -- presumably the scan-line buffers have
          // slack for this; confirm.
          *(int *)d0 = _m_to_int(m3);

          // Same again for the second luma sample of the pair.
          m3 = _m_punpcklwd(m2, m2);
          m3 = _m_punpckhdq(m3, m3);

          m3 = _m_paddsw(m3, m0);
          m3 = _m_packuswb(m3, m3);

          *(int *)(d0+3) = _m_to_int(m3);

          // Second output row: the luma samples 8 entries (one block row)
          // further on, using the same chroma offsets.
          m2 = _m_from_int(*(int *)(y + 8 + j));
          m3 = _m_punpcklwd(m2, m2);
          m3 = _m_punpckldq(m3, m3);

          m3 = _m_paddsw(m3, m0);
          m3 = _m_packuswb(m3, m3);

          *(int *)d1 = _m_to_int(m3);

          m3 = _m_punpcklwd(m2, m2);
          m3 = _m_punpckhdq(m3, m3);

          m3 = _m_paddsw(m3, m0);
          m3 = _m_packuswb(m3, m3);

          *(int *)(d1+3) = _m_to_int(m3);

          d0 += 6;
          d1 += 6;

          c++;
        }
        // Step to the same row of the next luma block.
        y += 64;
      }

      // Two luma blocks (y) and 8 chroma samples (c) were consumed; skip
      // the remainder of the 6 * 64 samples to reach the next MCU.
      y += 64*6 - 64*2;
      c += 64*6 - 8;
    }

    // Leave MMX state so that later floating-point code works correctly.
    _m_empty();
    return;
  }
#endif

  // Portable path.
  for (int i = max_mcus_per_row; i > 0; i--)
  {
    // l selects the left (0) or right (1) luma block of the pair.
    for (int l = 0; l < 2; l++)
    {
      // Each iteration emits two pixels on each of the two output rows.
      for (int j = 0; j < 8; j += 2)
      {
        const int cb = c[0];
        const int cr = c[64];

        // Per-channel offsets added to the luma value. The green offset is
        // the sum of two table entries shifted right by 16.
        const int rc = crr[cr];
        const int gc = ((crg[cr] + cbg[cb]) >> 16);
        const int bc = cbb[cb];

        // The luma samples are read one at a time. Fetching a pair through
        // an int pointer would break the aliasing rules, could be
        // misaligned, and would swap the two pixels on big-endian targets.
        int yy = y[j];
        d0[0] = clamp(yy+rc);
        d0[1] = clamp(yy+gc);
        d0[2] = clamp(yy+bc);

        yy = y[j+1];
        d0[3] = clamp(yy+rc);
        d0[4] = clamp(yy+gc);
        d0[5] = clamp(yy+bc);

        // Second output row: the luma samples 8 entries (one block row)
        // further on, using the same chroma offsets.
        yy = y[8+j];
        d1[0] = clamp(yy+rc);
        d1[1] = clamp(yy+gc);
        d1[2] = clamp(yy+bc);

        yy = y[8+j+1];
        d1[3] = clamp(yy+rc);
        d1[4] = clamp(yy+gc);
        d1[5] = clamp(yy+bc);

        d0 += 6;
        d1 += 6;

        c++;
      }
      // Step to the same row of the next luma block.
      y += 64;
    }

    // Two luma blocks (y) and 8 chroma samples (c) were consumed; skip the
    // remainder of the 6 * 64 samples to reach the next MCU.
    y += 64*6 - 64*2;
    c += 64*6 - 8;
  }
}

