http://bugs.gentoo.org/306661

backport shader based upscalers from svn trunk

--- language/English/strings.xml
+++ language/English/strings.xml
@@ -1554,16 +1554,17 @@
   <string id="16304">Lanczos2</string>
   <string id="16305">Lanczos3</string>
   <string id="16306">Sinc8</string>
-
   <string id="16307">Bicubic (software)</string>
   <string id="16308">Lanczos (software)</string>
   <string id="16309">Sinc (software)</string>
-
   <string id="16310">(VDPAU)Temporal</string>
   <string id="16311">(VDPAU)Temporal/Spatial</string>
   <string id="16312">(VDPAU)Noise Reduction</string>
   <string id="16313">(VDPAU)Sharpness</string>
   <string id="16314">Inverse Telecine</string>
+  <string id="16315">Lanczos3 optimized</string>
+  <string id="16316">Auto</string>
+
   <string id="17500">Display sleep timeout</string>
 
   <string id="19000">Switch to channel</string>
--- system/shaders/convolution-6x6.glsl
+++ system/shaders/convolution-6x6.glsl
@@ -0,0 +1,69 @@
+uniform sampler2D img;
+uniform float     stepx;
+uniform float     stepy;
+
+#if (HAS_FLOAT_TEXTURE)
+uniform sampler1D kernelTex;
+
+vec3 weight(float pos)
+{
+  return texture1D(kernelTex, pos).rgb;
+}
+#else
+uniform sampler2D kernelTex;
+
+vec3 weight(float pos)
+{
+  //row 0 contains the high byte, row 1 contains the low byte
+  return ((texture2D(kernelTex, vec2(pos, 0.0)) * 256.0 + texture2D(kernelTex, vec2(pos, 1.0)))).rgb / 128.5 - 1.0;
+}
+#endif
+
+vec3 pixel(float xpos, float ypos)
+{
+  return texture2D(img, vec2(xpos, ypos)).rgb;
+}
+
+vec3 line (float ypos, vec3 xpos1, vec3 xpos2, vec3 linetaps1, vec3 linetaps2)
+{
+  vec3  pixels;
+
+  pixels  = pixel(xpos1.r, ypos) * linetaps1.r;
+  pixels += pixel(xpos1.g, ypos) * linetaps2.r;
+  pixels += pixel(xpos1.b, ypos) * linetaps1.g;
+  pixels += pixel(xpos2.r, ypos) * linetaps2.g;
+  pixels += pixel(xpos2.g, ypos) * linetaps1.b;
+  pixels += pixel(xpos2.b, ypos) * linetaps2.b;
+
+  return pixels;
+}
+
+void main()
+{
+  float xf = fract(gl_TexCoord[0].x / stepx);
+  float yf = fract(gl_TexCoord[0].y / stepy);
+
+  vec3 linetaps1   = weight((1.0 - xf) / 2.0);
+  vec3 linetaps2   = weight((1.0 - xf) / 2.0 + 0.5);
+  vec3 columntaps1 = weight((1.0 - yf) / 2.0);
+  vec3 columntaps2 = weight((1.0 - yf) / 2.0 + 0.5);
+
+  vec3 xpos1 = vec3(
+      (-1.5 - xf) * stepx + gl_TexCoord[0].x,
+      (-0.5 - xf) * stepx + gl_TexCoord[0].x,
+      ( 0.5 - xf) * stepx + gl_TexCoord[0].x);
+  vec3 xpos2 = vec3(
+      ( 1.5 - xf) * stepx + gl_TexCoord[0].x,
+      ( 2.5 - xf) * stepx + gl_TexCoord[0].x,
+      ( 3.5 - xf) * stepx + gl_TexCoord[0].x);
+
+  gl_FragColor.rgb  = line((-1.5 - yf) * stepy + gl_TexCoord[0].y, xpos1, xpos2, linetaps1, linetaps2) * columntaps1.r;
+  gl_FragColor.rgb += line((-0.5 - yf) * stepy + gl_TexCoord[0].y, xpos1, xpos2, linetaps1, linetaps2) * columntaps2.r;
+  gl_FragColor.rgb += line(( 0.5 - yf) * stepy + gl_TexCoord[0].y, xpos1, xpos2, linetaps1, linetaps2) * columntaps1.g;
+  gl_FragColor.rgb += line(( 1.5 - yf) * stepy + gl_TexCoord[0].y, xpos1, xpos2, linetaps1, linetaps2) * columntaps2.g;
+  gl_FragColor.rgb += line(( 2.5 - yf) * stepy + gl_TexCoord[0].y, xpos1, xpos2, linetaps1, linetaps2) * columntaps1.b;
+  gl_FragColor.rgb += line(( 3.5 - yf) * stepy + gl_TexCoord[0].y, xpos1, xpos2, linetaps1, linetaps2) * columntaps2.b;
+
+  gl_FragColor.a = gl_Color.a;
+}
+
--- system/shaders/bicubic.glsl
+++ system/shaders/bicubic.glsl
@@ -0,0 +1,47 @@
+uniform sampler2D img;
+uniform float stepx;
+uniform float stepy;
+uniform sampler2D kernelTex;
+
+vec4 cubicFilter(float xValue, vec4 c0, vec4 c1, vec4 c2, vec4 c3)
+{
+  vec4 h = texture2D(kernelTex, vec2(xValue, 0.5));
+  vec4 r = c0 * h.r;
+  r += c1 * h.g;
+  r += c2 * h.b;
+  r += c3 * h.a;
+  return r;
+}
+
+void main()
+{
+  vec2 f = vec2(gl_TexCoord[0].x / stepx , gl_TexCoord[0].y / stepy);
+  f = fract(f);
+  vec4 t0 = cubicFilter(f.x,
+  texture2D(img, gl_TexCoord[0].xy + vec2(-stepx,    -stepy)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(0.0,       -stepy)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(stepx,     -stepy)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(2.0*stepx, -stepy)));
+
+  vec4 t1 = cubicFilter(f.x,
+  texture2D(img, gl_TexCoord[0].xy + vec2(-stepx,    0.0)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(0.0,       0.0)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(stepx,     0.0)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(2.0*stepx, 0.0)));
+
+  vec4 t2 = cubicFilter(f.x,
+  texture2D(img, gl_TexCoord[0].xy + vec2(-stepx,    stepy)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(0.0,       stepy)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(stepx,     stepy)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(2.0*stepx, stepy)));
+
+  vec4 t3 = cubicFilter(f.x,
+  texture2D(img, gl_TexCoord[0].xy + vec2(-stepx,    2.0*stepy)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(0,         2.0*stepy)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(stepx,     2.0*stepy)),
+  texture2D(img, gl_TexCoord[0].xy + vec2(2.0*stepx, 2.0*stepy)));
+
+  gl_FragColor = cubicFilter(f.y, t0, t1, t2, t3);   
+  gl_FragColor.a = gl_Color.a;
+}
+
--- system/shaders/convolution-4x4.glsl
+++ system/shaders/convolution-4x4.glsl
@@ -0,0 +1,60 @@
+uniform sampler2D img;
+uniform float     stepx;
+uniform float     stepy;
+
+#if (HAS_FLOAT_TEXTURE)
+uniform sampler1D kernelTex;
+
+vec4 weight(float pos)
+{
+  return texture1D(kernelTex, pos);
+}
+#else
+uniform sampler2D kernelTex;
+
+vec4 weight(float pos)
+{
+  //row 0 contains the high byte, row 1 contains the low byte
+  return (texture2D(kernelTex, vec2(pos, 0.0)) * 256.0 + texture2D(kernelTex, vec2(pos, 1.0))) / 128.5 - 1.0;
+}
+#endif
+
+vec3 pixel(float xpos, float ypos)
+{
+  return texture2D(img, vec2(xpos, ypos)).rgb;
+}
+
+vec3 line (float ypos, vec4 xpos, vec4 linetaps)
+{
+  vec3  pixels;
+
+  pixels  = pixel(xpos.r, ypos) * linetaps.r;
+  pixels += pixel(xpos.g, ypos) * linetaps.g;
+  pixels += pixel(xpos.b, ypos) * linetaps.b;
+  pixels += pixel(xpos.a, ypos) * linetaps.a;
+
+  return pixels;
+}
+
+void main()
+{
+  float xf = fract(gl_TexCoord[0].x / stepx);
+  float yf = fract(gl_TexCoord[0].y / stepy);
+
+  vec4 linetaps   = weight(1.0 - xf);
+  vec4 columntaps = weight(1.0 - yf);
+
+  vec4 xpos = vec4(
+      (-0.5 - xf) * stepx + gl_TexCoord[0].x,
+      ( 0.5 - xf) * stepx + gl_TexCoord[0].x,
+      ( 1.5 - xf) * stepx + gl_TexCoord[0].x,
+      ( 2.5 - xf) * stepx + gl_TexCoord[0].x);
+
+  gl_FragColor.rgb  = line((-0.5 - yf) * stepy + gl_TexCoord[0].y, xpos, linetaps) * columntaps.r;
+  gl_FragColor.rgb += line(( 0.5 - yf) * stepy + gl_TexCoord[0].y, xpos, linetaps) * columntaps.g;
+  gl_FragColor.rgb += line(( 1.5 - yf) * stepy + gl_TexCoord[0].y, xpos, linetaps) * columntaps.b;
+  gl_FragColor.rgb += line(( 2.5 - yf) * stepy + gl_TexCoord[0].y, xpos, linetaps) * columntaps.a;
+
+  gl_FragColor.a = gl_Color.a;
+}
+
--- xbmc/settings/VideoSettings.h
+++ xbmc/settings/VideoSettings.h
@@ -51,9 +51,10 @@
 {
   VS_SCALINGMETHOD_NEAREST=0,
   VS_SCALINGMETHOD_LINEAR,
-  
+
   VS_SCALINGMETHOD_CUBIC,
   VS_SCALINGMETHOD_LANCZOS2,
+  VS_SCALINGMETHOD_LANCZOS3_FAST,
   VS_SCALINGMETHOD_LANCZOS3,
   VS_SCALINGMETHOD_SINC8,
   VS_SCALINGMETHOD_NEDI,
@@ -61,7 +62,9 @@
   VS_SCALINGMETHOD_BICUBIC_SOFTWARE,
   VS_SCALINGMETHOD_LANCZOS_SOFTWARE,
   VS_SCALINGMETHOD_SINC_SOFTWARE,
-  VS_SCALINGMETHOD_VDPAU_HARDWARE
+  VS_SCALINGMETHOD_VDPAU_HARDWARE,
+
+  VS_SCALINGMETHOD_AUTO
 };
 
 class CVideoSettings
--- xbmc/cores/VideoRenderers/VideoShaders/VideoFilterShader.cpp
+++ xbmc/cores/VideoRenderers/VideoShaders/VideoFilterShader.cpp
@@ -21,6 +21,7 @@
 #include "system.h"
 #include "VideoFilterShader.h"
 #include "utils/log.h"
+#include "ConvolutionKernels.h"
 
 #include <string>
 #include <math.h>
@@ -63,60 +64,13 @@
 
 BicubicFilterShader::BicubicFilterShader(float B, float C)
 {
-  string shaderf = 
-    "uniform sampler2D img;"
-    "uniform float stepx;"
-    "uniform float stepy;"
-    "uniform sampler2D kernelTex;"
-    
-    "vec4 cubicFilter(float xValue, vec4 c0, vec4 c1, vec4 c2, vec4 c3)"
-    "{"
-    " vec4 h = texture2D(kernelTex, vec2(xValue, 0.5));"
-    " vec4 r = c0 * h.r;"
-    " r += c1 * h.g;"
-    " r += c2 * h.b;"
-    " r += c3 * h.a;"
-    " return r;"
-    "}"
-    ""
-    "void main()"
-    "{"
-    "vec2 f = vec2(gl_TexCoord[0].x / stepx , gl_TexCoord[0].y / stepy);"
-    "f = fract(f);"
-    "vec4 t0 = cubicFilter(f.x,"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(-stepx, -stepy)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(0.0, -stepy)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(stepx, -stepy)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(2.0*stepx, -stepy)));"
-    ""
-    "vec4 t1 = cubicFilter(f.x,"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(-stepx, 0.0)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(0.0, 0.0)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(stepx, 0.0)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(2.0*stepx, 0.0)));"
-    ""
-    "vec4 t2 = cubicFilter(f.x,"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(-stepx, stepy)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(0.0, stepy)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(stepx, stepy)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(2.0*stepx, stepy)));"
-    ""
-    "vec4 t3 = cubicFilter(f.x,"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(-stepx, 2.0*stepy)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(0, 2.0*stepy)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(stepx, 2.0*stepy)),"
-    "texture2D(img, gl_TexCoord[0].xy + vec2(2.0*stepx, 2.0*stepy)));"
-    
-    "gl_FragColor = cubicFilter(f.y, t0, t1, t2, t3) ;"    
-    "gl_FragColor.a = gl_Color.a;"
-    "}";
-  PixelShader()->SetSource(shaderf);
+  PixelShader()->LoadSource("bicubic.glsl");
   m_kernelTex1 = 0;
   m_B = B;
   m_C = C;
-  if (B<=0)
+  if (B<0)
     m_B=1.0f/3.0f;
-  if (C<=0)
+  if (C<0)
     m_C=1.0f/3.0f;
 }
 
@@ -209,8 +163,8 @@
   glBindTexture(GL_TEXTURE_2D, m_kernelTex1);
   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
   glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, size, 1, 0, GL_RGBA, GL_FLOAT, img);
 
   glActiveTexture(GL_TEXTURE0);
@@ -254,4 +208,110 @@
   return val;
 }
 
+ConvolutionFilterShader::ConvolutionFilterShader(ESCALINGMETHOD method)
+{
+  m_method = method;
+  m_kernelTex1 = 0;
+
+  string shadername;
+  string defines;
+
+  if (m_method == VS_SCALINGMETHOD_CUBIC ||
+      m_method == VS_SCALINGMETHOD_LANCZOS2 ||
+      m_method == VS_SCALINGMETHOD_LANCZOS3_FAST)
+    shadername = "convolution-4x4.glsl";
+  else if (m_method == VS_SCALINGMETHOD_LANCZOS3)
+    shadername = "convolution-6x6.glsl";
+
+  m_floattex = glewIsSupported("GL_ARB_texture_float");
+
+  if (m_floattex)
+    defines = "#define HAS_FLOAT_TEXTURE 1\n";
+  else
+    defines = "#define HAS_FLOAT_TEXTURE 0\n";
+
+  CLog::Log(LOGDEBUG, "GL: ConvolutionFilterShader: using %s defines: %s", shadername.c_str(), defines.c_str());
+  PixelShader()->LoadSource(shadername, defines);
+}
+
+void ConvolutionFilterShader::OnCompiledAndLinked()
+{
+  // obtain shader attribute handles on successfull compilation
+  m_hSourceTex = glGetUniformLocation(ProgramHandle(), "img");
+  m_hStepX     = glGetUniformLocation(ProgramHandle(), "stepx");
+  m_hStepY     = glGetUniformLocation(ProgramHandle(), "stepy");
+  m_hKernTex   = glGetUniformLocation(ProgramHandle(), "kernelTex");
+
+  CConvolutionKernel kernel(m_method, 256);
+
+  if (m_kernelTex1)
+  {
+    glDeleteTextures(1, &m_kernelTex1);
+    m_kernelTex1 = 0;
+  }
+
+  glGenTextures(1, &m_kernelTex1);
+
+  if ((m_kernelTex1<=0))
+  {
+    CLog::Log(LOGERROR, "GL: ConvolutionFilterShader: Error creating kernel texture");
+    return;
+  }
+
+  glActiveTexture(GL_TEXTURE2);
+
+  //if float textures are supported, we can load the kernel as a 1d float texture
+  //if not, we load it as a 2d texture with 2 rows, where row 0 contains the high byte
+  //and row 1 contains the low byte, which can be converted in the shader
+  if (m_floattex)
+  {
+    glBindTexture(GL_TEXTURE_1D, m_kernelTex1);
+    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    glTexImage1D(GL_TEXTURE_1D, 0, GL_RGBA16F_ARB, kernel.GetSize(), 0, GL_RGBA, GL_FLOAT, kernel.GetFloatPixels());
+  }
+  else
+  {
+    glBindTexture(GL_TEXTURE_2D, m_kernelTex1);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, kernel.GetSize(), 2, 0, GL_RGBA, GL_UNSIGNED_BYTE, kernel.GetIntFractPixels());
+  }
+
+  glActiveTexture(GL_TEXTURE0);
+
+  VerifyGLState();
+}
+
+bool ConvolutionFilterShader::OnEnabled()
+{
+  // set shader attributes once enabled
+  glActiveTexture(GL_TEXTURE2);
+
+  if (m_floattex)
+    glBindTexture(GL_TEXTURE_1D, m_kernelTex1);
+  else
+    glBindTexture(GL_TEXTURE_2D, m_kernelTex1);
+
+  glActiveTexture(GL_TEXTURE0);
+  glUniform1i(m_hSourceTex, m_sourceTexUnit);
+  glUniform1i(m_hKernTex, 2);
+  glUniform1f(m_hStepX, m_stepX);
+  glUniform1f(m_hStepY, m_stepY);
+  VerifyGLState();
+  return true;
+}
+
+void ConvolutionFilterShader::Free()
+{
+  if (m_kernelTex1)
+    glDeleteTextures(1, &m_kernelTex1);
+  m_kernelTex1 = 0;
+  BaseVideoFilterShader::Free();
+}
+
 #endif
--- xbmc/cores/VideoRenderers/VideoShaders/ConvolutionKernels.cpp
+++ xbmc/cores/VideoRenderers/VideoShaders/ConvolutionKernels.cpp
@@ -0,0 +1,226 @@
+/*
+ *      Copyright (C) 2005-2008 Team XBMC
+ *      http://www.xbmc.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with XBMC; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+#ifdef _WIN32
+  #define _USE_MATH_DEFINES
+#endif
+
+#include "ConvolutionKernels.h"
+#include "MathUtils.h"
+
+#define SINC(x) (sin(M_PI * (x)) / (M_PI * (x)))
+
+CConvolutionKernel::CConvolutionKernel(ESCALINGMETHOD method, int size)
+{
+  m_size = size;
+  m_floatpixels = new float[m_size * 4];
+
+  if (method == VS_SCALINGMETHOD_LANCZOS2)
+    Lanczos2();
+  else if (method == VS_SCALINGMETHOD_LANCZOS3_FAST)
+    Lanczos3Fast();
+  else if (method == VS_SCALINGMETHOD_LANCZOS3)
+    Lanczos3();
+  else if (method == VS_SCALINGMETHOD_CUBIC) 
+    Bicubic(1.0 / 3.0, 1.0 / 3.0);
+
+  ToIntFract();
+}
+
+CConvolutionKernel::~CConvolutionKernel()
+{
+  delete [] m_floatpixels;
+  delete [] m_intfractpixels;
+}
+
+//generate a lanczos2 kernel which can be loaded with RGBA format
+//each value of RGBA has one tap, so a shader can load 4 taps with a single pixel lookup
+void CConvolutionKernel::Lanczos2()
+{
+  for (int i = 0; i < m_size; i++)
+  {
+    double x = (double)i / (double)m_size;
+
+    //generate taps
+    for (int j = 0; j < 4; j++)
+      m_floatpixels[i * 4 + j] = (float)LanczosWeight(x + (double)(j - 2), 2.0);
+
+    //any collection of 4 taps added together needs to be exactly 1.0
+    //for lanczos this is not always the case, so we take each collection of 4 taps
+    //and divide those taps by the sum of the taps
+    float weight = 0.0;
+    for (int j = 0; j < 4; j++)
+      weight += m_floatpixels[i * 4 + j];
+
+    for (int j = 0; j < 4; j++)
+      m_floatpixels[i * 4 + j] /= weight;
+  }
+}
+
+//generate a lanczos3 kernel which can be loaded with RGBA format
+//each value of RGBA has one tap, so a shader can load 4 taps with a single pixel lookup
+//the two outer lobes of the lanczos3 kernel are added to the two lobes one step to the middle
+//this basically looks the same as lanczos3, but the kernel only has 4 taps,
+//so it can use the 4x4 convolution shader which is twice as fast as the 6x6 one
+void CConvolutionKernel::Lanczos3Fast()
+{
+  for (int i = 0; i < m_size; i++)
+  {
+    double a = 3.0;
+    double x = (double)i / (double)m_size;
+
+    //generate taps
+    m_floatpixels[i * 4 + 0] = (float)(LanczosWeight(x - 2.0, a) + LanczosWeight(x - 3.0, a));
+    m_floatpixels[i * 4 + 1] = (float) LanczosWeight(x - 1.0, a);
+    m_floatpixels[i * 4 + 2] = (float) LanczosWeight(x      , a);
+    m_floatpixels[i * 4 + 3] = (float)(LanczosWeight(x + 1.0, a) + LanczosWeight(x + 2.0, a));
+
+    //any collection of 4 taps added together needs to be exactly 1.0
+    //for lanczos this is not always the case, so we take each collection of 4 taps
+    //and divide those taps by the sum of the taps
+    float weight = 0.0;
+    for (int j = 0; j < 4; j++)
+      weight += m_floatpixels[i * 4 + j];
+
+    for (int j = 0; j < 4; j++)
+      m_floatpixels[i * 4 + j] /= weight;
+  }
+}
+
+//generate a lanczos3 kernel which can be loaded with RGBA format
+//each value of RGB has one tap, so a shader can load 3 taps with a single pixel lookup
+void CConvolutionKernel::Lanczos3()
+{
+  for (int i = 0; i < m_size; i++)
+  {
+    double x = (double)i / (double)m_size;
+
+    //generate taps
+    for (int j = 0; j < 3; j++)
+      m_floatpixels[i * 4 + j] = (float)LanczosWeight(x * 2.0 + (double)(j * 2 - 3), 3.0);
+
+    m_floatpixels[i * 4 + 3] = 0.0;
+  }
+
+  //any collection of 6 taps added together needs to be exactly 1.0
+  //for lanczos this is not always the case, so we take each collection of 6 taps
+  //and divide those taps by the sum of the taps
+  for (int i = 0; i < m_size / 2; i++)
+  {
+    float weight = 0.0;
+    for (int j = 0; j < 3; j++)
+    {
+      weight += m_floatpixels[i * 4 + j];
+      weight += m_floatpixels[(i + m_size / 2) * 4 + j];
+    }
+    for (int j = 0; j < 3; j++)
+    {
+      m_floatpixels[i * 4 + j] /= weight;
+      m_floatpixels[(i + m_size / 2) * 4 + j] /= weight;
+    }
+  }
+}
+
+//generate a bicubic kernel which can be loaded with RGBA format
+//each value of RGBA has one tap, so a shader can load 4 taps with a single pixel lookup
+void CConvolutionKernel::Bicubic(double B, double C)
+{
+  for (int i = 0; i < m_size; i++)
+  {
+    double x = (double)i / (double)m_size;
+
+    //generate taps
+    for (int j = 0; j < 4; j++)
+      m_floatpixels[i * 4 + j] = (float)BicubicWeight(x + (double)(j - 2), B, C);
+  }
+}
+
+double CConvolutionKernel::LanczosWeight(double x, double radius)
+{
+  double ax = fabs(x);
+
+  if (ax == 0.0)
+    return 1.0;
+  else if (ax < radius)
+    return SINC(ax) * SINC(ax / radius);
+  else
+    return 0.0;
+}
+
+double CConvolutionKernel::BicubicWeight(double x, double B, double C)
+{
+  double ax = fabs(x);
+
+  if (ax<1.0)
+  {
+    return ((12 - 9*B - 6*C) * ax * ax * ax +
+            (-18 + 12*B + 6*C) * ax * ax +
+            (6 - 2*B))/6;
+  }
+  else if (ax<2.0)
+  {
+    return ((-B - 6*C) * ax * ax * ax + 
+            (6*B + 30*C) * ax * ax + (-12*B - 48*C) * 
+             ax + (8*B + 24*C)) / 6;
+  }
+  else
+  {
+    return 0.0;
+  }
+}
+
+
+//convert float to high byte/low byte, so the kernel can be loaded into an 8 bit texture
+//with height 2 and converted back to real float in the shader
+//it only works when the kernel texture uses nearest neighbour, but there's almost no difference
+//between that and linear interpolation
+void CConvolutionKernel::ToIntFract()
+{
+  m_intfractpixels = new uint8_t[m_size * 8];
+
+  for (int i = 0; i < m_size * 4; i++)
+  {
+    int value = MathUtils::round_int((m_floatpixels[i] + 1.0) / 2.0 * 65535.0);
+    if (value < 0)
+      value = 0;
+    else if (value > 65535)
+      value = 65535;
+    
+    int integer = value / 256;
+    int fract   = value % 256;
+
+    m_intfractpixels[i] = (uint8_t)integer;
+    m_intfractpixels[i + m_size * 4] = (uint8_t)fract;
+  }
+
+#if 0
+  for (int i = 0; i < 4; i++)
+  {
+    for (int j = 0; j < m_size; j++)
+    {
+      printf("%i %f %f\n",
+          i * m_size + j,
+          ((double)m_intfractpixels[j * 4 + i] + (double)m_intfractpixels[j * 4 + i + m_size * 4] / 255.0) / 255.0 * 2.0 - 1.0,
+          m_floatpixels[j * 4 + i]);
+    }
+  }
+#endif
+}
+
--- xbmc/cores/VideoRenderers/VideoShaders/VideoFilterShader.h
+++ xbmc/cores/VideoRenderers/VideoShaders/VideoFilterShader.h
@@ -4,6 +4,7 @@
 #ifdef HAS_GL
 
 #include "../../../../guilib/Shader.h"
+#include "../../../settings/VideoSettings.h"
 
 using namespace Shaders;
 
@@ -35,7 +36,7 @@
   class BicubicFilterShader : public BaseVideoFilterShader
   {
   public:
-    BicubicFilterShader(float B=0.0f, float C=0.0f);
+    BicubicFilterShader(float B=-1.0f, float C=-1.0f);
     void OnCompiledAndLinked();
     bool OnEnabled();
     void Free();
@@ -55,6 +56,25 @@
     float m_C;
   };
 
+  class ConvolutionFilterShader : public BaseVideoFilterShader
+  {
+  public:
+    ConvolutionFilterShader(ESCALINGMETHOD method);
+    void OnCompiledAndLinked();
+    bool OnEnabled();
+    void Free();
+
+  protected:
+    // kernel textures
+    GLuint m_kernelTex1;
+
+    // shader handles to kernel textures
+    GLint m_hKernTex;
+
+    ESCALINGMETHOD m_method;
+    bool           m_floattex; //if float textures are supported
+  };
+
 } // end namespace
 
 #endif
--- xbmc/cores/VideoRenderers/VideoShaders/ConvolutionKernels.h
+++ xbmc/cores/VideoRenderers/VideoShaders/ConvolutionKernels.h
@@ -0,0 +1,55 @@
+/*
+ *      Copyright (C) 2005-2008 Team XBMC
+ *      http://www.xbmc.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with XBMC; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#ifndef CONVOLUTIONKERNELS
+#define CONVOLUTIONKERNELS
+
+#include "system.h"
+#include "../../../settings/VideoSettings.h"
+
+class CConvolutionKernel
+{
+  public:
+    CConvolutionKernel(ESCALINGMETHOD method, int size);
+    ~CConvolutionKernel();
+
+    int      GetSize()           { return m_size; }
+    float*   GetFloatPixels()    { return m_floatpixels; }
+    uint8_t* GetIntFractPixels() { return m_intfractpixels; }
+
+  private:
+
+    void Lanczos2();
+    void Lanczos3Fast();
+    void Lanczos3();
+    void Bicubic(double B, double C);
+
+    double LanczosWeight(double x, double radius);
+    double BicubicWeight(double x, double B, double C);
+
+    void ToIntFract();
+
+    int      m_size;
+    float*   m_floatpixels;
+    uint8_t* m_intfractpixels;
+};
+
+#endif //CONVOLUTIONKERNELS
--- xbmc/cores/VideoRenderers/VideoShaders/Makefile
+++ xbmc/cores/VideoRenderers/VideoShaders/Makefile
@@ -1,5 +1,5 @@
 INCLUDES=-I. -I.. -I../../ -I../../../ -I../../../linux -I../../../../guilib
-SRCS=YUV2RGBShader.cpp VideoFilterShader.cpp
+SRCS=YUV2RGBShader.cpp VideoFilterShader.cpp ConvolutionKernels.cpp
 
 LIB=VideoShaders.a
 
--- xbmc/cores/VideoRenderers/LinuxRendererGL.cpp
+++ xbmc/cores/VideoRenderers/LinuxRendererGL.cpp
@@ -886,6 +886,19 @@
 
   VerifyGLState();
 
+  if (m_scalingMethod == VS_SCALINGMETHOD_AUTO)
+  {
+    bool scaleSD = (int)m_sourceWidth < m_upscalingWidth && (int)m_sourceHeight < m_upscalingHeight &&
+                   m_sourceHeight < 720 && m_sourceWidth < 1280;
+
+    if (Supports(VS_SCALINGMETHOD_VDPAU_HARDWARE))
+      m_scalingMethod = VS_SCALINGMETHOD_VDPAU_HARDWARE;
+    else if (Supports(VS_SCALINGMETHOD_LANCZOS3_FAST) && scaleSD)
+      m_scalingMethod = VS_SCALINGMETHOD_LANCZOS3_FAST;
+    else
+      m_scalingMethod = VS_SCALINGMETHOD_LINEAR;
+  }
+
   switch (m_scalingMethod)
   {
   case VS_SCALINGMETHOD_NEAREST:
@@ -898,13 +911,10 @@
     m_renderQuality = RQ_SINGLEPASS;
     return;
 
+  case VS_SCALINGMETHOD_LANCZOS2:
+  case VS_SCALINGMETHOD_LANCZOS3_FAST:
+  case VS_SCALINGMETHOD_LANCZOS3:
   case VS_SCALINGMETHOD_CUBIC:
-    if(!glewIsSupported("GL_ARB_texture_float"))
-    {
-      CLog::Log(LOGERROR, "GL: hardware doesn't support GL_ARB_texture_float");
-      break;
-    }
-
     if (!m_fbo.Initialize())
     {
       CLog::Log(LOGERROR, "GL: Error initializing FBO");
@@ -917,7 +927,7 @@
       break;
     }
 
-    m_pVideoFilterShader = new BicubicFilterShader(0.3f, 0.3f);
+    m_pVideoFilterShader = new ConvolutionFilterShader(m_scalingMethod);
     if (!m_pVideoFilterShader->CompileAndLink())
     {
       CLog::Log(LOGERROR, "GL: Error compiling and linking video filter shader");
@@ -928,8 +938,6 @@
     m_renderQuality = RQ_MULTIPASS;
     return;
 
-  case VS_SCALINGMETHOD_LANCZOS2:
-  case VS_SCALINGMETHOD_LANCZOS3:
   case VS_SCALINGMETHOD_SINC8:
   case VS_SCALINGMETHOD_NEDI:
     CLog::Log(LOGERROR, "GL: TODO: This scaler has not yet been implemented");
@@ -1895,16 +1903,19 @@
 bool CLinuxRendererGL::Supports(ESCALINGMETHOD method)
 {
   if(method == VS_SCALINGMETHOD_NEAREST
-  || method == VS_SCALINGMETHOD_LINEAR)
+  || method == VS_SCALINGMETHOD_LINEAR
+  || method == VS_SCALINGMETHOD_AUTO)
     return true;
 
-
-  if(method == VS_SCALINGMETHOD_CUBIC 
-  && glewIsSupported("GL_ARB_texture_float")
-  && glewIsSupported("GL_EXT_framebuffer_object")
-  && m_renderMethod == RENDER_GLSL)
-    return true;
-
+  if(method == VS_SCALINGMETHOD_CUBIC
+  || method == VS_SCALINGMETHOD_LANCZOS2
+  || method == VS_SCALINGMETHOD_LANCZOS3_FAST
+  || method == VS_SCALINGMETHOD_LANCZOS3)
+  {
+    if (glewIsSupported("GL_EXT_framebuffer_object") && (m_renderMethod & RENDER_GLSL))
+      return true;
+  }
+ 
   if (g_advancedSettings.m_videoHighQualityScaling != SOFTWARE_UPSCALING_DISABLED)
   {
     if(method == VS_SCALINGMETHOD_BICUBIC_SOFTWARE
--- xbmc/GUIDialogVideoSettings.cpp
+++ xbmc/GUIDialogVideoSettings.cpp
@@ -103,6 +103,7 @@
     entries.push_back(make_pair(VS_SCALINGMETHOD_LINEAR           , 16302));
     entries.push_back(make_pair(VS_SCALINGMETHOD_CUBIC            , 16303));
     entries.push_back(make_pair(VS_SCALINGMETHOD_LANCZOS2         , 16304));
+    entries.push_back(make_pair(VS_SCALINGMETHOD_LANCZOS3_FAST    , 16315));
     entries.push_back(make_pair(VS_SCALINGMETHOD_LANCZOS3         , 16305));
     entries.push_back(make_pair(VS_SCALINGMETHOD_SINC8            , 16306));
 //    entries.push_back(make_pair(VS_SCALINGMETHOD_NEDI             , ?????));
@@ -110,6 +111,7 @@
     entries.push_back(make_pair(VS_SCALINGMETHOD_LANCZOS_SOFTWARE , 16308));
     entries.push_back(make_pair(VS_SCALINGMETHOD_SINC_SOFTWARE    , 16309));
     entries.push_back(make_pair(VS_SCALINGMETHOD_VDPAU_HARDWARE   , 13120));
+    entries.push_back(make_pair(VS_SCALINGMETHOD_AUTO             , 16316));
 
     /* remove unsupported methods */
     for(vector<pair<int, int> >::iterator it = entries.begin(); it != entries.end();)
--- xbmc/Settings.cpp
+++ xbmc/Settings.cpp
@@ -772,7 +772,7 @@
     GetInteger(pElement, "interlacemethod", interlaceMethod, VS_INTERLACEMETHOD_NONE, VS_INTERLACEMETHOD_NONE, VS_INTERLACEMETHOD_INVERSE_TELECINE);
     m_stSettings.m_defaultVideoSettings.m_InterlaceMethod = (EINTERLACEMETHOD)interlaceMethod;
     int scalingMethod;
-    GetInteger(pElement, "scalingmethod", scalingMethod, VS_SCALINGMETHOD_LINEAR, VS_SCALINGMETHOD_NEAREST, VS_SCALINGMETHOD_CUBIC);
+    GetInteger(pElement, "scalingmethod", scalingMethod, VS_SCALINGMETHOD_LINEAR, VS_SCALINGMETHOD_NEAREST, VS_SCALINGMETHOD_AUTO);
     m_stSettings.m_defaultVideoSettings.m_ScalingMethod = (ESCALINGMETHOD)scalingMethod;
 
     GetInteger(pElement, "viewmode", m_stSettings.m_defaultVideoSettings.m_ViewMode, VIEW_MODE_NORMAL, VIEW_MODE_NORMAL, VIEW_MODE_CUSTOM);