(view as text)
diff --git a/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp b/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp
index f653dfb..f4ba979 100644
--- a/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp
+++ b/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp
@@ -44,6 +44,8 @@ static GLuint s_encode_VBO = 0;
 static GLuint s_encode_VAO = 0;
 static TargetRectangle s_cached_sourceRc;
 
+static GLuint s_PBO = 0; // for readback with different strides
+
 static const char *VProgram =
 	"ATTRIN vec2 rawpos;\n"
 	"ATTRIN vec2 tex0;\n"
@@ -185,6 +187,8 @@ void Init()
 	glBindTexture(GL_TEXTURE_2D, s_dstTexture);
 	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
 	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, renderBufferWidth, renderBufferHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+	
+	glGenBuffers(1, &s_PBO);
 
 	CreatePrograms();
 }
@@ -196,6 +200,7 @@ void Shutdown()
 	glDeleteFramebuffers(1, &s_texConvFrameBuffer);
 	glDeleteBuffers(1, &s_encode_VBO );
 	glDeleteVertexArrays(1, &s_encode_VAO );
+	glDeleteBuffers(1, &s_PBO);
 
 	s_rgbToYuyvProgram.Destroy();
 	s_yuyvToRgbProgram.Destroy();
@@ -206,6 +211,7 @@ void Shutdown()
 	s_srcTexture = 0;
 	s_dstTexture = 0;
 	s_texConvFrameBuffer = 0;
+	s_PBO = 0;
 }
 
 void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc,
@@ -267,25 +273,38 @@ void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc,
 	// TODO: make this less slow.
 
 	int writeStride = bpmem.copyMipMapStrideChannels * 32;
+	int readHeight = readStride / dstWidth / 4; // 4 bytes per pixel
+	int readLoops = dstHeight / readHeight;
 
-	if (writeStride != readStride && toTexture)
+	if (writeStride != readStride && readLoops > 1 && toTexture)
 	{
 		// writing to a texture of a different size
-
-		int readHeight = readStride / dstWidth;
-		readHeight /= 4; // 4 bytes per pixel
-
-		int readStart = 0;
-		int readLoops = dstHeight / readHeight;
+		// also copy more than one block line, so the different strides matter
+		
+		// Copy into one PBO first, map this buffer, and then memcpy into GC memory.
+		// This way we only have one VRAM->RAM transfer, but maybe a bigger
+		// CPU overhead because of the PBO.
+		
+		glBindBuffer(GL_PIXEL_PACK_BUFFER, s_PBO);
+		glBufferData(GL_PIXEL_PACK_BUFFER, dstWidth*dstHeight*4, NULL, GL_STREAM_READ);
+		glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, 0);
+		u8* pbo = (u8*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
+
+		//int readStart = 0;
 		for (int i = 0; i < readLoops; i++)
 		{
-			glReadPixels(0, readStart, (GLsizei)dstWidth, (GLsizei)readHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr);
-			readStart += readHeight;
+			memcpy(destAddr, pbo, readStride);
+			pbo += readStride;
 			destAddr += writeStride;
 		}
+
+		glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
+		glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
 	}
 	else
+	{
 		glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr);
+	}
 
 	GL_REPORT_ERRORD();