// Grabs the m_rScreen region from hScrDC into the member bitmap hRawBitmap
// (via a short-lived memory DC), draws the mouse on top, and then extracts the
// pixels into pData -- either directly, or through the pOldData staging buffer
// followed by an RGB32 -> I420 conversion when m_bConvertToI420 is set.
//
//   hScrDC   source device context to capture from
//   pData    destination buffer for the finished frame
//   pHeader  bitmap header describing the negotiated output; it is copied, never modified
//   pSample  not used by this function (only referenced by commented-out code)
void CPushPinDesktop::CopyScreenToDataBlock(HDC hScrDC, BYTE *pData, BITMAPINFO *pHeader, IMediaSample *pSample)
{
    int outHeight = getNegotiatedFinalHeight();
    int outWidth  = getNegotiatedFinalWidth();

    ASSERT(!IsRectEmpty(&m_rScreen)); // an empty capture rectangle would be unexpected

    // Memory DC compatible with the source DC; measured by the original author at ~0.02ms.
    HDC hScratchDC = CreateCompatibleDC(hScrDC);

    // Top-left corner of the area to grab (we control these through m_rScreen).
    int srcLeft = m_rScreen.left;
    int srcTop  = m_rScreen.top;

    // Sanity checks: the capture rectangle is expected to match the negotiated output size.
    // (Open question from the original author: can upstream change the size dynamically?)
    ASSERT(m_rScreen.bottom - m_rScreen.top == outHeight);
    ASSERT(m_rScreen.right - m_rScreen.left == outWidth);

    // Render into hRawBitmap by selecting it into the memory DC for the duration of the blit.
    HBITMAP hPreviousBitmap = (HBITMAP) SelectObject(hScratchDC, hRawBitmap);

    doJustBitBltOrScaling(hScratchDC, m_iCaptureConfigWidth, m_iCaptureConfigHeight, outWidth, outHeight, hScrDC, srcLeft, srcTop);

    AddMouse(hScratchDC, &m_rScreen, hScrDC, m_iHwndToTrack);

    // Restore the DC's previous bitmap; SelectObject hands back the bitmap that
    // was selected until now, i.e. the one holding the capture.
    HBITMAP hCapturedBitmap = (HBITMAP) SelectObject(hScratchDC, hPreviousBitmap);

    // Work on a private copy of the header so the caller's copy stays untouched.
    BITMAPINFO frameHeader;
    memcpy(&frameHeader, pHeader, sizeof(BITMAPINFO));

    if (!m_bConvertToI420) {
        // No conversion requested: extract straight into the output buffer.
        doDIBits(hScrDC, hCapturedBitmap, outHeight, pData, &frameHeader);
    } else {
        // Ask for 32-bit uncompressed RGB as the intermediate format.
        frameHeader.bmiHeader.biBitCount = 32;
        frameHeader.bmiHeader.biCompression = BI_RGB;
        // Flip the sign of the height so the I420 result does not come out upside down.
        frameHeader.bmiHeader.biHeight = -frameHeader.bmiHeader.biHeight;
        frameHeader.bmiHeader.biSizeImage = GetBitmapSize(&frameHeader.bmiHeader);

        // Stage the RGB pixels in the temporary buffer first...
        doDIBits(hScrDC, hCapturedBitmap, outHeight, pOldData, &frameHeader);
        // memcpy(/* dest */ pOldData, pData, pSample->GetSize()); // 12.8ms for 1920x1080 desktop
        // TODO smarter conversion/memcpy's here [?] scaling could be combined with rgb32_to_i420,
        // or libswscale could be integrated so callers can request any format (possibly a
        // higher quality i420 conversion too).
        // ...then convert into the "real" buffer (measured at 36.8ms for a 1920x1080 desktop).
        rgb32_to_i420(outWidth, outHeight, (const char *) pOldData, (char *) pData);
    }

    // clean up
    DeleteDC(hScratchDC);
}
// Grabs the m_rScreen region from hScrDC into the member bitmap hRawBitmap
// (via a short-lived memory DC), optionally draws the mouse on top, and then
// extracts the pixels into pData -- either directly, or through the pOldData
// staging buffer followed by an RGB32 -> I420 conversion when m_bConvertToI420
// is set.
//
// When no conversion is requested and the host process is vlc.exe, rows whose
// byte length is not a multiple of 4 are additionally compacted in place so
// that pData holds tightly packed lines (see also the GetMediaType comments).
//
//   hScrDC   source device context to capture from
//   pData    destination buffer for the finished frame
//   pHeader  bitmap header describing the negotiated output; it is copied, never modified
//   pSample  not used by this function (only referenced by commented-out code)
void CPushPinDesktop::CopyScreenToDataBlock(HDC hScrDC, BYTE *pData, BITMAPINFO *pHeader, IMediaSample *pSample)
{
    HDC         hMemDC;         // memory DC compatible with the source DC
    HBITMAP     hOldBitmap;     // bitmap originally selected into hMemDC
    int         nX, nY;         // top-left corner of the rectangle to grab
    int         iFinalStretchHeight = getNegotiatedFinalHeight();
    int         iFinalStretchWidth  = getNegotiatedFinalWidth();

    ASSERT_RAISE(!IsRectEmpty(&m_rScreen)); // that would be unexpected

    // create a memory DC compatible to the screen DC
    hMemDC = CreateCompatibleDC(hScrDC); //  0.02ms Anything else to reuse, this one's pretty fast...?

    // determine points of where to grab from it, though I think we control these with m_rScreen
    nX  = m_rScreen.left;
    nY  = m_rScreen.top;

    // sanity checks--except we don't want it apparently, to allow upstream to dynamically change the size? Can it do that?
    ASSERT_RAISE(m_rScreen.bottom - m_rScreen.top == iFinalStretchHeight);
    ASSERT_RAISE(m_rScreen.right - m_rScreen.left == iFinalStretchWidth);

    // select the capture bitmap into the memory DC so the blit renders into it
    hOldBitmap = (HBITMAP) SelectObject(hMemDC, hRawBitmap);

    doJustBitBltOrScaling(hMemDC, m_iCaptureConfigWidth, m_iCaptureConfigHeight, iFinalStretchWidth, iFinalStretchHeight, hScrDC, nX, nY);

    if(m_bCaptureMouse)
      AddMouse(hMemDC, &m_rScreen, hScrDC, m_iHwndToTrack);

    // select the old bitmap back into the memory DC; SelectObject returns the
    // bitmap that was selected until now, i.e. the one holding the capture
    HBITMAP hRawBitmap2 = (HBITMAP) SelectObject(hMemDC, hOldBitmap);

    // work on a private copy of the header so the caller's copy stays untouched
    BITMAPINFO tweakableHeader;
    memcpy(&tweakableHeader, pHeader, sizeof(BITMAPINFO));

    if(m_bConvertToI420) {
      // request 32-bit uncompressed RGB as the intermediate format
      tweakableHeader.bmiHeader.biBitCount = 32;
      tweakableHeader.bmiHeader.biCompression = BI_RGB;
      tweakableHeader.bmiHeader.biHeight = -tweakableHeader.bmiHeader.biHeight; // prevent upside down conversion from i420...
      tweakableHeader.bmiHeader.biSizeImage = GetBitmapSize(&tweakableHeader.bmiHeader);

      // copy it to a temporary buffer first
      doDIBits(hScrDC, hRawBitmap2, iFinalStretchHeight, pOldData, &tweakableHeader);
      // memcpy(/* dest */ pOldData, pData, pSample->GetSize()); // 12.8ms for 1920x1080 desktop
      // TODO smarter conversion/memcpy's here [?] we could combine scaling with rgb32_to_i420 for instance...
      // or maybe we should integrate with libswscale here so they can request whatever they want LOL. (might be a higher quality i420 conversion...)
      // now convert it to i420 into the "real" buffer
      rgb32_to_i420(iFinalStretchWidth, iFinalStretchHeight, (const char *) pOldData, (char *) pData);// took 36.8ms for 1920x1080 desktop
    } else {
      doDIBits(hScrDC, hRawBitmap2, iFinalStretchHeight, pData, &tweakableHeader);

      // if we're on vlc work around for odd pixel widths and 24 bit...<sigh>, like a width of 134 breaks vlc with 24bit. wow. see also GetMediaType comments
      // Zero-initialised: GetModuleFileName is not guaranteed to NUL-terminate a
      // truncated path, and on failure this leaves an empty string for wcsstr.
      wchar_t buffer[MAX_PATH + 1] = {0};
      GetModuleFileName(NULL, buffer, MAX_PATH);
      if(wcsstr(buffer, L"vlc.exe") != NULL) {
        int bitCount = tweakableHeader.bmiHeader.biBitCount;
        int lineSizeBytes = iFinalStretchWidth * (bitCount / 8); // payload bytes per line
        int remainder = lineSizeBytes % 4;                       // non-zero => lines carry padding at the end
        if(remainder > 0) {
          int padding = 4 - remainder;                           // lines are rounded up to a 4 byte boundary
          int lineSizeTotal = lineSizeBytes + padding;           // padded line length as stored in pData
          // the first line is already where it belongs, so start at 1
          for(int line = 1; line < iFinalStretchHeight; line++) {
            // dst, src, size -- memmove required since these ranges overlap
            memmove(&pData[line*lineSizeBytes], &pData[line*lineSizeTotal], lineSizeBytes);
          }
        }
      }
    }

    // clean up
    DeleteDC(hMemDC);
}