Beispiel #1
0
/**
 * Smoke test: loads an HTML string into an in-memory MSHTML document and
 * reads back the innerHTML of its body.
 *
 * The value that is read back is discarded and every COM failure is swallowed,
 * so this function has no observable result; it exists to exercise the
 * write()/close()/body->innerHTML call sequence.
 *
 * @param szHTML  HTML markup to feed into the document.
 */
void CHtmlParser::testParseHtml(LPCTSTR szHTML)
{
	// Declared outside the try block so it can be released on every path.
	SAFEARRAY* psa = NULL;

	try
	{
		MSHTML::IHTMLDocument2Ptr pDoc;
		// NOTE(review): the identifier is spelled CLSID_HTMlDocument (lower-case
		// 'l'); the Windows SDK constant is CLSID_HTMLDocument - confirm which
		// one this project actually declares.
		HRESULT hr = pDoc.CreateInstance(CLSID_HTMlDocument);
		if (FAILED(hr))
			_com_issue_error(hr);

		// Convert first: if the conversion throws, nothing has been allocated yet.
		bstr_t bsData = szHTML;

		psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
		if (psa == NULL)
			_com_issue_error(E_OUTOFMEMORY);

		// SafeArrayPutElement stores its own copy of the variant, so the array
		// owns its string independently of bsData and is never left locked.
		// (Writing through SafeArrayAccessData without SafeArrayUnaccessData
		// keeps the array locked, which makes SafeArrayDestroy fail.)
		VARIANT varHtml;
		VariantInit(&varHtml);
		varHtml.vt = VT_BSTR;
		varHtml.bstrVal = static_cast<BSTR>(bsData);	// borrowed, still owned by bsData
		LONG lIndex = 0;
		hr = SafeArrayPutElement(psa, &lIndex, &varHtml);
		if (FAILED(hr))
			_com_issue_error(hr);

		hr = pDoc->write(psa);
		if (FAILED(hr))
			_com_issue_error(hr);
		hr = pDoc->close();
		if (FAILED(hr))
			_com_issue_error(hr);

		// Reading the body back is the whole point of the test; the value itself
		// is not used.
		_bstr_t body = pDoc->body->innerHTML;
	}
	catch (const _com_error&)
	{
		// Deliberately ignored: the function reports nothing to its caller.
	}

	if (psa != NULL)
		SafeArrayDestroy(psa);
}
Beispiel #2
0
int main(int argc, char* argv[]) {
	CoInitialize(NULL);

	ofstream dbfile("output.db");
	string sLI;
	string m_strURL;
	HINTERNET hOpen, hFile;

	MSHTML::IHTMLDocument2Ptr pDoc;
	HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, (void**)&pDoc);

	SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
	VARIANT *param;

	hOpen = InternetOpen(L"UN/1.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);

	hFile = InternetOpenUrl(hOpen, L"http://cnproxy.com/proxy1.html", NULL, 0, 0, 0);

	if (hFile) {
		CHAR buffer[100 * 1024];
		DWORD dwRead;

		while (InternetReadFile(hFile, buffer, 1024, &dwRead)) {
			if (dwRead == 0)
				break;

			buffer[dwRead] = 0;

			bstr_t bsData = (LPCTSTR)buffer;
			hr = SafeArrayAccessData(psa, (LPVOID*)&param);
			param->vt = VT_BSTR;
			param->bstrVal = (BSTR)bsData;

			cout << buffer << endl;
			dbfile << buffer << endl;

			hr = pDoc->write(psa);

		} //end while loop

		hr = pDoc->close();
		InternetCloseHandle(hFile);
		SafeArrayDestroy(psa);
	}

	InternetCloseHandle(hOpen);
	dbfile.close();

	CoUninitialize();
	return 1;
}
/**
 * It Downloads all the resources, main function of concern, all the required members variables must 
 * be populated for it to work correctly
 */
void COfflineBrowser::BrowseOffline()
{
	USES_CONVERSION;

	//Delete if this directory exists, it is a disputed call, you can safely comment it out
	DeleteDirectory(m_sDir);
	//Create the directory again
	CreateDirectory(m_sDir.c_str(), NULL);
	//SaveHtml(m_sHtml);
	
	//Create HTML Document which will be hold the html that was have 
	MSHTML::IHTMLDocument2Ptr pDoc;
	HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER, 
								  IID_IHTMLDocument2, (void**)&pDoc);
	if(pDoc == NULL)
		return;

	//Load HTML to Html Document
	SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
	VARIANT *param;
	bstr_t bsData = (LPCTSTR)m_sHtml.c_str();
	hr =  SafeArrayAccessData(psa, (LPVOID*)&param);
	param->vt = VT_BSTR;
	param->bstrVal = (BSTR)bsData;
	
	//write your buffer
	hr = pDoc->write(psa);	
	//closes the document, "applying" your code  
	hr = pDoc->close();	

	//Don't forget to free the SAFEARRAY!
	SafeArrayDestroy(psa);

	//Iterate through all the elements in the document
	MSHTML::IHTMLElementCollectionPtr pCollection = pDoc->all;

	for(long a=0;a<pCollection->length;a++)
	{
		std::string sValue;
		IHTMLElementPtr pElem = pCollection->item( a );
		//If src attribute is found that means we've a resource to download
		if(GetAttribute(pElem, L"src", sValue))
		{
			//If resource URL is relative
			if(!IsAbsolute(sValue))
			{
				if(sValue[0] == '/')
					sValue = sValue.substr(1, sValue.length()-1);
				//Create directories needed to hold this resource
				//CreateDirectories(sValue, m_sDir);
				//Download the resource
				if(1)//!DownloadResource(sValue, sValue))
				{
					std::string sTemp = m_sScheme + m_sHost;
					sTemp += sValue;
					//Update src to the new src and put the original src attribute as
					//srcdump just for future references
					if(sTemp[0] == '/')
						sTemp = sTemp.substr(1, sTemp.length()-1);
					SetAttribute(pElem, L"src", sTemp);
					SetAttribute(pElem, L"srcdump", sValue);
				}
				//Unable to download the resource
				else
				{
					//Put srcdump same as src, It if for no use, I just put it to make
					//HTML DOM consistent
					SetAttribute(pElem, L"srcdump", sValue);
				}
			}
			//If resource URL is absolute
			else
			{
				std::string sTemp;
				//Make URL relative
				sTemp = TrimHostName(sValue);
				//Create directories needed to hold this resource
				//CreateDirectories(sTemp, m_sDir);
				//Dowload the resource
				if(1)//DownloadResource(sTemp, sTemp))
				{
					//Update src to the new src and put the original src attribute as
					//srcdump just for future references
					if(sTemp[0] == '/')
						sTemp = sTemp.substr(1, sTemp.length()-1);
					SetAttribute(pElem, L"src", sTemp);
					SetAttribute(pElem, L"srcdump", sValue);
				}
			}
		}
	}
	
	//Get upated HTML out of amendments we made and save it to the described directory
	MSHTML::IHTMLDocument3Ptr pDoc3 = pDoc;
	MSHTML::IHTMLElementPtr pDocElem;
	pDoc3->get_documentElement(&pDocElem);
	BSTR bstrHtml;
	pDocElem->get_outerHTML(&bstrHtml);
	std::string sNewHtml((const wchar_t*)OLE2T(bstrHtml));
	SaveHtml(sNewHtml);
}