/**
 * Loads an HTML string into a freshly created in-memory MSHTML document and
 * reads back the inner HTML of its <body>.
 *
 * The function is a smoke test: the parsed body is fetched and then discarded.
 * COM failures (reported either as HRESULTs or as _com_error exceptions thrown
 * by the #import wrappers) are swallowed, so the function never throws
 * _com_error to its caller.
 *
 * @param szHTML  HTML markup to parse. A NULL pointer is treated as empty text.
 */
void CHtmlParser::testParseHtml(LPCTSTR szHTML)
{
    SAFEARRAY* psa = NULL;

    try
    {
        MSHTML::IHTMLDocument2Ptr pDoc;
        HRESULT hr = pDoc.CreateInstance(CLSID_HTMLDocument);
        if (FAILED(hr) || pDoc == NULL)
            return;

        // The VARIANT stored in the SAFEARRAY must own its BSTR, because
        // SafeArrayDestroy() clears (and therefore frees) every element.
        // Hand it a private copy instead of the pointer owned by bsData.
        bstr_t bsData = szHTML;
        BSTR bstrOwned = bsData.copy();

        psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
        VARIANT* param = NULL;
        if (psa == NULL ||
            FAILED(SafeArrayAccessData(psa, reinterpret_cast<void**>(&param))))
        {
            // The copy never made it into the array, so release it here.
            SysFreeString(bstrOwned);
        }
        else
        {
            param->vt = VT_BSTR;
            param->bstrVal = bstrOwned;

            // Unlock before use: a locked array cannot be destroyed.
            SafeArrayUnaccessData(psa);

            pDoc->write(psa);
            pDoc->close();

            // Throws _com_error(E_POINTER) if the document has no body.
            _bstr_t body = pDoc->body->innerHTML;
        }
    }
    catch (_com_error&)
    {
        // Deliberately best-effort: parsing failures are ignored.
    }

    if (psa != NULL)
        SafeArrayDestroy(psa);
}
int main(int argc, char* argv[]) { CoInitialize(NULL); ofstream dbfile("output.db"); string sLI; string m_strURL; HINTERNET hOpen, hFile; MSHTML::IHTMLDocument2Ptr pDoc; HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, (void**)&pDoc); SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1); VARIANT *param; hOpen = InternetOpen(L"UN/1.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0); hFile = InternetOpenUrl(hOpen, L"http://cnproxy.com/proxy1.html", NULL, 0, 0, 0); if (hFile) { CHAR buffer[100 * 1024]; DWORD dwRead; while (InternetReadFile(hFile, buffer, 1024, &dwRead)) { if (dwRead == 0) break; buffer[dwRead] = 0; bstr_t bsData = (LPCTSTR)buffer; hr = SafeArrayAccessData(psa, (LPVOID*)¶m); param->vt = VT_BSTR; param->bstrVal = (BSTR)bsData; cout << buffer << endl; dbfile << buffer << endl; hr = pDoc->write(psa); } //end while loop hr = pDoc->close(); InternetCloseHandle(hFile); SafeArrayDestroy(psa); } InternetCloseHandle(hOpen); dbfile.close(); CoUninitialize(); return 1; }
/** * It Downloads all the resources, main function of concern, all the required members variables must * be populated for it to work correctly */ void COfflineBrowser::BrowseOffline() { USES_CONVERSION; //Delete if this directory exists, it is a disputed call, you can safely comment it out DeleteDirectory(m_sDir); //Create the directory again CreateDirectory(m_sDir.c_str(), NULL); //SaveHtml(m_sHtml); //Create HTML Document which will be hold the html that was have MSHTML::IHTMLDocument2Ptr pDoc; HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, (void**)&pDoc); if(pDoc == NULL) return; //Load HTML to Html Document SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1); VARIANT *param; bstr_t bsData = (LPCTSTR)m_sHtml.c_str(); hr = SafeArrayAccessData(psa, (LPVOID*)¶m); param->vt = VT_BSTR; param->bstrVal = (BSTR)bsData; //write your buffer hr = pDoc->write(psa); //closes the document, "applying" your code hr = pDoc->close(); //Don't forget to free the SAFEARRAY! 
SafeArrayDestroy(psa); //Iterate through all the elements in the document MSHTML::IHTMLElementCollectionPtr pCollection = pDoc->all; for(long a=0;a<pCollection->length;a++) { std::string sValue; IHTMLElementPtr pElem = pCollection->item( a ); //If src attribute is found that means we've a resource to download if(GetAttribute(pElem, L"src", sValue)) { //If resource URL is relative if(!IsAbsolute(sValue)) { if(sValue[0] == '/') sValue = sValue.substr(1, sValue.length()-1); //Create directories needed to hold this resource //CreateDirectories(sValue, m_sDir); //Download the resource if(1)//!DownloadResource(sValue, sValue)) { std::string sTemp = m_sScheme + m_sHost; sTemp += sValue; //Update src to the new src and put the original src attribute as //srcdump just for future references if(sTemp[0] == '/') sTemp = sTemp.substr(1, sTemp.length()-1); SetAttribute(pElem, L"src", sTemp); SetAttribute(pElem, L"srcdump", sValue); } //Unable to download the resource else { //Put srcdump same as src, It if for no use, I just put it to make //HTML DOM consistent SetAttribute(pElem, L"srcdump", sValue); } } //If resource URL is absolute else { std::string sTemp; //Make URL relative sTemp = TrimHostName(sValue); //Create directories needed to hold this resource //CreateDirectories(sTemp, m_sDir); //Dowload the resource if(1)//DownloadResource(sTemp, sTemp)) { //Update src to the new src and put the original src attribute as //srcdump just for future references if(sTemp[0] == '/') sTemp = sTemp.substr(1, sTemp.length()-1); SetAttribute(pElem, L"src", sTemp); SetAttribute(pElem, L"srcdump", sValue); } } } } //Get upated HTML out of amendments we made and save it to the described directory MSHTML::IHTMLDocument3Ptr pDoc3 = pDoc; MSHTML::IHTMLElementPtr pDocElem; pDoc3->get_documentElement(&pDocElem); BSTR bstrHtml; pDocElem->get_outerHTML(&bstrHtml); std::string sNewHtml((const wchar_t*)OLE2T(bstrHtml)); SaveHtml(sNewHtml); }