// Runs the application for the given command line.
//
// When is_right_parameters() rejects the arguments, the result of
// show_usage(argv) is returned. Otherwise a parameters_t is built from the
// command line, a crawler_t is constructed from it and run, and 0 is returned.
int runner_t::run(int argc, char** argv)
{
    const bool arguments_ok = is_right_parameters(argc, argv);
    if (arguments_ok) {
        parameters_t parameters(argc, argv);
        crawler_t crawler(parameters);
        crawler.run();
        return 0;
    }

    return show_usage(argv);
}
void ExchangeRateNet::init(){ // url = QString("http://quote.hexun.com/forex/forex.aspx?type=3"); url = QString("http://exquote.yjfx.jp/quote.js"); timer = new QTimer(this); connect(timer, SIGNAL(timeout()), this, SLOT(crawler())); mgr = new QNetworkAccessManager(this); connect(mgr, SIGNAL(finished(QNetworkReply*)), this, SLOT(query(QNetworkReply*))); thread = new QThread; eaWorker = new ExchangeRateWorker; eaWorker->moveToThread(thread); connect(this, SIGNAL(doWork(QString)), eaWorker, SLOT(work(QString)), Qt::QueuedConnection); connect(eaWorker, SIGNAL(showMessage(QString)), this, SIGNAL(showMessage(QString))); thread->start(); }
void WebCrawler::crawler(const std::string &url, const size_t depth) { std::string page; std::string newUrl; // Filter output if (matchOutputFilter(url)) { m_filteredUrls.insert(url); } // End if reach depth if (depth > m_depth) return; // Check whether page already crawled if (m_searchedUrls.find(url) != m_searchedUrls.end()) { std::cout << ">>> Omit duplicate URL: " << url << std::endl; return; } // Add paged to crawled pages m_searchedUrls.insert(url); std::cout << ">>> Searching [depth = " << depth << "]: " << url << std::endl; // Fetch page page = m_cw->fetch(url); // Return if page is empty if (page.size() == 0) return; // Iterate all hrefs on page auto hrefs = getHrefs(page); for (auto it = hrefs.rbegin(); it != hrefs.rend(); ++it) { // Omit invalid url if (it->size() == 0 || it->at(0) == '#') continue; // Process Url newUrl = processUrl(url, *it); // std::cout << '\t' << url << " >> " << *it << " >> " << newUrl << std::endl; crawler(newUrl, depth + 1); } }
// Program entry point: prints the banner, lists everything below the
// crawler's start path into a FileSet and prints its summary.
//
// When the first command-line argument starts with the character '1',
// "skip decrypted" mode is passed on to PathCrawler::listDir().
// Always returns 0.
int main (int argc, char *argv[])
{
    FileSet fileSet;
    PathCrawler crawler(&fileSet);

    printf("---------------------------\n");
    printf("DMA recon v1.4\n");
    printf("---------------------------\n");

    const bool skip_decrypted = (argc >= 2) && (argv[1][0] == '1');
    if (skip_decrypted)
        printf("Skipping decrypted!\n");

    crawler.listDir(crawler.startPath, skip_decrypted);
    printf("listing finished...\n");

    fileSet.printSummary();

    // Keeps the console window open until a key is pressed.
    system("pause");
    return 0;
}
// Runs "git clean" for the selected paths (and optionally for submodules)
// according to the options chosen in CCleanTypeDlg.
//
// Two execution modes exist:
//  * dry run or "no recycle bin": the git commands are queued in a
//    CProgressDlg, which runs them; with a real run a "retry" button is
//    offered in the post-fail command list.
//  * otherwise: "git clean -n" is executed directly, its "Would remove ..."
//    output lines are parsed into a path list, and the files are removed via
//    CTGitPathList::DeleteAllFiles(true, false)
//    (presumably this moves them to the recycle bin — TODO confirm).
//
// Returns TRUE only when the CProgressDlg finishes with IDOK.
// NOTE(review): bRet is assigned only inside the disabled "#if 0" block, so
// every other path, including a completed deletion in the second mode,
// returns false.
bool CleanupCommand::Execute()
{
    bool bRet = false;

    CCleanTypeDlg dlg;
    if( dlg.DoModal() == IDOK)
    {
        // Decides below whether paths in git's output need to be unquoted
        // and unescaped before use.
        bool quotepath = g_Git.GetConfigValueBool(_T("core.quotepath"));

        CString cmd;
        cmd.Format(_T("git.exe clean"));
        // "-n" (dry run) is also used when files are to be deleted by this
        // function itself: the output is then only a list of candidates.
        if (dlg.m_bDryRun || !dlg.m_bNoRecycleBin)
            cmd += _T(" -n ");
        // "-d": also remove untracked directories.
        if(dlg.m_bDir)
            cmd += _T(" -d ");
        // "-fx": ignored files are removed too; "-f": untracked files only;
        // "-fX": only ignored files.
        switch(dlg.m_CleanType)
        {
        case 0:
            cmd += _T(" -fx");
            break;
        case 1:
            cmd += _T(" -f");
            break;
        case 2:
            cmd += _T(" -fX");
            break;
        }

        // Collect the submodules to clean; payload fills submoduleList.
        STRING_VECTOR submoduleList;
        SubmodulePayload payload(submoduleList);
        if (dlg.m_bSubmodules)
        {
            payload.basePath = CTGitPath(g_Git.m_CurrentDir).GetGitPathString();
            // A single empty path adds no prefix; otherwise each selected
            // directory (or a file's containing directory) becomes a prefix.
            if (pathList.GetCount() != 1 || pathList.GetCount() == 1 && !pathList[0].IsEmpty())
            {
                for (int i = 0; i < pathList.GetCount(); ++i)
                {
                    CString path;
                    if (pathList[i].IsDirectory())
                        payload.prefixList.push_back(pathList[i].GetGitPathString());
                    else
                        payload.prefixList.push_back(pathList[i].GetContainingDirectory().GetGitPathString());
                }
            }
            if (!GetSubmodulePathList(payload))
                return FALSE;
            std::sort(submoduleList.begin(), submoduleList.end());
        }

        if (dlg.m_bDryRun || dlg.m_bNoRecycleBin)
        {
            // Mode 1: let the progress dialog run git; loop only to support "retry".
            while (true)
            {
                CProgressDlg progress;
                // One command per selected path, executed in the current working tree.
                for (int i = 0; i < this->pathList.GetCount(); ++i)
                {
                    CString path;
                    if (this->pathList[i].IsDirectory())
                        path = pathList[i].GetGitPathString();
                    else
                        path = pathList[i].GetContainingDirectory().GetGitPathString();

                    progress.m_GitDirList.push_back(g_Git.m_CurrentDir);
                    progress.m_GitCmdList.push_back(cmd + _T(" \"") + path + _T("\""));
                }

                // One path-less command per submodule, executed inside that submodule.
                if (dlg.m_bSubmodules)
                {
                    for (CString dir : submoduleList)
                    {
                        progress.m_GitDirList.push_back(CTGitPath(dir).GetWinPathString());
                        progress.m_GitCmdList.push_back(cmd);
                    }
                }

                // A retry entry is offered only for a real (non dry-run) clean.
                INT_PTR idRetry = -1;
                if (!dlg.m_bDryRun)
                    idRetry = progress.m_PostFailCmdList.Add(CString(MAKEINTRESOURCE(IDS_MSGBOX_RETRY)));
                INT_PTR result = progress.DoModal();
                if (result == IDOK)
                    return TRUE;
                // Rebuild the dialog and run again when git failed and retry was chosen.
                if (progress.m_GitStatus && result == IDC_PROGRESS_BUTTON1 + idRetry)
                    continue;
                break;
            }
        }
        else
        {
            // Mode 2: run the dry-run command ourselves and delete the listed files.
            CSysProgressDlg sysProgressDlg;
            sysProgressDlg.SetAnimation(IDR_CLEANUPANI);
            sysProgressDlg.SetTitle(CString(MAKEINTRESOURCE(IDS_APPNAME)));
            sysProgressDlg.SetLine(1, CString(MAKEINTRESOURCE(IDS_PROC_CLEANUP_INFO1)));
            sysProgressDlg.SetLine(2, CString(MAKEINTRESOURCE(IDS_PROGRESSWAIT)));
            sysProgressDlg.SetShowProgressBar(false);
            sysProgressDlg.ShowModeless((HWND)NULL, true);

            CTGitPathList delList;
            // Iteration 0 handles the main working tree (g_Git); iteration i > 0
            // handles submoduleList[i - 1] through a temporary CGit instance.
            for (size_t i = 0; i <= submoduleList.size(); ++i)
            {
                CGit git;
                CGit *pGit;
                if (i == 0)
                    pGit = &g_Git;
                else
                {
                    git.m_CurrentDir = submoduleList[i - 1];
                    pGit = &git;
                }
                CString cmdout, cmdouterr;
                // A non-zero result from Run() is treated as failure.
                if (pGit->Run(cmd, &cmdout, &cmdouterr, CP_UTF8))
                {
                    MessageBox(nullptr, cmdouterr, _T("TortoiseGit"), MB_ICONERROR);
                    return FALSE;
                }

                if (sysProgressDlg.HasUserCancelled())
                {
                    CMessageBox::Show(nullptr, IDS_SVN_USERCANCELLED, IDS_APPNAME, MB_OK);
                    return FALSE;
                }

                // Parse the output line by line; only "Would remove <path>"
                // lines contribute to the deletion list.
                int pos = 0;
                CString token = cmdout.Tokenize(_T("\n"), pos);
                while (!token.IsEmpty())
                {
                    if (token.Mid(0, 13) == _T("Would remove "))
                    {
                        CString tempPath = token.Mid(13).TrimRight();
                        if (quotepath)
                        {
                            tempPath = UnescapeQuotePath(tempPath.Trim(_T('"')));
                        }
                        // Paths reported for a submodule are prefixed with
                        // the submodule's own path.
                        if (i == 0)
                            delList.AddPath(CTGitPath(tempPath));
                        else
                            delList.AddPath(CTGitPath(submoduleList[i - 1] + "/" + tempPath));
                    }

                    token = cmdout.Tokenize(_T("\n"), pos);
                }

                if (sysProgressDlg.HasUserCancelled())
                {
                    CMessageBox::Show(nullptr, IDS_SVN_USERCANCELLED, IDS_APPNAME, MB_OK);
                    return FALSE;
                }
            }

            delList.DeleteAllFiles(true, false);

            sysProgressDlg.Stop();
        }
    }
    // Disabled SVN-based cleanup implementation, not compiled.
#if 0
    CProgressDlg progress;
    progress.SetTitle(IDS_PROC_CLEANUP);
    progress.SetAnimation(IDR_CLEANUPANI);
    progress.SetShowProgressBar(false);
    progress.SetLine(1, CString(MAKEINTRESOURCE(IDS_PROC_CLEANUP_INFO1)));
    progress.SetLine(2, CString(MAKEINTRESOURCE(IDS_PROC_CLEANUP_INFO2)));
    progress.ShowModeless(hwndExplorer);

    CString strSuccessfullPaths, strFailedPaths;
    for (int i=0; i<pathList.GetCount(); ++i)
    {
        SVN svn;
        if (!svn.CleanUp(pathList[i]))
        {
            strFailedPaths += _T("- ") + pathList[i].GetWinPathString() + _T("\n");
            strFailedPaths += svn.GetLastErrorMessage() + _T("\n\n");
        }
        else
        {
            strSuccessfullPaths += _T("- ") + pathList[i].GetWinPathString() + _T("\n");

            // after the cleanup has finished, crawl the path downwards and send a change
            // notification for every directory to the shell. This will update the
            // overlays in the left tree view of the explorer.
            CDirFileEnum crawler(pathList[i].GetWinPathString());
            CString sPath;
            bool bDir = false;
            CTSVNPathList updateList;
            while (crawler.NextFile(sPath, &bDir))
            {
                if ((bDir) && (!g_SVNAdminDir.IsAdminDirPath(sPath)))
                {
                    updateList.AddPath(CTSVNPath(sPath));
                }
            }
            updateList.AddPath(pathList[i]);
            CShellUpdater::Instance().AddPathsForUpdate(updateList);
            CShellUpdater::Instance().Flush();
            updateList.SortByPathname(true);
            for (INT_PTR i=0; i<updateList.GetCount(); ++i)
            {
                SHChangeNotify(SHCNE_UPDATEITEM, SHCNF_PATH, updateList[i].GetWinPath(), NULL);
                CTraceToOutputDebugString::Instance()(_T(__FUNCTION__) _T(": notify change for path %s\n"), updateList[i].GetWinPath());
            }
        }
    }
    progress.Stop();

    CString strMessage;
    if ( !strSuccessfullPaths.IsEmpty() )
    {
        CString tmp;
        tmp.Format(IDS_PROC_CLEANUPFINISHED, (LPCTSTR)strSuccessfullPaths);
        strMessage += tmp;
        bRet = true;
    }
    if ( !strFailedPaths.IsEmpty() )
    {
        if (!strMessage.IsEmpty())
            strMessage += _T("\n");
        CString tmp;
        tmp.Format(IDS_PROC_CLEANUPFINISHED_FAILED, (LPCTSTR)strFailedPaths);
        strMessage += tmp;
        bRet = false;
    }
    CMessageBox::Show(hwndExplorer, strMessage, _T("TortoiseGit"), MB_OK | (strFailedPaths.IsEmpty()?MB_ICONINFORMATION:MB_ICONERROR));
#endif
    // Flush the shell updater before returning.
    CShellUpdater::Instance().Flush();
    return bRet;
}
// Starts the crawl at m_url with a recursion depth of 0.
void WebCrawler::perform()
{
    const size_t initialDepth = 0;
    crawler(m_url, initialDepth);
}
// Runs "git clean" for the selected paths according to the options chosen in
// CCleanTypeDlg.
//
// Two execution modes exist:
//  * dry run or "no recycle bin": one git command per selected path is
//    queued in a CProgressDlg, which runs them.
//  * otherwise: "git clean -n" is executed directly through g_Git, its
//    "Would remove ..." output lines are parsed into a path list, and the
//    files are removed via CTGitPathList::DeleteAllFiles(true, false)
//    (presumably this moves them to the recycle bin — TODO confirm).
//
// Returns TRUE only when the CProgressDlg finishes with IDOK.
// NOTE(review): bRet is assigned only inside the disabled "#if 0" block, so
// every other path, including a completed deletion in the second mode,
// returns false.
bool CleanupCommand::Execute()
{
    bool bRet = false;

    CCleanTypeDlg dlg;
    if( dlg.DoModal() == IDOK)
    {
        // Decides below whether paths in git's output need to be unquoted
        // and unescaped before use.
        bool quotepath = g_Git.GetConfigValueBool(_T("core.quotepath"));

        CString cmd;
        cmd.Format(_T("git clean"));
        // "-n" (dry run) is also used when files are to be deleted by this
        // function itself: the output is then only a list of candidates.
        if (dlg.m_bDryRun || !dlg.m_bNoRecycleBin)
            cmd += _T(" -n ");
        // "-d": also remove untracked directories.
        if(dlg.m_bDir)
            cmd += _T(" -d ");
        // "-fx": ignored files are removed too; "-f": untracked files only;
        // "-fX": only ignored files.
        switch(dlg.m_CleanType)
        {
        case 0:
            cmd += _T(" -fx");
            break;
        case 1:
            cmd += _T(" -f");
            break;
        case 2:
            cmd += _T(" -fX");
            break;
        }

        if (dlg.m_bDryRun || dlg.m_bNoRecycleBin)
        {
            // Mode 1: let the progress dialog run one command per selected path.
            CProgressDlg progress;
            for (int i = 0; i < this->pathList.GetCount(); ++i)
            {
                CString path;
                // A file is cleaned through its containing directory.
                if (this->pathList[i].IsDirectory())
                    path = pathList[i].GetGitPathString();
                else
                    path = pathList[i].GetContainingDirectory().GetGitPathString();

                progress.m_GitCmdList.push_back(cmd + _T(" \"") + path + _T("\""));
            }
            if (progress.DoModal()==IDOK)
                return TRUE;
        }
        else
        {
            // Mode 2: run the dry-run command ourselves and delete the listed files.
            // NOTE(review): the command is run without any path argument, so
            // pathList is not consulted in this branch.
            CSysProgressDlg sysProgressDlg;
            sysProgressDlg.SetAnimation(IDR_CLEANUPANI);
            sysProgressDlg.SetTitle(CString(MAKEINTRESOURCE(IDS_APPNAME)));
            sysProgressDlg.SetLine(1, CString(MAKEINTRESOURCE(IDS_PROC_CLEANUP_INFO1)));
            sysProgressDlg.SetLine(2, CString(MAKEINTRESOURCE(IDS_PROGRESSWAIT)));
            sysProgressDlg.SetShowProgressBar(false);
            sysProgressDlg.ShowModeless((HWND)NULL, true);

            CString cmdout, cmdouterr;
            // A non-zero result from Run() is treated as failure.
            if (g_Git.Run(cmd, &cmdout, &cmdouterr, CP_UTF8))
            {
                MessageBox(NULL, cmdouterr, _T("TortoiseGit"), MB_ICONERROR);
                return FALSE;
            }

            if (sysProgressDlg.HasUserCancelled())
            {
                CMessageBox::Show(NULL, IDS_SVN_USERCANCELLED, IDS_APPNAME, MB_OK);
                return FALSE;
            }

            // Parse the output line by line; only "Would remove <path>"
            // lines contribute to the deletion list.
            int pos = 0;
            CString token = cmdout.Tokenize(_T("\n"), pos);
            CTGitPathList delList;
            while (!token.IsEmpty())
            {
                if (token.Mid(0, 13) == _T("Would remove "))
                {
                    CString tempPath = token.Mid(13).TrimRight();
                    if (quotepath)
                    {
                        tempPath = UnescapeQuotePath(tempPath.Trim(_T('"')));
                    }
                    delList.AddPath(CTGitPath(tempPath));
                }

                token = cmdout.Tokenize(_T("\n"), pos);
            }

            if (sysProgressDlg.HasUserCancelled())
            {
                CMessageBox::Show(NULL, IDS_SVN_USERCANCELLED, IDS_APPNAME, MB_OK);
                return FALSE;
            }

            delList.DeleteAllFiles(true, false);

            sysProgressDlg.Stop();
        }
    }
    // Disabled SVN-based cleanup implementation, not compiled.
#if 0
    CProgressDlg progress;
    progress.SetTitle(IDS_PROC_CLEANUP);
    progress.SetAnimation(IDR_CLEANUPANI);
    progress.SetShowProgressBar(false);
    progress.SetLine(1, CString(MAKEINTRESOURCE(IDS_PROC_CLEANUP_INFO1)));
    progress.SetLine(2, CString(MAKEINTRESOURCE(IDS_PROC_CLEANUP_INFO2)));
    progress.ShowModeless(hwndExplorer);

    CString strSuccessfullPaths, strFailedPaths;
    for (int i=0; i<pathList.GetCount(); ++i)
    {
        SVN svn;
        if (!svn.CleanUp(pathList[i]))
        {
            strFailedPaths += _T("- ") + pathList[i].GetWinPathString() + _T("\n");
            strFailedPaths += svn.GetLastErrorMessage() + _T("\n\n");
        }
        else
        {
            strSuccessfullPaths += _T("- ") + pathList[i].GetWinPathString() + _T("\n");

            // after the cleanup has finished, crawl the path downwards and send a change
            // notification for every directory to the shell. This will update the
            // overlays in the left tree view of the explorer.
            CDirFileEnum crawler(pathList[i].GetWinPathString());
            CString sPath;
            bool bDir = false;
            CTSVNPathList updateList;
            while (crawler.NextFile(sPath, &bDir))
            {
                if ((bDir) && (!g_SVNAdminDir.IsAdminDirPath(sPath)))
                {
                    updateList.AddPath(CTSVNPath(sPath));
                }
            }
            updateList.AddPath(pathList[i]);
            CShellUpdater::Instance().AddPathsForUpdate(updateList);
            CShellUpdater::Instance().Flush();
            updateList.SortByPathname(true);
            for (INT_PTR i=0; i<updateList.GetCount(); ++i)
            {
                SHChangeNotify(SHCNE_UPDATEITEM, SHCNF_PATH, updateList[i].GetWinPath(), NULL);
                ATLTRACE(_T("notify change for path %s\n"), updateList[i].GetWinPath());
            }
        }
    }
    progress.Stop();

    CString strMessage;
    if ( !strSuccessfullPaths.IsEmpty() )
    {
        CString tmp;
        tmp.Format(IDS_PROC_CLEANUPFINISHED, (LPCTSTR)strSuccessfullPaths);
        strMessage += tmp;
        bRet = true;
    }
    if ( !strFailedPaths.IsEmpty() )
    {
        if (!strMessage.IsEmpty())
            strMessage += _T("\n");
        CString tmp;
        tmp.Format(IDS_PROC_CLEANUPFINISHED_FAILED, (LPCTSTR)strFailedPaths);
        strMessage += tmp;
        bRet = false;
    }
    CMessageBox::Show(hwndExplorer, strMessage, _T("TortoiseGit"), MB_OK | (strFailedPaths.IsEmpty()?MB_ICONINFORMATION:MB_ICONERROR));
#endif
    // Flush the shell updater before returning.
    CShellUpdater::Instance().Flush();
    return bRet;
}
//----------------------------------------------------------------
// IElement::load
//
// Attempts to load this element (id, payload size, payload) from the
// current position of the given storage.
//
// storage     - source to read from; its position is restored when the
//               element id found there does not match getId().
// bytesToRead - nothing is loaded when this is zero; it is not otherwise
//               consulted in this function.
// loader      - optional delegate; it gets the first chance to load each
//               part of the payload and is notified via loaded() afterwards.
//
// Returns the number of bytes recorded in this element's receipt, or 0 when
// bytesToRead is zero, the element id does not match, or the delegate
// signals (by returning uintMax[8]) that it wants no more data.
//
uint64
IElement::load(IStorage & storage,
               uint64 bytesToRead,
               IDelegateLoad * loader)
{
  if (!bytesToRead)
  {
    return 0;
  }

  // save a storage receipt so that element position references
  // can be resolved later:
  IStorage::IReceiptPtr storageReceipt = storage.receipt();

  // save current seek position, so it can be restored if necessary:
  IStorage::TSeek storageStart(storage);

  uint64 eltId = loadEbmlId(storage);
  if (eltId != getId())
  {
    // element id wrong for my type:
    return 0;
  }

  // disabled debug trace of the element id, position and payload size:
#if 0 // !defined(NDEBUG) && (defined(DEBUG) || defined(_DEBUG))
  Indent::More indentMore(Indent::depth_);
  {
    IStorage::TSeek restore(storage);
    uint64 vsizeSize = 0;
    uint64 vsize = vsizeDecode(storage, vsizeSize);
    std::cout
      << indent()
      << std::setw(8) << uintEncode(getId())
      << " @ " << std::hex
      << "0x" << storageStart.absolutePosition()
      << std::dec
      << " -- " << getName()
      << ", payload " << vsize << " bytes" << std::endl;
  }
#endif

  // this appears to be a good payload:
  storageStart.doNotRestore();

  // store the storage receipt:
  receipt_ = storageReceipt;

  // read payload size:
  uint64 vsizeSize = 0;
  uint64 payloadSize = vsizeDecode(storage, vsizeSize);
  // a decoded size equal to uintMax[8] is treated as "size unknown":
  const bool payloadSizeUnknown = (payloadSize == uintMax[8]);

  // keep track of the number of bytes read successfully:
  receipt_->add(uintNumBytes(eltId));
  receipt_->add(vsizeSize);

  // clear the payload checksum:
  setCrc32(false);
  storedCrc32_ = 0;
  computedCrc32_ = 0;

  // save the payload storage receipt so that element position references
  // can be resolved later:
  IStorage::IReceiptPtr receiptPayload = storage.receipt();
  offsetToPayload_ = receiptPayload->position() - receipt_->position();
  offsetToCrc32_ = kUndefinedOffset;

  // shortcut:
  IPayload & payload = getPayload();

  // container elements may be present in any order, therefore
  // not every load will succeed -- keep trying until all
  // load attempts fail:
  //
  // with an unknown payload size the remaining-bytes counter starts at
  // uintMax[8], so the loop ends only when an iteration reads nothing:
  uint64 payloadBytesToRead = payloadSize;
  uint64 payloadBytesReadTotal = 0;
  while (payloadBytesToRead)
  {
    uint64 prevPayloadBytesToRead = payloadBytesToRead;

    // try to load some part of the payload:
    uint64 partialPayloadSize = 0;
    if (loader)
    {
      uint64 bytesRead = loader->load(storage,
                                      payloadBytesToRead,
                                      eltId,
                                      payload);
      if (bytesRead == uintMax[8])
      {
        // special case, indicating that the loader doesn't
        // want to read any more data:
        storageStart.doRestore();
        loader->loaded(*this);
        return 0;
      }

      partialPayloadSize += bytesRead;
      payloadBytesToRead -= bytesRead;
    }

    // fall back to the payload's own loader when the delegate
    // is absent or read nothing:
    if (!partialPayloadSize)
    {
      uint64 bytesRead = payload.load(storage,
                                      payloadBytesToRead,
                                      loader);
      partialPayloadSize += bytesRead;
      payloadBytesToRead -= bytesRead;
    }

    // consume any void elements that may exist:
    IPayload::TVoid eltVoid;
    uint64 voidPayloadSize = eltVoid.load(storage,
                                          payloadBytesToRead,
                                          loader);
    if (voidPayloadSize)
    {
      payloadBytesToRead -= voidPayloadSize;

      // find an element to store the Void element, so that
      // the relative element order would be preserved:
      //
      // NOTE(review): the reason for the "- 2" offset is not
      // visible here -- confirm against FindElement:
      IStorage::IReceiptPtr voidReceipt = eltVoid.storageReceipt();
      FindElement crawler(voidReceipt->position() - 2);

      // the search is only performed when something was loaded
      // ahead of the Void element in this iteration:
      if (partialPayloadSize)
      {
        crawler.evalPayload(payload);
        assert(crawler.eltFound_);
      }

      if (crawler.eltFound_)
      {
        IPayload & dstPayload = crawler.eltFound_->getPayload();
        dstPayload.voids_.push_back(eltVoid);
      }
      else
      {
        payload.voids_.push_back(eltVoid);
      }
    }

    // consume the CRC-32 element if it exists:
    payloadBytesToRead -= loadCrc32(storage,
                                    payloadBytesToRead);

    // stop as soon as an iteration makes no progress at all:
    uint64 payloadBytesRead = prevPayloadBytesToRead - payloadBytesToRead;
    payloadBytesReadTotal += payloadBytesRead;

    if (payloadBytesRead == 0)
    {
      break;
    }
  }

  if (payloadBytesReadTotal < payloadSize && !payloadSizeUnknown)
  {
    // skip unrecognized alien data:
    uint64 alienDataSize = payloadSize - payloadBytesReadTotal;

#if !defined(NDEBUG) && (defined(DEBUG) || defined(_DEBUG))
    std::cerr
      << indent() << "WARNING: " << getName()
      << " 0x" << uintEncode(getId())
      << " -- skipping " << alienDataSize
      << " bytes of unrecognized alien data @ 0x"
      << std::hex
      << storage.receipt()->position()
      << std::dec
      << std::endl;
#endif

    storage.skip(alienDataSize);
    payloadBytesReadTotal = payloadSize;
  }

  // account for the payload bytes in the element receipt:
  receiptPayload->add(payloadBytesReadTotal);
  *receipt_ += receiptPayload;

  // verify stored payload CRC-32 checksum:
  if (shouldComputeCrc32())
  {
    IStorage::IReceiptPtr receiptCrc32 = crc32Receipt();

    Crc32 crc32;
    receiptPayload->calcCrc32(crc32, receiptCrc32);
    computedCrc32_ = crc32.checksum();

    // a mismatch is only reported, loading continues regardless:
    if (computedCrc32_ != storedCrc32_)
    {
#if 1 // !defined(NDEBUG) && (defined(DEBUG) || defined(_DEBUG))
      std::cerr
        << indent() << "WARNING: " << getName()
        << " 0x" << uintEncode(getId())
        << " -- checksum mismatch, loaded "
        << std::hex << storedCrc32_
        << ", computed " << computedCrc32_
        << ", CRC-32 @ 0x" << receiptCrc32->position()
        << ", payload @ 0x" << receiptPayload->position()
        << ":" << receiptPayload->numBytes()
        << std::dec
        << std::endl;

      // the checksum is computed a second time into a scratch object;
      // its result is not used here (presumably a debugging aid):
      Crc32 doOverCrc32;
      receiptPayload->calcCrc32(doOverCrc32, receiptCrc32);
#endif
    }
  }

  if (loader && receipt_->numBytes())
  {
    // allow the delegate to perform post-processing on the loaded element:
    loader->loaded(*this);
  }

  return receipt_->numBytes();
}
int main(int argc, char **argv) { if (argc != 3) { printUsage(argv[0]); return 0; } if (strcmp(argv[1], "--crawl") == 0) { Logger::setOutputFile("log-crawl_test.txt"); WebCrawler crawler(120000); crawler.setDownloadInterval(5); crawler.setPagesDir("data/pages_test"); if (!crawler.startCrawl(argv[2])) return 0; crawler.saveToDisk("data/pagesData_total.txt"); } else if (strcmp(argv[1], "--resume-crawl") == 0) { Logger::setOutputFile("log-crawl.txt"); WebCrawler crawler(120000); crawler.setDownloadInterval(5); crawler.setPagesDir("pages"); if (!crawler.resumeCrawl(argv[2])) return 0; crawler.saveToDisk("data/pagesData_total.txt"); } else if (strcmp(argv[1], "--stat") == 0) { Logger::setOutputFile("log-stat.txt"); PagesStatist statist(argv[2]); const std::vector<PR>& pageRank = statist.getPageRank(); TMK_LOG_ALL("Saving PR to file \n"); tmk::saveToDisk(pageRank, "data/PR.txt"); std::vector<std::pair<Url, PR> > top20; statist.getTopPages(20, top20); TMK_LOG_ALL("Top 20 pages by PR:\n"); for (int i = 0; i < top20.size(); ++i) { TMK_LOG_ALL("PR: %f\t%s\n", top20[i].second, top20[i].first.c_str()); } const std::vector<size_t>& pagesSize = statist.getPageSizesInBytes(); TMK_LOG_ALL("Saving pages Sizes to file \n"); tmk::saveToDisk(pagesSize, "data/pageSize.txt"); const std::vector<size_t>& pagesOutgoingLinksCount = statist.getPagesOutgoingLinksCount(); TMK_LOG_ALL("Saving pages outgoing links count to file\n"); tmk::saveToDisk(pagesOutgoingLinksCount, "data/pageOutLinks.txt"); const std::vector<size_t>& pagesIncomingLinksCount = statist.getPagesIncomingLinksCount(); TMK_LOG_ALL("Saving pages incoming links count to file\n"); tmk::saveToDisk(pagesIncomingLinksCount, "data/pageInLinks.txt"); const std::vector<size_t>& pageDistancesFromMain = statist.getPageDistancesFromMain(); TMK_LOG_ALL("Saving pages distances from main page to file\n"); tmk::saveToDisk(pageDistancesFromMain, "data/pageDistances.txt"); size_t maxPageDist = 0; for (auto it = pageDistancesFromMain.begin(); it 
!= pageDistancesFromMain.end(); ++it) { maxPageDist = std::max(maxPageDist, *it); } TMK_LOG_ALL("Maximal distance from main page: %zu\n", maxPageDist); } else { printUsage(argv[0]); } return 0; }