void parseHTML(WebPage *currpage, List *currlist, HashTable *currhash, int initdepth)
{
    int position = 0;
    char *newurl = NULL;
    char *end;

    // Parse through the HTML and extract every URL.
    while ((position = GetNextURL(currpage->html, position, currpage->url, &newurl)) > 0) {
        // Normalize the URL; skip it if normalization fails.
        if (NormalizeURL(newurl)) {
            // Strip internal fragment references (anything after '#').
            if ((end = strchr(newurl, '#'))) {
                *end = '\0';
            }
            // Keep only URLs within the required domain.
            if (strstr(newurl, URL_PREFIX) != NULL) {
                // insertHashTable returns 0 when the URL was not already present.
                if (insertHashTable(currhash, newurl) == 0) {
                    // New URL: copy it, wrap it in a WebPage, and add it to the list.
                    char *dummyurl = malloc(strlen(newurl) + 1);
                    strcpy(dummyurl, newurl);
                    WebPage *newpage = createWebPage(dummyurl, initdepth);
                    addtolist(currlist, newpage);
                }
            }
        }
        // The new WebPage owns its own copy, so the parsed URL is always freed here.
        free(newurl);
    }
}
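The deduplication above hinges on insertHashTable returning 0 only when the URL was not already stored. A minimal sketch of a chained hash table with that convention follows; the slot count, node layout, type names, and hash function are illustrative assumptions, not the project's own implementation.

#include <stdlib.h>
#include <string.h>

#define SLOTS 10000                        /* illustrative slot count */

typedef struct UrlNode {
    char *url;
    struct UrlNode *next;
} UrlNode;

typedef struct {
    UrlNode *table[SLOTS];
} UrlTable;

/* Simple string hash; the real project may use a different function. */
static unsigned long hashURL(const char *s)
{
    unsigned long h = 5381;
    while (*s) h = h * 33 + (unsigned char)*s++;
    return h % SLOTS;
}

/* Returns 0 if url was newly inserted, 1 if it was already present,
 * mirroring how parseHTML above tests the return value. */
int insertHashTable(UrlTable *ht, const char *url)
{
    unsigned long slot = hashURL(url);
    for (UrlNode *n = ht->table[slot]; n; n = n->next) {
        if (strcmp(n->url, url) == 0) {
            return 1;                      /* duplicate: caller skips it */
        }
    }
    UrlNode *n = malloc(sizeof(UrlNode));
    n->url = strdup(url);                  /* table keeps its own copy */
    n->next = ht->table[slot];
    ht->table[slot] = n;
    return 0;
}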
void CrawlPage(WebPage webpage)
{
    char *nexturl = NULL;
    int lastpos = 0;
    int depth = webpage.depth + 1;

    if (depth > maxWebPageDepth) return;

    printf("\n\n[crawler]: Crawling - %s\n\n", webpage.url);

    while ((lastpos = GetNextURL(webpage.html, lastpos, webpage.url, &nexturl)) > 0) {
        NormalizeURL(nexturl);
        if (!CheckURL(nexturl)) {
            // Set up a new page struct for the discovered URL.
            WebPage *newwebpage = calloc(1, sizeof(WebPage));
            newwebpage->url = calloc(strlen(nexturl) + 1, sizeof(char));
            strcpy(newwebpage->url, nexturl);
            newwebpage->depth = depth;

            // Get the new webpage.
            if (GetWebPage(newwebpage)) {
                if (HashTableInsert(nexturl)) {
                    // Not found in the hash table: record it, queue it, and write it out.
                    printf("[crawler]: Parser found new link - %s\n", nexturl);
                    struct ListNode *listentry = calloc(1, sizeof(ListNode));
                    listentry->page = newwebpage;
                    // Add to the list.
                    WebPageList->tail = InsertNode(WebPageList->tail, listentry);
                    WriteFile(*newwebpage, filenum++);   // then write the file
                } else {
                    CleanUpPage(newwebpage);
                }
            }
        }
        free(nexturl);
        nexturl = NULL;

        // Sleep for a second.
        sleep(INTERVAL_PER_FETCH);
    }
}
int main(int argc, char *argv[])
{
    // Check the number of arguments.
    if (argc != 4) {
        printf("Invalid Input Argument\n");
        printHelp();
        exit(1);
    }

    // Directory file path.
    int dirSize = strlen(argv[2]);
    char dir[dirSize + 1];
    dir[0] = '\0';
    strcat(dir, argv[2]);

    int urlSize = strlen(argv[1]);
    char inputURL[urlSize + 1];
    inputURL[0] = '\0';
    strcat(inputURL, argv[1]);

    // Get the max depth number and check that a correct depth is provided.
    int inputDepth = atoi(argv[3]);
    if (inputDepth > 4 || inputDepth < 0) {
        printf("Invalid [depth]\n");
        printHelp();
        exit(1);
    }

    // Check URL validity.
    if (!strstr(inputURL, URL_PREFIX)) {
        printf("Invalid input [seed url]\n");
        printHelp();
        exit(1);
    }

    // Check that the directory location is valid.
    DIR *directory = opendir(dir);
    if (directory) {
        closedir(directory);
    } else if (ENOENT == errno) {
        printf("Directory does not exist\n");
        printHelp();
        exit(1);
    } else {
        printf("Directory can't be opened\n");
        printHelp();
        exit(1);
    }

    // Init curl.
    curl_global_init(CURL_GLOBAL_ALL);

    // Set up the seed page.
    WebPage *seedWebPage = calloc(1, sizeof(WebPage));              // memory for the seed webpage
    seedWebPage->url = calloc(strlen(inputURL) + 1, sizeof(char));  // memory for the seed URL
    seedWebPage->url[0] = '\0';
    strcat(seedWebPage->url, inputURL);
    seedWebPage->depth = 0;
    seedWebPage->html = NULL;

    // Initialize data structures.
    HashTable *visitedURLHash = initHashTable();
    List *webPageList = initializeList();
    webPageList->head->page = seedWebPage;

    // Get the seed webpage.
    if (GetWebPage(seedWebPage)) {
        // Write the seed file.
        FILE *fPointer;
        char *pathVar1 = pathToDirectory(dir, fileNumber);
        fPointer = fopen(pathVar1, "w");
        free(pathVar1);
        writeHTMLtoFile(fPointer, seedWebPage);

        if (inputDepth == 0) {
            curl_global_cleanup();
            free(seedWebPage->html);
            free(seedWebPage->url);
            free(seedWebPage);
            // Free webPageList and the hashtable.
            free(webPageList);
            for (int i = 0; i < MAX_HASH_SLOT; i++) {
                free(visitedURLHash->table[i]->url);
                free(visitedURLHash->table[i]);
            }
            free(visitedURLHash);
            return 0;
        }
        fileNumber += 1;
        depth += 1;
        HashTableInsert(visitedURLHash, seedWebPage->url);   // mark the seed as visited

        // Extract URLs from the seed page.
        char *result;
        int pos = 0;
        while ((pos = GetNextURL(seedWebPage->html, pos, seedWebPage->url, &result)) > 0) {
            if (NormalizeURL(result) && strstr(result, URL_PREFIX)) {
                strtok(result, "#");
                // If not in the hashtable, add it to the hashtable and to the webPageList.
                if (HashTableLookup(visitedURLHash, result) == 0) {
                    HashTableInsert(visitedURLHash, result);
                    AppendList(webPageList, webPageInit(result, depth));
                }
            }
            free(result);
        }
        if (webPageList->head->next->next == NULL) {   // seed redirect case
            webPageList->head->next->page->depth = 0;
            fileNumber = 1;
        }
        tempWebPage = PopList(webPageList);   // get rid of the visited seed page
    } else {
        curl_global_cleanup();
        tempWebPage = PopList(webPageList);
        free(seedWebPage->html);
        free(seedWebPage->url);
        free(seedWebPage);
        free(webPageList);
        for (int i = 0; i < MAX_HASH_SLOT; i++) {
            free(visitedURLHash->table[i]->url);
            free(visitedURLHash->table[i]);
        }
        free(visitedURLHash);
        exit(1);
    }

    // While there are URLs to crawl.
    while (webPageList->head != NULL && webPageList->tail != NULL) {
        // Get the webpage for the next URL.
        tempWebPage = PopList(webPageList);
        if (GetWebPage(tempWebPage)) {
            // Write the page file.
            char *pathVar = pathToDirectory(dir, fileNumber);
            FILE *fPointer = fopen(pathVar, "w");
            free(pathVar);
            printf("Found link: %s\n", tempWebPage->url);
            writeHTMLtoFile(fPointer, tempWebPage);
            fileNumber += 1;

            if ((tempWebPage->depth + 1) <= inputDepth) {
                char *resultTemp;
                int posTemp = 0;
                while ((posTemp = GetNextURL(tempWebPage->html, posTemp, tempWebPage->url, &resultTemp)) > 0) {
                    if (NormalizeURL(resultTemp) && strstr(resultTemp, URL_PREFIX)) {
                        strtok(resultTemp, "#");
                        // Insert into the hashtable and the webPageList if not already present.
                        if (HashTableLookup(visitedURLHash, resultTemp) == 0) {
                            HashTableInsert(visitedURLHash, resultTemp);
                            AppendList(webPageList, webPageInit(resultTemp, tempWebPage->depth + 1));
                        }
                    }
                    free(resultTemp);
                }
            }
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        } else {
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        sleep(INTERVAL_PER_FETCH);
    }

    // Clean up curl.
    curl_global_cleanup();
    free(seedWebPage->url);
    free(seedWebPage->html);
    free(seedWebPage);
    free(webPageList);

    // Free the hashtable.
    for (int i = 0; i < MAX_HASH_SLOT; i++) {
        if (visitedURLHash->table[i]->url != NULL) {
            HashTableNode *currNode = visitedURLHash->table[i];
            while (currNode->next != NULL) {
                HashTableNode *tempNode = currNode;
                currNode = currNode->next;
                free(tempNode->url);
                free(tempNode);
            }
            free(currNode->url);
            free(currNode);
        } else {
            free(visitedURLHash->table[i]);
        }
    }
    free(visitedURLHash);
    return 0;
}
// Function to crawl a given webpage for links.
int CrawlPage(WebPage *wp)
{
    char *result;        // holds each extracted URL
    int pos = 0;         // position within the html page
    WebPage *newPage;    // new webpage

    // Check that the depth does not exceed the depth passed.
    if (wp->depth >= depth) {
        return 0;
    }

    printf("\n");
    printf("[crawler]: Crawling - %s\n", wp->url);   // print the URL being curled
    printf("\n");

    // Loop through the html page to get all of its URLs.
    while ((pos = GetNextURL(wp->html, pos, wp->url, &result)) >= 0) {
        // Check that the URL has the proper domain (old-www).
        if (strncmp(result, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
            free(result);
            continue;
        }
        // Normalize the URL.
        if (!NormalizeURL(result)) {
            free(result);
            continue;
        }
        // Check that the URL isn't already in the hash table.
        if (!InHashTable(result)) {
            AddToHashTable(result);   // add the URL to the hash table

            // Set up a new page for the URL.
            newPage = calloc(1, sizeof(WebPage));
            newPage->depth = wp->depth + 1;
            newPage->url = malloc(strlen(result) + 1);
            if (!newPage->url) {      // check that memory was allocated
                free(newPage);
                free(result);
                continue;
            }
            strcpy(newPage->url, result);

            // Get the html for the URL.
            if (!GetWebPage(newPage)) {
                FreeWebMemory(newPage);
                free(result);
                continue;
            }
            printf("[crawler]: Parser found link - %s\n", result);

            // Add to the list of webpages to be visited.
            if (!AppendList(newPage)) {
                free(result);
                return 0;
            }
        }
        free(result);
    }
    return 1;
}
// main crawler function
int main(int argc, char *argv[])
{
    // local variables
    FILE *fp;        // file pointer for html files
    char *nextURL;   // pointer to the next URL found on the seed page
    char *newURL;    // pointer to the next URL in the while loop

    // check command line arguments
    if (argc != 4) {
        printf("Incorrect number of arguments provided.\n");
        exit(1);
    }

    // check that the second argument is a directory
    stat(argv[2], &statbuffer);
    if (!S_ISDIR(statbuffer.st_mode)) {
        printf("Error, you did not supply a valid directory\n");
        exit(1);
    }

    // get arguments
    char *seedURL = argv[1];
    int filename_len = strlen(argv[2]) + 21;              // room for the directory plus a document number
    char *filename = calloc(filename_len, sizeof(char));

    // check the maxDepth
    int value = is_numeric(argv[3]);
    if (value != 0) {
        sscanf(argv[3], "%i", &maxDepth);
    } else {
        printf("Error! maxDepth must be a number\n");
        exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // initialize data structures/variables

    // initialize hashtable
    HashTable *table = malloc(sizeof(HashTable));
    memset(table, 0, sizeof(HashTable));   // zero every slot pointer, not just MAX_HASH_SLOT bytes

    // initialize linked list
    List *WebPageList;
    WebPageList = createList();

    // setup seed page: validate the seed URL; if it is bad, report and exit
    if (NormalizeURL(seedURL) == 0) {
        printf("Error, bad URL\n");
        exit(1);
    }

    // create the seed WebPage object and assign values to each part of the struct
    WebPage *seedPage = malloc(sizeof(WebPage));
    seedPage->url = seedURL;
    seedPage->html = NULL;
    seedPage->html_len = 0;
    seedPage->depth = 0;

    // try to get the webpage up to MAX_TRY times
    if (!GetWebPage(seedPage)) {
        for (tries = 0; tries < MAX_TRY; tries++) {
            if (GetWebPage(seedPage)) {
                break;
            }
        }
    }

    // write the seed file: html contents go to a file "1" in the given directory
    sprintf(filename, "%s/%d", argv[2], 1);
    fp = fopen(filename, "w");
    fputs(seedURL, fp);
    fputs("\n", fp);
    fprintf(fp, "%d\n", seedPage->depth);
    fputs(seedPage->html, fp);

    // close the file and wipe the filename
    fclose(fp);
    memset(filename, '\0', filename_len);

    // add seed page to hashtable
    add(table, seedURL);

    // extract urls from seed page:
    // while there are still URLs in the seed page's html
    while ((pos = GetNextURL(seedPage->html, pos, seedPage->url, &nextURL)) > 0) {
        // only visit them if that wouldn't exceed maxDepth
        if ((seedPage->depth + 1) > maxDepth) {
            free(seedPage);
            exit(1);
        }
        // ensure it's a valid url
        if (NormalizeURL(nextURL) != 0) {
            // also check that it's in the right domain
            if (strncmp(URL_PREFIX, nextURL, strlen(URL_PREFIX)) == 0) {
                // if it is added to the hashtable it is a unique URL that hasn't been
                // visited before, so add it to the linked list of URLs to visit
                if (add(table, nextURL)) {
                    // create a new webpage object
                    WebPage *pages = malloc(sizeof(WebPage));
                    pages->url = nextURL;
                    pages->html = NULL;
                    pages->html_len = 0;
                    pages->depth = 1;

                    // try to get the webpage up until MAX_TRY
                    tries = 0;
                    if (!GetWebPage(pages)) {
                        for (tries = 0; tries < MAX_TRY; tries++) {
                            if (GetWebPage(pages)) {
                                break;
                            }
                        }
                    }

                    // add it to the linked list
                    addToEnd(WebPageList, pages);
                }
            }
        }
    }

    // while there are urls to crawl
    while (WebPageList->head != NULL) {
        // get the next url from the list
        WebPage *nextPage = removeFromFront(WebPageList);

        // try to get the webpage up until MAX_TRY
        tries = 0;
        if (!GetWebPage(nextPage)) {
            for (tries = 0; tries < MAX_TRY; tries++) {
                if (GetWebPage(nextPage)) {
                    break;
                }
            }
        }

        // write page file
        sprintf(filename, "%s/%d", argv[2], docNum);
        fp = fopen(filename, "w");
        fputs(nextPage->url, fp);
        fputs("\n", fp);
        fprintf(fp, "%d\n", nextPage->depth);
        fputs(nextPage->html, fp);

        // close the file and wipe the filename (to be used next time)
        fclose(fp);
        memset(filename, '\0', filename_len);

        // increment the doc num
        docNum++;

        // check whether visiting the URLs on this page would exceed maxDepth
        if ((nextPage->depth + 1) > maxDepth) {
            free(nextPage);
            continue;
        }

        pos = 0;
        // iterate through all the URLs on the page
        while ((pos = GetNextURL(nextPage->html, pos, nextPage->url, &newURL)) > 0) {
            // check that the URL is in the proper format
            if (NormalizeURL(newURL) != 0) {
                // check that it is in the right domain
                if (strncmp(URL_PREFIX, newURL, strlen(URL_PREFIX)) == 0) {
                    // only add new ones to the list
                    if (add(table, newURL) != 0) {
                        // create a new WebPage object
                        WebPage *page = malloc(sizeof(WebPage));
                        page->url = newURL;
                        page->html = NULL;
                        page->html_len = 0;
                        page->depth = nextPage->depth + 1;

                        // try to get the webpage up until MAX_TRY
                        tries = 0;
                        if (!GetWebPage(page)) {
                            for (tries = 0; tries < MAX_TRY; tries++) {
                                if (GetWebPage(page)) {
                                    break;
                                }
                            }
                        }

                        // add the page to the linked list
                        addToEnd(WebPageList, page);
                    }
                }
            }
        }

        // Sleep for a bit to avoid annoying the target
        sleep(INTERVAL_PER_FETCH);

        // Free resources
        free(nextPage);
    }

    // cleanup curl
    curl_global_cleanup();

    // free resources
    // free hashtable
    hash = JenkinsHash(seedURL, MAX_HASH_SLOT);
    HashTableNode *freer = table->table[hash];
    HashTableNode *tempHash = NULL;
    while (freer != NULL) {
        tempHash = freer;
        freer = freer->next;
        free(tempHash);
    }
    free(table);

    // free linked list
    free(WebPageList);

    // free WebPage and filename pointer
    free(seedPage);
    free(filename);

    return 0;
}
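The fetch-with-retry pattern appears three times in the main above. A minimal sketch of how it could be factored into a helper; fetchWithRetries is a hypothetical name, not in the original source, and it assumes GetWebPage returns nonzero on success (as the surrounding code does), collapsing the initial attempt and the retries into a single loop of up to maxTries attempts.

/* Hypothetical helper, not part of the original source:
 * retries GetWebPage up to maxTries times and reports success. */
static int fetchWithRetries(WebPage *page, int maxTries) {
    for (int attempt = 0; attempt < maxTries; attempt++) {
        if (GetWebPage(page)) {
            return 1;   /* fetched successfully */
        }
    }
    return 0;           /* every attempt failed */
}

Each "tries = 0; if (!GetWebPage(...)) { for (...) ... }" block above could then be replaced by a single call such as "if (!fetchWithRetries(page, MAX_TRY)) { ... }".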
int GetNextURL(char *html, char *urlofthispage, char *result, int pos)
{
    char c;
    int len, i, j;
    char *p1;    //!< points to the start of a newly-found URL
    char *p2;    //!< points to the end of a newly-found URL

    // NEW
    // Clean up \n chars on the first call.
    if (pos == 0) {
        removeWhiteSpace(html);
    }
    // /NEW

    // Find the <a> or <A> HTML tag.
    while (0 != (c = html[pos])) {
        if ((c == '<') && ((html[pos+1] == 'a') || (html[pos+1] == 'A'))) {
            break;
        }
        pos++;
    }

    //! Find the URL in the HTML tag. They usually look like <a href="www.cs.dartmouth.edu">.
    //! We try to find the quote marks in order to find the URL inside them.
    if (c) {
        // Added by Matt Mukerjee
        // Check for the equals sign first; some HTML tags don't have quotes, or use single quotes instead.
        p1 = strchr(&(html[pos+1]), '=');
        if ((!p1) || (*(p1-1) == 'e') || ((p1 - html - pos) > 10)) {
            // Not an href; keep going.
            return GetNextURL(html, urlofthispage, result, pos+1);
        }
        if (*(p1+1) == '\"' || *(p1+1) == '\'') {   // added by Matt Mukerjee
            p1++;
        }
        p1++;                                       // added by Matt Mukerjee
        p2 = strpbrk(p1, "\'\">");
        if (!p2) {                                  // Added by Matt Mukerjee
            // No closing quote or bracket; keep going.
            return GetNextURL(html, urlofthispage, result, pos+1);
        }
        if (*p1 == '#') {
            // Internal fragment reference; no point returning it, so recursively keep going.
            // Added by Matt Mukerjee
            return GetNextURL(html, urlofthispage, result, pos+1);
        }
        if (!strncmp(p1, "mailto:", 7)) {
            return GetNextURL(html, urlofthispage, result, pos+1);
        }
        if (!strncmp(p1, "javascript:", 11)) {      // Added by Xiaochao
            return GetNextURL(html, urlofthispage, result, pos+1);
        }
        if (!strncmp(p1, "http", 4) || !strncmp(p1, "HTTP", 4)) {
            //! Nice! The URL we found is an absolute URL.
            strncpy(result, p1, (p2 - p1));
            result[p2 - p1] = '\0';   // strncpy does not null-terminate here
            return (int)(p2 - html + 1);
        } else {
            //! We found a URL. HTML is a terrible standard, so there are many ways to present a URL.
            if (p1[0] == '.') {
                //! Some URLs look like <a href="../../../a.txt">. We cannot handle this,
                //! so recursively keep going.
                // NEW
                return GetNextURL(html, urlofthispage, result, pos+1);
                // /NEW
            }
            if (p1[0] == '/') {
                //! The URL is an absolute path on the same host.
                for (i = 7; i < strlen(urlofthispage); i++) {
                    if (urlofthispage[i] == '/') {
                        break;
                    }
                }
                strcpy(result, urlofthispage);
                result[i] = 0;
                strncat(result, p1, (p2 - p1));
                return (int)(p2 - html + 1);
            } else {
                //! The URL is a relative path.
                len = strlen(urlofthispage);
                for (i = (len - 1); i >= 0; i--) {
                    if (urlofthispage[i] == '/') {
                        break;
                    }
                }
                for (j = (len - 1); j >= 0; j--) {
                    if (urlofthispage[j] == '.') {
                        break;
                    }
                }
                if (i == (len - 1)) {
                    //! urlofthispage is like http://www.cs.dartmouth.edu/
                    strcpy(result, urlofthispage);
                    result[i + 1] = 0;
                    strncat(result, p1, p2 - p1);
                    return (int)(p2 - html + 1);
                }
                if ((i <= 6) || (i > j)) {
                    //! urlofthispage is like http://www.cs.dartmouth.edu/~abc
                    //! or http://www.cs.dartmouth.edu
                    strcpy(result, urlofthispage);
                    result[len] = '/';
                    result[len + 1] = '\0';   // keep result a valid string before strncat
                    strncat(result, p1, p2 - p1);
                    return (int)(p2 - html + 1);
                }
                strcpy(result, urlofthispage);
                result[i + 1] = 0;
                strncat(result, p1, p2 - p1);
                return (int)(p2 - html + 1);
            }
        }
    }
    return -1;
}
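A minimal usage sketch for this version of GetNextURL (the one that takes a caller-owned result buffer and returns the next scan position). The HTML snippet, buffer size, and base URL below are illustrative only, and the example assumes it is linked against the file that defines GetNextURL and removeWhiteSpace.

#include <stdio.h>
#include <string.h>

int GetNextURL(char *html, char *urlofthispage, char *result, int pos);

int main(void) {
    // html must be writable: GetNextURL calls removeWhiteSpace() on it in place.
    char html[] = "<a href=\"/index.html\">home</a>"
                  "<a href=\"http://old-www.cs.dartmouth.edu/~x/a.html\">x</a>";
    char result[1024];
    int pos = 0;

    memset(result, 0, sizeof(result));   // defensively pre-zero the caller-owned buffer
    while ((pos = GetNextURL(html, "http://old-www.cs.dartmouth.edu/", result, pos)) > 0) {
        printf("found: %s\n", result);
        memset(result, 0, sizeof(result));
    }
    return 0;
}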