Example #1
void parseHTML(WebPage *currpage, List *currlist, HashTable *currhash, int initdepth){
  int position = 0;
  char *newurl = NULL;
  char *end;
  // Parse the HTML file and get every URL
  while((position = GetNextURL(currpage->html, position, currpage->url, &newurl)) > 0){
    // Normalize the URL; if it is bad, free it right away
    if (NormalizeURL(newurl)){
      // Strip internal references (anything after '#')
      if((end = strchr(newurl, '#'))){
        *end = '\0';
      }
      // Check that the URL is within the defined domain
      if( strstr(newurl, URL_PREFIX) != NULL ){
        if ( insertHashTable(currhash, newurl) == 0 ){
          // If it was added to the hashtable, create a new webpage for the
          // found URL and add it to our list
          char *dummyurl = malloc(strlen(newurl) + 1);
          strcpy(dummyurl, newurl);
          WebPage *newpage = createWebPage(dummyurl, initdepth);
          addtolist(currlist, newpage);
          free(newurl);
        } else free(newurl);
      } else free(newurl);
    } else free(newurl);
  }
}
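A minimal sketch of how parseHTML might be driven once a page has been fetched. The crawlOnePage wrapper below is hypothetical; GetWebPage and the structure types follow the naming used in the other examples and are assumed to behave the same way here.

// Hypothetical driver (not part of the original example): fetch one page,
// then queue every link found on it at depth + 1.
// Assumes GetWebPage() fills currpage->html, as in the other examples.
static void crawlOnePage(WebPage *currpage, List *frontier, HashTable *seen) {
  if (GetWebPage(currpage)) {
    parseHTML(currpage, frontier, seen, currpage->depth + 1);
  }
}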
Example #2
void CrawlPage(WebPage webpage){
	char *nexturl = NULL;
	int lastpos = 0;
	int depth = webpage.depth + 1;

	if(depth > maxWebPageDepth) return;

	printf("\n\n[crawler]: Crawling - %s\n\n", webpage.url);

	while((lastpos = GetNextURL(webpage.html, lastpos, webpage.url, &nexturl)) > 0){
		NormalizeURL(nexturl);
		if(!CheckURL(nexturl)){
			// setup new page
			struct WebPage *newwebpage = calloc(1, sizeof(WebPage));
			newwebpage->url = calloc(strlen(nexturl) + 1, sizeof(char));
			strcpy(newwebpage->url, nexturl);
			newwebpage->depth = depth;

			// get new webpage
			if(GetWebPage(newwebpage)){
				if(HashTableInsert(nexturl)){           // If not found in hash table, add to hash table
					printf("[crawler]: Parser found new link - %s\n", nexturl);
					struct ListNode *listentry = calloc(1, sizeof(ListNode));
					listentry->page = newwebpage;       // then add to list
					WebPageList->tail = InsertNode(WebPageList->tail, listentry);

					WriteFile(*newwebpage, filenum++);  // then write file
				} else {
					CleanUpPage(newwebpage);
				}
			} else {
				CleanUpPage(newwebpage);                // fetch failed: release the page
			}
		}
		free(nexturl);
		nexturl = NULL;
		// Sleep between fetches
		sleep(INTERVAL_PER_FETCH);
	}
}
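CheckURL is called above but not defined in this example. Judging by the call site (a zero return lets the URL through) and by the domain checks in the other examples, it is most likely a URL_PREFIX filter; the body below is an assumption, not the original helper.

// Assumed shape of CheckURL: returns 0 when the URL is inside the crawl
// domain (URL_PREFIX) and non-zero otherwise, matching if(!CheckURL(nexturl))
// above. The real helper may do more (length checks, scheme checks, etc.).
int CheckURL(char *nexturl) {
	return strncmp(nexturl, URL_PREFIX, strlen(URL_PREFIX)) != 0;
}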
Example #3
int main(int argc, char* argv[])
{
    
    //Check for the number of arguments
    if(argc != 4){
        printf("Invalid Input Argument\n");
        printHelp();
        exit(1);
    }
   
    //directory file path
    int dirSize = strlen(argv[2]);
    char dir[dirSize + 1];
    dir[0] = '\0';
    strcat(dir, argv[2]);

    int urlSize = strlen(argv[1]);
    char inputURL[urlSize + 1];
    inputURL[0] = '\0';
    strcat(inputURL, argv[1]);

    //Get the max depth number.
    int inputDepth = atoi(argv[3]);

    //Check if correct depth is provided.
    if(inputDepth > 4 || inputDepth < 0){
        printf("Invalid [depth]\n");
        printHelp();
        exit(1);
    }
    //Check for URL validity 
    if(!strstr(inputURL, URL_PREFIX)){
        printf("Invalid input [seed url]\n");
        printHelp();
        exit(1);
    }
    //check for directory location validity
    DIR* directory = opendir(dir);
    if(directory){
        closedir(directory);
    }
    else if(ENOENT == errno){
        printf("Directory does not exist\n");
        printHelp();
        exit(1);
    }
    else{
        printf("Directory can't be opened\n");
        printHelp();
        exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    WebPage* seedWebPage = calloc(1, sizeof(WebPage));//Memory allocation for seed webpage
    seedWebPage->url = calloc((strlen(inputURL) + 1), sizeof(char));//Memory allocation to the seedURL
    seedWebPage->url[0] = '\0';
    strcat(seedWebPage->url, inputURL);
    seedWebPage->depth = 0;
    seedWebPage->html = NULL;
    
    //Initialize data structures
    HashTable* visitedURLHash = initHashTable();
    List* webPageList = initializeList();
    webPageList->head->page = seedWebPage;  
 
    //get seed webpage.
    if(GetWebPage(seedWebPage)){	
        // write seed file
        FILE *fPointer;
        char* pathVar1 = pathToDirectory(dir, fileNumber);
        fPointer = fopen(pathVar1, "w");
        free(pathVar1);	
        writeHTMLtoFile(fPointer, seedWebPage);
        //free(fPointer);
        
        if(inputDepth == 0){
            curl_global_cleanup();
            free(seedWebPage->html);
            free(seedWebPage->url);
            free(seedWebPage);

            //free webPageList and hashtable
            free(webPageList);
            for(int i = 0; i < MAX_HASH_SLOT; i++){
                free(visitedURLHash->table[i]->url);
                free(visitedURLHash->table[i]);
            }
            free(visitedURLHash);
            return 0;
        }   
        fileNumber += 1;
        depth += 1;
        HashTableInsert(visitedURLHash, seedWebPage->url); //mark as visited
        
        // extract urls from seed page
        char * result;
        int pos = 0;
        while((pos = GetNextURL(seedWebPage->html, pos, seedWebPage->url, &result))>0){

            if(NormalizeURL(result) && strstr(result,URL_PREFIX)){
                strtok(result, "#");
                //If not in hashtable, add it to the hashtable and add it to the webPageList.
                if(HashTableLookup(visitedURLHash, result) == 0){
                    HashTableInsert(visitedURLHash, result);
                    AppendList(webPageList, webPageInit(result, depth));
                }
            }
            //GetNextURL allocates result, so free it on every iteration
            free(result);
        }
        if(webPageList->head->next->next == NULL){  //seed redirect case
            webPageList->head->next->page->depth = 0;
            fileNumber = 1;		
        }
        tempWebPage = PopList(webPageList); // Get rid of visited seedPage
    }
    else{	
        curl_global_cleanup();
        tempWebPage = PopList(webPageList);
        free(seedWebPage->html);
        free(seedWebPage->url);
        free(seedWebPage);
        //free(tempWebPage);
        free(webPageList);
        for(int i = 0; i < MAX_HASH_SLOT; i++){
            free(visitedURLHash->table[i]->url);
            free(visitedURLHash->table[i]);
        }
        free(visitedURLHash);
        exit(1);
    }

    
    //while there are urls to crawl
    while(webPageList->head != NULL && webPageList->tail != NULL){
        // get webpage for url
        tempWebPage = PopList(webPageList);
        if(GetWebPage(tempWebPage)){ 
            // write page file
            char* pathVar = pathToDirectory(dir, fileNumber);
            FILE *fPointer = fopen(pathVar, "w");
            free(pathVar);
            printf("Found link: %s\n",tempWebPage->url);
            writeHTMLtoFile(fPointer, tempWebPage);
            fileNumber += 1;
                
            if((tempWebPage->depth + 1) <= inputDepth ){
                char * resultTemp;
                int posTemp = 0;
                while((posTemp = GetNextURL(tempWebPage->html, posTemp, tempWebPage->url, &resultTemp))>0){
                    
                    if( NormalizeURL(resultTemp) && strstr(resultTemp,URL_PREFIX) ){
                        strtok(resultTemp, "#");
                        //insert to the hashtable and the webPageList if not already present
                        if(HashTableLookup(visitedURLHash, resultTemp) == 0){
                            HashTableInsert(visitedURLHash, resultTemp);
                            AppendList(webPageList, webPageInit(resultTemp, tempWebPage->depth+1));
                        }
                    }
                    free(resultTemp);
                }
            }
        
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        else{
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        sleep(INTERVAL_PER_FETCH);
    }
    // cleanup curl
    curl_global_cleanup();
    free(seedWebPage->url);
    free(seedWebPage->html);
    free(seedWebPage);
    free(webPageList);

    //free the hashtable
    for(int i = 0; i < MAX_HASH_SLOT; i++){
        if(visitedURLHash->table[i]->url != NULL){
            HashTableNode* currNode = visitedURLHash->table[i];
            while(currNode->next != NULL){
                HashTableNode* tempNode = currNode;
                currNode = currNode->next;
                free(tempNode->url);
                free(tempNode);
            }
            free(currNode->url);
            free(currNode);		
        }
        else{	
            free(visitedURLHash->table[i]);
        }
    }
    free(visitedURLHash);
    return 0;
}
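pathToDirectory is used above but not shown. From its call sites it returns a heap-allocated "<dir>/<fileNumber>" path that the caller frees, much like the sprintf(filename, "%s/%d", ...) pattern in the next example; the implementation below is a sketch under that assumption.

// Possible implementation of pathToDirectory, inferred from its call sites
// above (the real one is not shown). Returns a heap string "<dir>/<fileNumber>"
// that the caller is expected to free.
char* pathToDirectory(char* dir, int fileNumber) {
    char* path = calloc(strlen(dir) + 16, sizeof(char)); // room for "/" + number
    if (path != NULL) {
        sprintf(path, "%s/%d", dir, fileNumber);
    }
    return path;
}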
Example #4
// Function to crawl a given webpage for links.
int CrawlPage(WebPage *wp) {

	char *result;      // variable to hold the url.
	int pos = 0;       // position in each html page.
	WebPage *newPage;  // New webpage.

	// Check that the depth does not exceed the depth passed.
	if (wp->depth >= depth) {
		return 0;
	}

	printf("\n");
	printf("[crawler]: Crawling - %s\n", wp->url); // Print the url being curled.
	printf("\n");

	// Loop through each html page to get all its urls.
	while ((pos = GetNextURL(wp->html, pos, wp->url, &result)) >= 0) {

		// Check that the url has the proper domain (old-www).
		if (strncmp(result, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
			free(result);
			continue;
		}

		// Normalize the url.
		if (!NormalizeURL(result)) {
			free(result);
			continue;
		}

		// Check that the url isn't already in the hash table.
		if (!InHashTable(result)) {
			AddToHashTable(result); // Add the url to the hash table.

			// Setup new page for each url.
			newPage = calloc(1, sizeof(WebPage));
			newPage->depth = wp->depth + 1;
			newPage->url = (char *)malloc(strlen(result) + 1);
			if (!newPage->url) { // Check that memory was allocated.
				free(newPage);
				free(result);
				continue;
			}
			strcpy(newPage->url, result);

			// Get html for each url.
			if (!GetWebPage(newPage)) {
				FreeWebMemory(newPage);
				free(result);
				continue;
			}

			printf("[crawler]: Parser found link - %s\n", result);

			// Add to the list of webpages to be visited.
			if (!AppendList(newPage)) {
				free(result);
				return 0;
			}
		}
		free(result);
	}
	return 1;
}
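FreeWebMemory is called above when a fetch fails but is not shown. The sketch below is an assumption about its shape, based on how the WebPage struct is filled in this example: it releases whatever CrawlPage allocated for one page.

// Assumed shape of FreeWebMemory (the real helper is not shown here).
void FreeWebMemory(WebPage *page) {
	if (page != NULL) {
		free(page->url);   // allocated in CrawlPage
		free(page->html);  // allocated by GetWebPage, may still be NULL
		free(page);
	}
}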
// main crawler function
int main(int argc, char* argv[]) {

    // local variables
    FILE *fp; // file pointer for html files
    char *nextURL; // pointer to the next URL found on the seed page
    char *newURL; // pointer to the next URL in the while loop

    // check command line arguments
    if (argc != 4) {
        printf("Incorrect number of arguments provided.");
        exit(1);
    }
    // check that the second argument is a directory
    if (stat(argv[2], &statbuffer) != 0 || !S_ISDIR(statbuffer.st_mode)) {
        printf("Error, you did not supply a valid directory\n");
        exit(1);
    }

    // get arguments
    char *seedURL = argv[1];
    int filename_len = strlen(argv[2])+21;

    // get the directory
    char*filename = calloc(filename_len,sizeof(char));

    // check the maxDepth
    int value = is_numeric(argv[3]);
    if (value != 0) {
        sscanf(argv[3],"%i",&maxDepth);
    }
    else {
        printf("Error! maxDepth must be a number");
        exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // initialize data structures/variables

    // initialize hashtable
    HashTable *table = malloc(sizeof(HashTable));
    memset(table, 0, sizeof(HashTable));

    // initialize linked list
    List *WebPageList;
    WebPageList = createList();

    // setup seed page

    // get seed webpage
    // if it fails, report and exit
    if (NormalizeURL(seedURL) == 0) {
        printf("Error, bad URL");
        exit(1);
    }
    // write seed file

    // create WebPage object by allocating memory
    WebPage *seedPage = malloc(sizeof(WebPage));

    // assign values to each part of the struct
    seedPage->url = seedURL;
    seedPage->html = NULL;
    seedPage->html_len = 0;
    seedPage->depth = 0;

    // try to get the webpage up to MAX_TRY times
    if (!GetWebPage(seedPage)) {
        for (tries = 0; tries < MAX_TRY; tries++) {
            if (GetWebPage(seedPage)) {
                break;
            }
        }
    }

    // write html contents to a file "1" in the given directory
    sprintf(filename,"%s/%d",argv[2],1);
    fp = fopen(filename,"w");
    fputs(seedURL,fp);
    fputs("\n",fp);
    fprintf(fp,"%d\n",seedPage->depth);
    fputs(seedPage->html,fp);

    // close the file and wipe the filename
    fclose(fp);
    memset(filename,'\0',filename_len);

    // add seed page to hashtable
    add(table,seedURL);

    // extract urls from seed page

    // while there are still URLs in the seed page's html
    while ((pos = GetNextURL(seedPage->html,pos,seedPage->url,&nextURL)) > 0) {

        // only visiting them if it wouldn't exceed maxDepth
        if ((seedPage->depth+1) > maxDepth) {
            free(seedPage);
            exit(1);
        }

        // ensure it's a valid url
        if (NormalizeURL(nextURL) != 0) {

            // also check if its in the right domain
            if (strncmp(URL_PREFIX,nextURL,strlen(URL_PREFIX)) == 0) {

                // if it is added to the hashtable it is a unique URL that
                // hasn't been visited before, add it to the linked list
                // of URLs to visit
                if (add(table,nextURL)) {
                    // create a new webpage object
                    WebPage *pages = malloc(sizeof(WebPage));
                    pages->url = nextURL;
                    pages->html = NULL;
                    pages->html_len = 0;
                    pages->depth = 1;

                    // try to get the webpage up until the MAX_TRY
                    tries = 0;
                    if (!GetWebPage(pages)) {
                        for (tries = 0; tries < MAX_TRY; tries++) {
                            if (GetWebPage(pages)) {
                                break;
                            }
                        }
                    }

                    // add it to the linked list
                    addToEnd(WebPageList,pages);
                }
            }
        }
    }

    // while there are urls to crawl
    while (WebPageList->head != NULL) {
        // get next url from list
        WebPage *nextPage = removeFromFront(WebPageList);

        // try to get the webpage up until the MAX_TRY
        tries = 0;
        if (!GetWebPage(nextPage)) {
            for (tries = 0; tries < MAX_TRY; tries++) {
                if (GetWebPage(nextPage)) {
                    break;
                }
            }
        }

        // write page file
        sprintf(filename,"%s/%d",argv[2],docNum);
        fp = fopen(filename,"w");
        fputs(nextPage->url,fp);
        fputs("\n",fp);
        fprintf(fp,"%d\n",nextPage->depth);
        fputs(nextPage->html,fp);

        // close the file and wipe the filename (to be used next time)
        fclose(fp);
        memset(filename,'\0',filename_len);

        // increment the doc num
        docNum++;

        // check if visiting the URLs on this page will exceed maxDepth
        if ((nextPage->depth+1) > maxDepth) {
            free(nextPage->html);
            free(nextPage);
            continue;
        }
        pos = 0;
        // iterate through all the URLs on the page
        while ((pos = GetNextURL(nextPage->html,pos,nextPage->url,&newURL))>0) {
            // check to ensure that the URLs are the proper format
            if (NormalizeURL(newURL) != 0 ) {
                // check to ensure that they are in the right domain
                if (strncmp(URL_PREFIX,newURL,strlen(URL_PREFIX)) == 0) {
                    // making sure to only add new ones to the list
                    if (add(table,newURL) != 0) {
                        // create a new WebPage object
                        WebPage *page = malloc(sizeof(WebPage));
                        page->url = newURL;
                        page->html = NULL;
                        page->html_len = 0;
                        page->depth = nextPage->depth + 1;

                        // try to get the webpage up until the MAX_TRY
                        tries = 0;
                        if (!GetWebPage(page)) {
                            for (tries = 0; tries < MAX_TRY; tries++) {
                                if (GetWebPage(page)) {
                                    break;
                                }
                            }
                        }

                        // add the page to the linked list
                        addToEnd(WebPageList,page);
                    }
                }
            }
        }
        // Sleep for a bit to avoid annoying the target
        sleep(INTERVAL_PER_FETCH);

        // Free resources
        free(nextPage->html);
        free(nextPage);

    }

    // cleanup curl
    curl_global_cleanup();

    // free resources
    // free hashtable
    for (hash = 0; hash < MAX_HASH_SLOT; hash++) {
        HashTableNode *freer = table->table[hash];
        HashTableNode *tempHash = NULL;
        while (freer != NULL) {
            tempHash = freer;
            freer = freer->next;
            free(tempHash);
        }
    }
    free(table);

    // free linked list
    free(WebPageList);

    // free WebPage and filename pointer
    free(seedPage);
    free(filename);
    return 0;
}
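The try-up-to-MAX_TRY fetch block appears three times in this example. A small helper in the same style could factor it out; this is a refactoring sketch, not code from the original crawler.

// Refactoring sketch (not in the original): fetch a page, retrying up to
// MAX_TRY additional times. Returns non-zero once GetWebPage succeeds,
// 0 if every attempt fails.
int GetWebPageWithRetry(WebPage *page) {
    int tries;
    if (GetWebPage(page)) {
        return 1;
    }
    for (tries = 0; tries < MAX_TRY; tries++) {
        if (GetWebPage(page)) {
            return 1;
        }
    }
    return 0;
}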
Example #6
int GetNextURL(char *html, char *urlofthispage, char *result, int pos) {
  char c;
  int len, i, j;
  char *p1;  //!< pointer to the start of a newly found URL.
  char *p2;  //!< pointer to the end of a newly found URL.

  // NEW
  // Clean up \n chars
  if(pos == 0) {
    removeWhiteSpace(html);
  }
  // /NEW

  // Find the <a> <A> HTML tag.
  while (0 != (c = html[pos])) {
    if ((c=='<') &&
        ((html[pos+1] == 'a') || (html[pos+1] == 'A'))) {
      break;
    }
    pos++;
  }
  //! Find the URL in the HTML tag. They usually look like <a href="www.cs.dartmouth.edu">.
  //! We try to find the quote marks in order to find the URL inside them.
  if (c) {
    
    // Added by Matt Mukerjee
    // check for equals first... some HTML tags don't have quotes...or use single quotes instead
    p1 = strchr(&(html[pos+1]), '=');
    
    if ((!p1) || (*(p1-1) == 'e') || ((p1 - html - pos) > 10)) {
      // keep going...
      return GetNextURL(html,urlofthispage,result,pos+1);
    }
    if (*(p1+1) == '\"' || *(p1+1) == '\'')
      p1++;
    // added by Matt Mukerjee
    p1++;    
    // added by Matt Mukerjee
    p2 = strpbrk(p1, "\'\">");
    if (!p2) {
      // Added by Matt Mukerjee
      // keep going...
      return GetNextURL(html,urlofthispage,result,pos+1);
    }
    if (*p1 == '#') { // Why bother returning anything here....recursively keep going...
      // Added by Matt Mukerjee      
      return GetNextURL(html,urlofthispage,result,pos+1);
    }
    if (!strncmp(p1, "mailto:",7))
      return GetNextURL(html, urlofthispage, result, pos+1);
    if (!strncmp(p1, "javascript:",11))   // Added by Xiaochao 
      return GetNextURL(html, urlofthispage, result, pos+1);
    if (!strncmp(p1, "http", 4) || !strncmp(p1, "HTTP", 4)) {
      //! Nice! The URL we found is in absolute path.
      strncpy(result, p1, (p2-p1));
      return  (int)(p2 - html + 1);
    } else {
      //! We found a URL. HTML is a terrible standard, so there are many ways to present a URL.
      if (p1[0] == '.') {
        //! Some URLs are like <a href="../../../a.txt">; I cannot handle these,
        //! so again it is probably good to recursively keep going..
        // NEW
        return GetNextURL(html,urlofthispage,result,pos+1);
        // /NEW
      }
      if (p1[0] == '/') {
        //! this means the URL is an absolute path on the same host
        for (i = 7; i < strlen(urlofthispage); i++)
          if (urlofthispage[i] == '/')
            break;
        strcpy(result, urlofthispage);
        result[i] = 0;
        strncat(result, p1, (p2 - p1));
        return (int)(p2 - html + 1);        
      } else {
        //! the URL is a relative path.
        len = strlen(urlofthispage);
        for (i = (len - 1); i >= 0; i--)
          if (urlofthispage[i] == '/')
            break;
        for (j = (len - 1); j >= 0; j--)
          if (urlofthispage[j] == '.')
              break;
        if (i == (len - 1)) {
          //! urlofthispage is like http://www.cs.dartmouth.edu/
          strcpy(result, urlofthispage);
          result[i + 1] = 0;
          strncat(result, p1, p2 - p1);
          return (int)(p2 - html + 1);
        }
        if ((i <= 6)||(i > j)) {
          //! urlofthispage is like http://www.cs.dartmouth.edu/~abc
          //! or http://www.cs.dartmouth.edu
          strcpy(result, urlofthispage);
          result[len] = '/';
          strncat(result, p1, p2 - p1);
          return (int)(p2 - html + 1);
        }
        strcpy(result, urlofthispage);
        result[i + 1] = 0;
        strncat(result, p1, p2 - p1);
        return (int)(p2 - html + 1);
      }
    }
  }    
  return -1;
}
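Unlike the earlier examples, this GetNextURL variant copies the URL into a caller-supplied buffer and takes the position as its last argument, returning -1 when no more links are found. A minimal caller sketch under that signature (the WebPage fields and the 1000-byte buffer size are assumptions):

// Hypothetical caller for this GetNextURL variant.
void printAllLinks(WebPage *page) {
  char result[1000];
  int pos = 0;
  memset(result, 0, sizeof(result));
  while ((pos = GetNextURL(page->html, page->url, result, pos)) > 0) {
    printf("[crawler]: found link - %s\n", result);
    memset(result, 0, sizeof(result)); // clear the buffer between calls
  }
}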