Example #1
int main(int argc, char* argv[])
{
    // check command line arguments
	
	// Check that there are three input parameters.
	if (argc != 4) {
		printf("Please input three parameters: seed URL, directory, and max depth.\n");
		return 1;
	}
	
	// Check that the seed url has proper domain (old-www).
	if (strncmp(argv[1], URL_PREFIX, strlen(URL_PREFIX)) != 0) {
		printf("The seed URL domain must be old-www.\n");
		return 1;
	}
	
	// Check that the directory already exists.
	struct stat st;
	if (stat(argv[2], &st) != 0 || !S_ISDIR(st.st_mode)) {
		// If the directory does not exist, terminate the program.
		printf("The directory %s cannot be found. Please enter an existing directory.\n", argv[2]);
		return 1;
	}
	
	// Check that the directory path does not have a '/' at the end for ease in writing filenames.
	if (argv[2][strlen(argv[2]) - 1] == '/') {
		printf("Please do not add '/' at the end of the directory path.\n");
		return 1;
	}
	
	// Check the third argument.
	// Loop through each character of the depth argument and check that it is a digit.
	for (size_t i = 0; i < strlen(argv[3]); i++) {
		if (!isdigit((unsigned char)argv[3][i])) {
			printf("Please input a valid number for the depth.\n");
			return 1;
		}
	}
	
	sscanf(argv[3], "%d", &depth); // Store the depth argument as an integer.
	
	// Check that the depth specified does not exceed max depth.
	if (depth > MAX) {
		printf("Search depth cannot exceed MAX depth of 4. Please enter a valid depth.\n");
		return 1;
	}
	
    	// init curl
    	curl_global_init(CURL_GLOBAL_ALL);

    	// setup seed page
	WebPage *page = calloc(1, sizeof(WebPage));
	page->url = (char *)malloc(strlen(argv[1]) + 1);
	MALLOC_CHECK(stderr, page->url); // Check that memory was allocated.
	strcpy(page->url, argv[1]); // Copy the seed url to page->url.

    	// get seed webpage
	if (!GetWebPage(page)) {
		printf("The seed URL is invald. Please enter a valid seed URL.\n");
		FreeWebMemory(page);
		return 1;
	}
	
	// Normalize the seed url.
	if (!NormalizeURL(page->url)) {
		printf("Seed URL cannot be normalized.\n");
		FreeWebMemory(page);
		return 1;
	}
	
    	// write seed file
    	strcpy(path, argv[2]); // Copy the directory path into path for writing page files.
    	WriteFile(page, path, pageID);
	
    	// add seed page to hashtable
    	if (!AddToHashTable(page->url)) {
		FreeWebMemory(page);
    		return 1;
    	}
	
	// Initialize URLList.
	if (!InitList()) {
		FreeHashTable(); // Release the hash table entries added so far.
		FreeWebMemory(page);
		return 1;
	}
	
    	// extract urls from seed page
    	if (!CrawlPage(page)) {
    		FreeHashTable(); // Free all memory dynamically allocated to the hash table.
		FreeWebMemory(page);
    		return 1;
    	}

	// while there are urls to crawl
    	while (URLList.tail != NULL) {

    		// get next webpage from list
    		WebPage *next = PopList();

        	// write page file
        	pageID++;
		if (!WriteFile(next, argv[2], pageID)) { // Check that the WriteFile worked.
			FreeWebMemory(next);
			FreeHashTable(); // Also release the hash table and seed page before exiting.
			FreeWebMemory(page);
			return 1;
		}

        	// extract urls from webpage and then cleanup.
    		CrawlPage(next);
		FreeWebMemory(next);
    	}
    	// Memory cleanup.
	FreeHashTable(); // Free memory dynamically allocated to the hash table.
	FreeWebMemory(page); // Free memory dynamically allocated to the Webpage variable.
	
    	// cleanup curl
    	curl_global_cleanup();
    
    	return 0;
}
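
Example #1 depends on helpers defined elsewhere in the crawler (GetWebPage, NormalizeURL, WriteFile, AddToHashTable, InitList, PopList, CrawlPage, FreeWebMemory, FreeHashTable) and on globals such as depth, path, and pageID. As a rough, non-authoritative sketch of the contract the crawl loop relies on, here is one way a WriteFile with the signature used above might look; the simplified WebPage fields and the page-file layout are assumptions for illustration, not the project's actual definitions.

#include <stdio.h>

/* Assumed (simplified) shape of the WebPage struct used in Example #1;
   the project's real definition likely carries more fields. */
typedef struct WebPage {
	char *url;   /* page URL */
	char *html;  /* page contents filled in by GetWebPage */
} WebPage;

/* Hypothetical WriteFile matching the calls in Example #1: save one crawled
   page as <dir>/<pageID>, returning 1 on success and 0 on failure (the crawl
   loop above treats a false return as fatal). */
int WriteFile(WebPage *page, const char *dir, int pageID)
{
	char filename[1024];
	snprintf(filename, sizeof(filename), "%s/%d", dir, pageID);

	FILE *fp = fopen(filename, "w");
	if (fp == NULL) {
		fprintf(stderr, "Could not open %s for writing.\n", filename);
		return 0;
	}

	/* Assumed page-file layout: URL on the first line, HTML afterwards. */
	fprintf(fp, "%s\n%s\n", page->url, page->html ? page->html : "");
	fclose(fp);
	return 1;
}

/* Small standalone check: write a dummy page to the current directory. */
int main(void)
{
	WebPage page = { "http://old-www.example/index.html", "<html></html>" };
	return WriteFile(&page, ".", 1) ? 0 : 1;
}

In Example #1 this would be called as WriteFile(page, path, pageID) for the seed page and WriteFile(next, argv[2], pageID) inside the crawl loop, with a false return treated as a fatal error.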
Example #2
int main(int argc, char* argv[])
{	
	filenum = 1;
	
    // check command line arguments
	if(argc != 4){		// check number of arguments
		fprintf(stderr,"Error: Number of input argument needs to be exactly 3\n");
		return -1;
	}else if (CheckDirectory(argv[2])){		// check if the directory exists
		return -1;
	}else if(CheckDepth(argv[3])){			// check depth
		return -1;
	}else if(CheckURL(argv[1])){			// check url
		fprintf(stderr,"Error: Invalid URL. Can only crawl URL with URL prefix %s\n",URL_PREFIX);
		return -1;
	}
	
    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
	seedPage = (WebPage*)calloc(1,sizeof(WebPage));
	NormalizeURL(seedURL);
    seedPage->url = (char*)calloc(strlen(seedURL)+1,sizeof(char));
	strcpy(seedPage->url,seedURL);
	seedPage->depth = 0;
	
    // get seed webpage
    if(!GetWebPage(seedPage)){				// clean up and exit if url is invalid
    	fprintf(stderr,"Error: Invalid URL\n");
		CleanUpPage(seedPage);				// free the seed page before exiting
		curl_global_cleanup();
		return -1;
	}

    // write seed file
	WriteFile(*seedPage, filenum++);
	
	// Exit if maxWebPageDepth = 0
	if(maxWebPageDepth == 0){				// clean up and exit if max webpage depth is 0
		printf("\n[crawler]: Crawling - %s\n\n",seedPage->url);
		printf("1 page has been crawled \n\n");
		CleanUpPage(seedPage);	
		CleanUpHash(URLsVisited);	
		curl_global_cleanup();
		return 0;
	}	
	
    // add seed page to hashtable
   	InitialiseHashTable(URLsVisited);
    HashTableInsert(seedURL);

    // add seed node to list
    WebPageList = (List*)calloc(1,sizeof(List));
    struct ListNode* seednode = (ListNode*)calloc(1,sizeof(ListNode));
    seednode->page = seedPage;
    WebPageList->head = seednode;
    WebPageList->tail = seednode;
    
    // extract urls from seed page
    CrawlPage(*seedPage);
    WebPageList->head = RemoveNode(WebPageList->head);
    
    // while there are urls to crawl
   	while(WebPageList->head != NULL && WebPageList->head->page->depth < maxWebPageDepth){
        // get next url from list, get webpage for url, write page file and extract urls from webpage
		CrawlPage(*(WebPageList->head->page));
		WebPageList->head = RemoveNode(WebPageList->head);
	}
	
	// cleanup memory
	CleanUpList(WebPageList);
	CleanUpHash(URLsVisited);

    // cleanup curl
    curl_global_cleanup();
	
	printf("\n\n %d webpages have been crawled\n\n", filenum-1);
    return 0;
}
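
Example #2 likewise assumes helpers (CheckDirectory, CheckDepth, CheckURL, GetWebPage, WriteFile, CrawlPage, RemoveNode, and the cleanup routines) and globals (seedURL, maxWebPageDepth, URLsVisited, WebPageList, filenum, URL_PREFIX) defined elsewhere. The sketch below shows one plausible shape for the three argument checks, matching how main above treats a nonzero return as an error; the prefix string, the depth limit, and the side effects of setting seedURL and maxWebPageDepth are assumptions for illustration, not the actual implementation.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#define URL_PREFIX "http://old-www"	/* assumption: the real prefix lives in a project header */
#define MAX_DEPTH  4				/* assumption: the real limit lives in a project header */

static char *seedURL;			/* assumed to be set from argv[1] by CheckURL */
static int maxWebPageDepth;		/* assumed to be set from argv[3] by CheckDepth */

/* Return nonzero (failure) if the path is not an existing directory,
   mirroring how main above treats a truthy return as an error. */
int CheckDirectory(const char *path)
{
	struct stat st;
	if (stat(path, &st) != 0 || !S_ISDIR(st.st_mode)) {
		fprintf(stderr, "Error: %s is not an existing directory\n", path);
		return 1;
	}
	return 0;
}

/* Return nonzero if the depth argument is not a number in [0, MAX_DEPTH];
   on success, remember it in maxWebPageDepth. */
int CheckDepth(const char *arg)
{
	if (arg[0] == '\0') {
		fprintf(stderr, "Error: depth must be a non-negative integer\n");
		return 1;
	}
	for (size_t i = 0; arg[i] != '\0'; i++) {
		if (!isdigit((unsigned char)arg[i])) {
			fprintf(stderr, "Error: depth must be a non-negative integer\n");
			return 1;
		}
	}
	int depth = atoi(arg);
	if (depth > MAX_DEPTH) {
		fprintf(stderr, "Error: depth cannot exceed %d\n", MAX_DEPTH);
		return 1;
	}
	maxWebPageDepth = depth;
	return 0;
}

/* Return nonzero if the URL does not start with URL_PREFIX; on success,
   remember it as the seed URL (main above prints its own error message). */
int CheckURL(char *url)
{
	if (strncmp(url, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
		return 1;
	}
	seedURL = url;
	return 0;
}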