Example #1
int main(int argc, char* argv[])
{
    
    //Check for the number of arguments
    if(argc != 4){
        printf("Invalid number of input arguments\n");
        printHelp();
        exit(1);
    }
   
    //directory file path
    char dir[strlen(argv[2]) + 1];
    strcpy(dir, argv[2]);

    //seed URL
    char inputURL[strlen(argv[1]) + 1];
    strcpy(inputURL, argv[1]);

    //Get the max depth number.
    int inputDepth = atoi(argv[3]);

    //Check if correct depth is provided.
    if(inputDepth > 4 || inputDepth < 0){
        printf("Invalid [depth]\n");
        printHelp();
        exit(1);
    }
    //Check for URL validity 
    if(!strstr(inputURL,URL_PREFIX)){
        printf("Invalid input [seed url]\n");
        printHelp();
        exit(1);
    }
    //check that the directory location is valid
    DIR* directory = opendir(dir);
    if(directory){
        closedir(directory);
    }
    else if(ENOENT == errno){
        printf("Directory does not exist\n");
        printHelp();
        exit(1);
    }
    else{
        printf("Directory can't be opened\n");
        printHelp();
        exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    WebPage* seedWebPage = calloc(1, sizeof(WebPage)); //Memory allocation for the seed webpage
    seedWebPage->url = calloc(strlen(inputURL) + 1, sizeof(char)); //Memory allocation for the seed URL
    strcpy(seedWebPage->url, inputURL);
    seedWebPage->depth = 0;
    seedWebPage->html = NULL;
    
    //Initialize data structures
    HashTable* visitedURLHash = initHashTable();
    List* webPageList = initializeList();
    webPageList->head->page = seedWebPage;  
 
    //get seed webpage.
    if(GetWebPage(seedWebPage)){	
        // write seed file
        FILE *fPointer;
        char* pathVar1 = pathToDirectory(dir, fileNumber);
        fPointer = fopen(pathVar1, "w");
        free(pathVar1);	
        writeHTMLtoFile(fPointer, seedWebPage);
        //free(fPointer);
        
        if(inputDepth == 0){
            curl_global_cleanup();
            free(seedWebPage->html);
            free(seedWebPage->url);
            free(seedWebPage);

            //free webPageList and hashtable
            free(webPageList);
            for(int i = 0; i < MAX_HASH_SLOT; i++){
                free(visitedURLHash->table[i]->url);
                free(visitedURLHash->table[i]);
            }
            free(visitedURLHash);
            return 0;
        }   
        fileNumber += 1;
        depth += 1;
        HashTableInsert(visitedURLHash, seedWebPage->url); //mark as visited
        
        // extract urls from seed page
        char * result;
        int pos = 0;
        while((pos = GetNextURL(seedWebPage->html, pos, seedWebPage->url, &result))>0){

            if(NormalizeURL(result) && strstr(result,URL_PREFIX)){
                strtok(result, "#");
                //If not in the hashtable, add it to the hashtable and to the webPageList.
                if(HashTableLookup(visitedURLHash, result) == 0){
                    HashTableInsert(visitedURLHash, result);
                    AppendList(webPageList, webPageInit(result, depth));
                }
            }
            free(result);
        }
        if(webPageList->head->next->next == NULL){  //seed redirect case: only one URL was extracted, so treat it as the real seed
            webPageList->head->next->page->depth = 0;
            fileNumber = 1;		
        }
        tempWebPage = PopList(webPageList); // Get rid of visited seedPage
    }
    else{	
        curl_global_cleanup();
        tempWebPage = PopList(webPageList);
        free(seedWebPage->html);
        free(seedWebPage->url);
        free(seedWebPage);
        //free(tempWebPage);
        free(webPageList);
        for(int i = 0; i < MAX_HASH_SLOT; i++){
            free(visitedURLHash->table[i]->url);
            free(visitedURLHash->table[i]);
        }
        free(visitedURLHash);
        exit(1);
    }

    
    //while there are urls to crawl
    while(webPageList->head != NULL && webPageList->tail != NULL){
        // get webpage for url
        tempWebPage = PopList(webPageList);
        if(GetWebPage(tempWebPage)){ 
            // write page file
            char* pathVar = pathToDirectory(dir, fileNumber);
            FILE *fPointer = fopen(pathVar, "w");
            free(pathVar);
            printf("Found link: %s\n",tempWebPage->url);
            writeHTMLtoFile(fPointer, tempWebPage);
            fileNumber += 1;
                
            if((tempWebPage->depth + 1) <= inputDepth ){
                char * resultTemp;
                int posTemp = 0;
                while((posTemp = GetNextURL(tempWebPage->html, posTemp, tempWebPage->url, &resultTemp))>0){
                    
                    if( NormalizeURL(resultTemp) && strstr(resultTemp,URL_PREFIX) ){
                        strtok(resultTemp, "#");
                        //insert to the hashtable and the webPageList if not already present
                        if(HashTableLookup(visitedURLHash, resultTemp) == 0){
                            HashTableInsert(visitedURLHash, resultTemp);
                            AppendList(webPageList, webPageInit(resultTemp, tempWebPage->depth+1));
                        }
                    }
                    free(resultTemp);
                }
            }
        
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        else{
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        sleep(INTERVAL_PER_FETCH);
    }
    // cleanup curl
    curl_global_cleanup();
    free(seedWebPage->url);
    free(seedWebPage->html);
    free(seedWebPage);
    free(webPageList);

    //free the hashtable
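    //Each table slot holds a singly linked chain of HashTableNodes; walk each chain and free the stored url before the node itself.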
    for(int i = 0; i < MAX_HASH_SLOT; i++){
        if(visitedURLHash->table[i]->url != NULL){
            HashTableNode* currNode = visitedURLHash->table[i];
            while(currNode->next != NULL){
                HashTableNode* tempNode = currNode;
                currNode = currNode->next;
                free(tempNode->url);
                free(tempNode);
            }
            free(currNode->url);
            free(currNode);		
        }
        else{	
            free(visitedURLHash->table[i]);
        }
    }
    free(visitedURLHash);
    return 0;
}
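
Example #1 relies on globals (fileNumber, depth, tempWebPage) and helper routines defined elsewhere. The sketch below reconstructs two of those helpers, pathToDirectory and webPageInit, purely as an illustration of how the calls above are assumed to behave: output files are named by number inside the target directory, and webPageInit copies the URL it is given. The WebPage layout is likewise inferred from usage, not taken from the original source.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Assumed layout of WebPage, inferred from how Example #1 uses it.
typedef struct WebPage {
    char* url;   // page URL, owned by the struct
    char* html;  // HTML contents, filled in later by GetWebPage
    int depth;   // crawl depth at which the page was found
} WebPage;

// Hypothetical helper: build "<dir>/<fileNumber>" on the heap; the caller frees it.
char* pathToDirectory(char* dir, int fileNumber)
{
    size_t len = strlen(dir) + 16;            // room for '/', the digits, and '\0'
    char* path = malloc(len);
    snprintf(path, len, "%s/%d", dir, fileNumber);
    return path;
}

// Hypothetical helper: allocate a WebPage that owns a copy of url at the given depth.
WebPage* webPageInit(char* url, int depth)
{
    WebPage* page = calloc(1, sizeof(WebPage));
    page->url = malloc(strlen(url) + 1);
    strcpy(page->url, url);
    page->depth = depth;
    page->html = NULL;                        // fetched later by GetWebPage
    return page;
}
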
Example #2
int main(int argc, char* argv[])
{
    // check command line arguments
	
	// Check that there are three input parameters.
	if (argc != 4) {
		printf("Please input three parameters: seed URL, directory, and max depth.\n");
		return 1;
	}
	
	// Check that the seed url has proper domain (old-www).
	if (strncmp(argv[1], URL_PREFIX, 15) != 0) {
		printf("The seed URL domain must be old-www.\n");
		return 1;
	}
	
	// Check that the directory already exists.
	struct stat st;
	if (stat(argv[2], &st) != 0 || !S_ISDIR(st.st_mode)) {
		// If the directory does not exist, terminate the program.
		printf("The directory %s cannot be found. Please enter an existing directory.\n", argv[2]);
		return 1;
	}
	
	// Check that the directory path does not have a '/' at the end for ease in writing filenames.
	if (argv[2][strlen(argv[2]) - 1] == '/') {
		printf("Please do not add '/' at the end of the directory path.\n");
		return 1;
	}
	
	// Check the third argument.
	// Loop through each letter of the first argument and check that it is indeed a number.
	for (int i = 0; i < strlen(argv[3]); i++) {
		if (!isdigit((int)argv[3][i])) {
			printf("Please input a valid number for the depth.\n");
			return 1;
		}
	}

	sscanf(argv[3], "%d", &depth); // Store the argument as an integer.
	
	// Check that the depth specified does not exceed max depth.
	if (depth > MAX) {
		printf("Search depth cannot exceed MAX depth of 4. Please enter a valid depth.\n");
		return 1;
	}
	
    	// init curl
    	curl_global_init(CURL_GLOBAL_ALL);

    	// setup seed page
	WebPage *page = calloc(1, sizeof(WebPage));
	page->url = (char *)malloc(strlen(argv[1]) + 1);
	MALLOC_CHECK(stderr, page->url); // Check that memory was allocated.
	strcpy(page->url, argv[1]); // Copy the seed url to page->url.

    	// get seed webpage
	if (!GetWebPage(page)) {
		printf("The seed URL is invald. Please enter a valid seed URL.\n");
		FreeWebMemory(page);
		return 1;
	}
	
	// Normalize the seed url.
	if (!NormalizeURL(page->url)) {
		printf("Seed URL cannot be normalized.\n");
		FreeWebMemory(page);
		return 1;
	}
	
    	// write seed file
    	strcpy(path, argv[2]); // Let var path contain the directory path.
    	WriteFile(page, path, pageID);
	
    	// add seed page to hashtable
    	if (!AddToHashTable(page->url)) {
		FreeWebMemory(page);
    		return 1;
	}
	
	// Initialize URLList.
	if (!InitList()) {
		FreeWebMemory(page);
		return 1;
	}
	
    	// extract urls from seed page
    	if (!CrawlPage(page)) {
    		FreeHashTable(); // Free all memory dynamically allocated to the hash table.
		FreeWebMemory(page);
    		return 1;
    	}

	// while there are urls to crawl
    	while (URLList.tail != NULL) {

    		// get next webpage from list
    		WebPage *next = PopList();

        	// write page file
        	pageID++;
		if (!WriteFile(next, argv[2], pageID)) { // Check that the WriteFile worked.
			FreeWebMemory(next);
			return 1;
		}

        	// extract urls from webpage and then cleanup.
    		CrawlPage(next);
		FreeWebMemory(next);
    	}
    	// Memory cleanup.
	FreeHashTable(); // Free memory dynamically allocated to the hash table.
	FreeWebMemory(page); // Free memory dynamically allocated to the Webpage variable.
	
    	// cleanup curl
    	curl_global_cleanup();
    
    	return 0;
}
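
Example #2 likewise leans on globals (depth, path, pageID, URLList) and helpers such as FreeWebMemory that are declared outside main. The sketch below shows one plausible shape for those pieces, assuming the same url/html/depth WebPage layout as in the sketch after Example #1; the names, initial values, and buffer sizes here are assumptions, not the original definitions.

#include <stdlib.h>

// Assumed WebPage layout (same assumption as in the sketch after Example #1).
typedef struct WebPage {
    char* url;
    char* html;
    int depth;
} WebPage;

// Assumed globals, inferred from how Example #2 uses them.
int depth;                 // maximum crawl depth parsed from argv[3]
int pageID = 1;            // counter used to number output files (initial value is an assumption)
char path[1024];           // copy of the target directory path (buffer size is an assumption)
// URLList is assumed to be a global list struct with head and tail pointers.

// Hypothetical helper: free a WebPage and the buffers GetWebPage attached to it.
void FreeWebMemory(WebPage* page)
{
    if (page == NULL) {
        return;
    }
    free(page->url);
    free(page->html);
    free(page);
}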