int main(int argc, char* argv[])
{
    // Check the number of arguments.
    if (argc != 4) {
        printf("Invalid Input Argument\n");
        printHelp();
        exit(1);
    }

    // Copy the directory path argument.
    int dirSize = strlen(argv[2]);
    char dir[dirSize + 1];
    dir[0] = '\0';
    strcat(dir, argv[2]);

    // Copy the seed URL argument.
    int urlSize = strlen(argv[1]);
    char inputURL[urlSize + 1];
    inputURL[0] = '\0';
    strcat(inputURL, argv[1]);

    // Get the max depth number.
    int inputDepth = atoi(argv[3]);

    // Check that a valid depth was provided.
    if (inputDepth > 4 || inputDepth < 0) {
        printf("Invalid [depth]\n");
        printHelp();
        exit(1);
    }

    // Check for URL validity.
    if (!strstr(inputURL, URL_PREFIX)) {
        printf("Invalid input [seed url]\n");
        printHelp();
        exit(1);
    }

    // Check that the target directory exists and can be opened.
    DIR* directory = opendir(dir);
    if (directory) {
        closedir(directory);
    }
    else if (ENOENT == errno) {
        printf("Directory does not exist\n");
        printHelp();
        exit(1);
    }
    else {
        printf("Directory can't be opened\n");
        printHelp();
        exit(1);
    }

    // Initialize curl.
    curl_global_init(CURL_GLOBAL_ALL);

    // Set up the seed page.
    WebPage* seedWebPage = calloc(1, sizeof(WebPage));              // memory for the seed webpage
    seedWebPage->url = calloc(strlen(inputURL) + 1, sizeof(char));  // memory for the seed URL
    seedWebPage->url[0] = '\0';
    strcat(seedWebPage->url, inputURL);
    seedWebPage->depth = 0;
    seedWebPage->html = NULL;

    // Initialize the data structures.
    HashTable* visitedURLHash = initHashTable();
    List* webPageList = initializeList();
    webPageList->head->page = seedWebPage;

    // Get the seed webpage.
    if (GetWebPage(seedWebPage)) {
        // Write the seed file.
        FILE* fPointer;
        char* pathVar1 = pathToDirectory(dir, fileNumber);
        fPointer = fopen(pathVar1, "w");
        free(pathVar1);
        writeHTMLtoFile(fPointer, seedWebPage);

        if (inputDepth == 0) {
            curl_global_cleanup();
            free(seedWebPage->html);
            free(seedWebPage->url);
            free(seedWebPage);

            // Free webPageList and the hashtable.
            free(webPageList);
            for (int i = 0; i < MAX_HASH_SLOT; i++) {
                free(visitedURLHash->table[i]->url);
                free(visitedURLHash->table[i]);
            }
            free(visitedURLHash);
            return 0;
        }

        fileNumber += 1;
        depth += 1;
        HashTableInsert(visitedURLHash, seedWebPage->url);  // mark the seed URL as visited

        // Extract URLs from the seed page.
        char* result;
        int pos = 0;
        while ((pos = GetNextURL(seedWebPage->html, pos, seedWebPage->url, &result)) > 0) {
            if (NormalizeURL(result) && strstr(result, URL_PREFIX)) {
                strtok(result, "#");
                // If not already in the hashtable, add the URL to the hashtable and to webPageList.
                if (HashTableLookup(visitedURLHash, result) == 0) {
                    HashTableInsert(visitedURLHash, result);
                    AppendList(webPageList, webPageInit(result, depth));
                }
            }
            free(result);  // free the extracted URL each iteration
        }

        if (webPageList->head->next->next == NULL) {  // seed redirect case
            webPageList->head->next->page->depth = 0;
            fileNumber = 1;
        }
        tempWebPage = PopList(webPageList);  // remove the already-visited seed page
    }
    else {
        curl_global_cleanup();
        tempWebPage = PopList(webPageList);
        free(seedWebPage->html);
        free(seedWebPage->url);
        free(seedWebPage);

        // Free webPageList and the hashtable.
        free(webPageList);
        for (int i = 0; i < MAX_HASH_SLOT; i++) {
            free(visitedURLHash->table[i]->url);
            free(visitedURLHash->table[i]);
        }
        free(visitedURLHash);
        exit(1);
    }

    // While there are URLs left to crawl.
    while (webPageList->head != NULL && webPageList->tail != NULL) {
        // Get the webpage for the next URL.
        tempWebPage = PopList(webPageList);
        if (GetWebPage(tempWebPage)) {
            // Write the page file.
            char* pathVar = pathToDirectory(dir, fileNumber);
            FILE* fPointer = fopen(pathVar, "w");
            free(pathVar);
            printf("Found link: %s\n", tempWebPage->url);
            writeHTMLtoFile(fPointer, tempWebPage);
            fileNumber += 1;

            // Extract URLs from this page if it is above the max depth.
            if ((tempWebPage->depth + 1) <= inputDepth) {
                char* resultTemp;
                int posTemp = 0;
                while ((posTemp = GetNextURL(tempWebPage->html, posTemp, tempWebPage->url, &resultTemp)) > 0) {
                    if (NormalizeURL(resultTemp) && strstr(resultTemp, URL_PREFIX)) {
                        strtok(resultTemp, "#");
                        // Insert into the hashtable and webPageList if not already present.
                        if (HashTableLookup(visitedURLHash, resultTemp) == 0) {
                            HashTableInsert(visitedURLHash, resultTemp);
                            AppendList(webPageList, webPageInit(resultTemp, tempWebPage->depth + 1));
                        }
                    }
                    free(resultTemp);
                }
            }
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        else {
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        sleep(INTERVAL_PER_FETCH);
    }

    // Clean up curl and the seed page.
    curl_global_cleanup();
    free(seedWebPage->url);
    free(seedWebPage->html);
    free(seedWebPage);
    free(webPageList);

    // Free the hashtable.
    for (int i = 0; i < MAX_HASH_SLOT; i++) {
        if (visitedURLHash->table[i]->url != NULL) {
            HashTableNode* currNode = visitedURLHash->table[i];
            while (currNode->next != NULL) {
                HashTableNode* tempNode = currNode;
                currNode = currNode->next;
                free(tempNode->url);
                free(tempNode);
            }
            free(currNode->url);
            free(currNode);
        }
        else {
            free(visitedURLHash->table[i]);
        }
    }
    free(visitedURLHash);
    return 0;
}
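/*
 * pathToDirectory() is called above but not defined in this excerpt. A minimal
 * sketch of what it plausibly does, assuming it returns a heap-allocated
 * "<dir>/<fileNumber>" string that the caller frees after fopen (as main() does);
 * the real helper may build the path differently.
 */
static char* pathToDirectorySketch(const char* dir, int fileNumber)
{
    // Room for the directory, a '/', the decimal file number, and the terminator.
    size_t len = strlen(dir) + 32;
    char* path = malloc(len);
    if (path == NULL) {
        return NULL;
    }
    snprintf(path, len, "%s/%d", dir, fileNumber);
    return path;
}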
int main(int argc, char* argv[])
{
    // Check command line arguments.
    // Check that there are three input parameters.
    if (argc != 4) {
        printf("Please input three parameters: seed URL, directory, and max depth.\n");
        return 1;
    }

    // Check that the seed URL has the proper domain (old-www).
    if (strncmp(argv[1], URL_PREFIX, 15) != 0) {
        printf("The seed URL domain must be old-www.\n");
        return 1;
    }

    // Check that the directory already exists.
    struct stat st;
    if (stat(argv[2], &st) != 0 || !S_ISDIR(st.st_mode)) {
        // If the directory does not exist, terminate the program.
        printf("The directory %s cannot be found. Please enter an existing directory.\n", argv[2]);
        return 1;
    }

    // Check that the directory path does not end with '/', for ease in writing filenames.
    if (argv[2][strlen(argv[2]) - 1] == '/') {
        printf("Please do not add '/' at the end of the directory path.\n");
        return 1;
    }

    // Check the third argument.
    // Loop through each character of the depth argument and check that it is indeed a digit.
    for (int i = 0; i < (int)strlen(argv[3]); i++) {
        if (!isdigit((int)argv[3][i])) {
            printf("Please input a valid number for the depth.\n");
            return 1;
        }
    }
    sscanf(argv[3], "%d", &depth);  // Store the argument as an integer.

    // Check that the depth specified does not exceed the max depth.
    if (depth > MAX) {
        printf("Search depth cannot exceed MAX depth of 4. Please enter a valid depth.\n");
        return 1;
    }

    // Initialize curl.
    curl_global_init(CURL_GLOBAL_ALL);

    // Set up the seed page.
    WebPage *page = calloc(1, sizeof(WebPage));
    page->url = (char *)malloc(strlen(argv[1]) + 1);
    MALLOC_CHECK(stderr, page->url);  // Check that memory was allocated.
    strcpy(page->url, argv[1]);       // Copy the seed URL to page->url.

    // Get the seed webpage.
    if (!GetWebPage(page)) {
        printf("The seed URL is invalid. Please enter a valid seed URL.\n");
        FreeWebMemory(page);
        return 1;
    }

    // Normalize the seed URL.
    if (!NormalizeURL(page->url)) {
        printf("Seed URL cannot be normalized.\n");
        FreeWebMemory(page);
        return 1;
    }

    // Write the seed file.
    strcpy(path, argv[2]);  // Let path contain the directory path.
    WriteFile(page, path, pageID);

    // Add the seed page to the hashtable.
    if (!AddToHashTable(page->url)) {
        FreeWebMemory(page);
        return 1;
    }

    // Initialize URLList.
    if (!InitList()) {
        FreeWebMemory(page);
        return 1;
    }

    // Extract URLs from the seed page.
    if (!CrawlPage(page)) {
        FreeHashTable();      // Free all memory dynamically allocated to the hash table.
        FreeWebMemory(page);
        return 1;
    }

    // While there are URLs to crawl.
    while (URLList.tail != NULL) {
        // Get the next webpage from the list.
        WebPage *next = PopList();

        // Write the page file.
        pageID++;
        if (!WriteFile(next, argv[2], pageID)) {  // Check that WriteFile worked.
            FreeWebMemory(next);
            return 1;
        }

        // Extract URLs from the webpage and then clean up.
        CrawlPage(next);
        FreeWebMemory(next);
    }

    // Memory cleanup.
    FreeHashTable();      // Free memory dynamically allocated to the hash table.
    FreeWebMemory(page);  // Free memory dynamically allocated to the WebPage variable.

    // Clean up curl.
    curl_global_cleanup();
    return 0;
}
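/*
 * WriteFile() is used above but not defined in this excerpt. A minimal sketch,
 * assuming it saves each page as "<dir>/<pageID>" with the URL, depth, and HTML
 * on consecutive lines and returns nonzero on success (matching how the caller
 * checks it); the actual helper and file layout may differ.
 */
static int WriteFileSketch(WebPage* page, const char* dir, int pageID)
{
    char filename[256];
    snprintf(filename, sizeof(filename), "%s/%d", dir, pageID);

    FILE* fp = fopen(filename, "w");
    if (fp == NULL) {
        return 0;  // mirror the boolean-style return the caller expects
    }

    // URL on the first line, depth on the second, HTML afterward.
    fprintf(fp, "%s\n%d\n%s\n", page->url, page->depth, page->html);
    fclose(fp);
    return 1;
}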