int main(int argc, char* argv[]) {
    // Check command line arguments.
    // Check that there are three input parameters.
    if (argc != 4) {
        printf("Please input three parameters: seed URL, directory, and max depth.\n");
        return 1;
    }
    // Check that the seed URL has the proper domain (old-www).
    if (strncmp(argv[1], URL_PREFIX, 15) != 0) {
        printf("The seed URL domain must be old-www.\n");
        return 1;
    }
    // Check that the directory already exists; if it does not, terminate the program.
    struct stat st;
    if (!(stat(argv[2], &st) == 0 && S_ISDIR(st.st_mode))) {
        printf("The directory %s cannot be found. Please enter an existing directory.\n", argv[2]);
        return 1;
    }
    // Check that the directory path does not end with '/', for ease in writing filenames.
    if (argv[2][strlen(argv[2]) - 1] == '/') {
        printf("Please do not add '/' at the end of the directory path.\n");
        return 1;
    }
    // Check the third argument: every character of argv[3] must be a digit.
    for (int i = 0; i < strlen(argv[3]); i++) {
        if (!isdigit((int)argv[3][i])) {
            printf("Please input a valid number for the depth.\n");
            return 1;
        }
    }
    sscanf(argv[3], "%d", &depth);  // Store the argument as an integer.
    // Check that the specified depth does not exceed the maximum depth.
    if (depth > MAX) {
        printf("Search depth cannot exceed MAX depth of 4. Please enter a valid depth.\n");
        return 1;
    }

    // Initialize curl.
    curl_global_init(CURL_GLOBAL_ALL);

    // Set up the seed page.
    WebPage *page = calloc(1, sizeof(WebPage));
    page->url = (char *)malloc(strlen(argv[1]) + 1);
    MALLOC_CHECK(stderr, page->url);  // Check that memory was allocated.
    strcpy(page->url, argv[1]);       // Copy the seed URL to page->url.

    // Get the seed webpage.
    if (!GetWebPage(page)) {
        printf("The seed URL is invalid. Please enter a valid seed URL.\n");
        FreeWebMemory(page);
        return 1;
    }
    // Normalize the seed URL.
    if (!NormalizeURL(page->url)) {
        printf("Seed URL cannot be normalized.\n");
        FreeWebMemory(page);
        return 1;
    }

    // Write the seed file.
    strcpy(path, argv[2]);  // Let path contain the directory path.
    WriteFile(page, path, pageID);

    // Add the seed page to the hash table.
    if (!AddToHashTable(page->url)) {
        FreeWebMemory(page);
        return 1;
    }
    // Initialize URLList.
    if (!InitList()) {
        FreeWebMemory(page);
        return 1;
    }
    // Extract URLs from the seed page.
    if (!CrawlPage(page)) {
        FreeHashTable();  // Free all memory dynamically allocated to the hash table.
        FreeWebMemory(page);
        return 1;
    }

    // While there are URLs to crawl...
    while (URLList.tail != NULL) {
        // Get the next webpage from the list.
        WebPage *next = PopList();
        // Write the page file.
        pageID++;
        if (!WriteFile(next, argv[2], pageID)) {  // Check that WriteFile worked.
            FreeWebMemory(next);
            return 1;
        }
        // Extract URLs from the webpage, then clean up.
        CrawlPage(next);
        FreeWebMemory(next);
    }

    // Memory cleanup.
    FreeHashTable();      // Free memory dynamically allocated to the hash table.
    FreeWebMemory(page);  // Free memory dynamically allocated to the WebPage variable.

    // Clean up curl.
    curl_global_cleanup();
    return 0;
}
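// The MALLOC_CHECK macro used above is not defined in this listing. As a
// rough illustration only, a minimal sketch of what such a macro might look
// like (an assumption based on its usage MALLOC_CHECK(stderr, ptr), not the
// actual definition) is:

#include <stdio.h>
#include <stdlib.h>

#define MALLOC_CHECK(stream, ptr)                               \
    do {                                                        \
        if ((ptr) == NULL) {                                    \
            /* Report the allocation failure and abort. */      \
            fprintf((stream), "Out of memory at %s:%d\n",       \
                    __FILE__, __LINE__);                        \
            exit(EXIT_FAILURE);                                 \
        }                                                       \
    } while (0)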
int main(int argc, char* argv[]) {
    filenum = 1;

    // Check command line arguments.
    if (argc != 4) {                       // check number of arguments
        fprintf(stderr, "Error: Number of input arguments must be exactly 3\n");
        return -1;
    } else if (CheckDirectory(argv[2])) {  // check that the directory exists
        return -1;
    } else if (CheckDepth(argv[3])) {      // check depth
        return -1;
    } else if (CheckURL(argv[1])) {        // check URL
        fprintf(stderr, "Error: Invalid URL. Can only crawl URLs with the prefix %s\n", URL_PREFIX);
        return -1;
    }

    // Initialize curl.
    curl_global_init(CURL_GLOBAL_ALL);

    // Set up the seed page.
    seedPage = (WebPage*)calloc(1, sizeof(WebPage));
    NormalizeURL(seedURL);
    seedPage->url = (char*)calloc(strlen(seedURL) + 1, sizeof(char));
    strcpy(seedPage->url, seedURL);
    seedPage->depth = 0;

    // Get the seed webpage.
    if (!GetWebPage(seedPage)) {  // clean up and exit if the URL is invalid
        fprintf(stderr, "Error: Invalid URL\n");
        curl_global_cleanup();
        return -1;
    }

    // Write the seed file.
    WriteFile(*seedPage, filenum++);

    // Exit if maxWebPageDepth is 0.
    if (maxWebPageDepth == 0) {  // clean up and exit if the max webpage depth is 0
        printf("\n[crawler]: Crawling - %s\n\n", seedPage->url);
        printf("1 page has been crawled\n\n");
        CleanUpPage(seedPage);
        CleanUpHash(URLsVisited);
        curl_global_cleanup();
        return 0;
    }

    // Add the seed page to the hash table.
    InitialiseHashTable(URLsVisited);
    HashTableInsert(seedURL);

    // Add the seed node to the list.
    WebPageList = (List*)calloc(1, sizeof(List));
    ListNode* seednode = (ListNode*)calloc(1, sizeof(ListNode));
    seednode->page = seedPage;
    WebPageList->head = seednode;
    WebPageList->tail = seednode;

    // Extract URLs from the seed page.
    CrawlPage(*seedPage);
    WebPageList->head = RemoveNode(WebPageList->head);

    // While there are URLs to crawl: get the next URL from the list, fetch its
    // webpage, write the page file, and extract URLs from the webpage.
    while (WebPageList->head != NULL && WebPageList->head->page->depth < maxWebPageDepth) {
        CrawlPage(*(WebPageList->head->page));
        WebPageList->head = RemoveNode(WebPageList->head);
    }

    // Clean up memory.
    CleanUpList(WebPageList);
    CleanUpHash(URLsVisited);

    // Clean up curl.
    curl_global_cleanup();

    printf("\n\n%d webpages have been crawled\n\n", filenum - 1);
    return 0;
}
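// The CheckDirectory helper called in the argument checks above is not shown
// in this listing. A minimal sketch of what it might look like, assuming it
// returns nonzero on failure (mirroring the inline stat()/S_ISDIR() check in
// the first version), is:

#include <stdio.h>
#include <sys/stat.h>

static int CheckDirectory(const char *dir) {
    struct stat st;
    // The directory must exist and actually be a directory.
    if (stat(dir, &st) != 0 || !S_ISDIR(st.st_mode)) {
        fprintf(stderr, "Error: %s is not an existing directory\n", dir);
        return 1;   // failure: the caller exits
    }
    return 0;       // success
}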