//Reads an inverted index file and recreates the data structure it represents HashTable *ReadFile(char *file) { HashTable *reloadedIndex = calloc(1, sizeof(HashTable)); //allocate new index FILE *fp; fp = fopen(file, "r"); //open the input file char *line = calloc(100000, sizeof(char)); //read the file line by line, parsing each line for the word, docids and freqs while (fgets(line, INT_MAX, fp) != NULL) { line = strtok(line, "\n"); char *token; char *word = calloc(100, sizeof(char)); token = strtok(line, " "); strcpy(word, token); token = strtok(NULL, " "); token = strtok(NULL, " "); //once word has been parsed and doccount has been skipped, start reading the docids and freqs until the end of the line while (token != NULL) { int doc_id = atoi(token); token = strtok(NULL, " "); int freq = atoi(token); int i = 0; //increment the frequency as many times as needed while (i < freq) { if (InHashTable(word, reloadedIndex) == 0) { AddToHashTable(word, reloadedIndex); UpdateHashTable(word, doc_id, reloadedIndex); } else { UpdateHashTable(word, doc_id, reloadedIndex); } i++; } token = strtok(NULL, " "); } free(token); } free(line); fclose(fp); return reloadedIndex; }
/**
This function initializes the Elf members: it allocates the ELF header,
section headers, dynamic-symbol table, version tables, hash table and
program headers sized from iNSymbols, then walks iSymbolsList filling the
symbol entries, the symbol-name string table, the version table and the
hash table, and finally builds the remaining ELF structures.
@internalComponent
@released
*/
void ElfProducer::InitElfContents() {
	// Allocate fixed-size ELF structures plus the per-symbol tables.
	iElfHeader = new Elf32_Ehdr;
	iSections = new Elf32_Shdr[MAX_SECTIONS+1];
	iElfDynSym = new Elf32_Sym[iNSymbols];
	iVersionTbl = new Elf32_Half[iNSymbols];
	iVersionDef = new Elf32_Verdef[2];
	iDSODaux = new Elf32_Verdaux[2];
	iProgHeader = new Elf32_Phdr[2];
	iCodeSectionData = new PLUINT32[iNSymbols];

	iHashTbl = new Elf32_HashTable;

	//premeditated
	// NOTE(review): bucket count is roughly iNSymbols/3, nudged by the
	// remainder term (iNSymbols % 0x3) -- presumably so small symbol counts
	// still get at least one bucket; confirm the intended sizing policy.
	iHashTbl->nBuckets = (iNSymbols /3) + (iNSymbols % 0x3);
	iHashTbl->nChains = iNSymbols;

	iDSOBuckets = new Elf32_Sword[iHashTbl->nBuckets];
	iDSOChains = new Elf32_Sword[iHashTbl->nChains];

	Elf32_Sword aNullPtr = 0;

	// Zero-fill the hash buckets/chains and the code-section data.
	// (memset's fill value is an int; aNullPtr is 0 here.)
	memset(iDSOBuckets, aNullPtr, sizeof(Elf32_Sword)*iHashTbl->nBuckets);
	memset(iDSOChains, aNullPtr, sizeof(Elf32_Sword)*iHashTbl->nChains);
	memset(iCodeSectionData, 0, sizeof(PLUINT32)*iNSymbols);

	CreateElfHeader();

	SymbolList::iterator aItr = iSymbolsList->begin();
	SymbolList::iterator aEnd = iSymbolsList->end();
	Symbol *aSym;
	PLUINT32 aIdx = 1;	// index 0 of the dynamic-symbol table is the null entry

	// Write the null entry: zeroed symbol 0 and a leading NUL in the
	// symbol-name string table.
	memset( &iElfDynSym[0], 0, sizeof(Elf32_Sym));
	iDSOSymNameStrTbl.insert(iDSOSymNameStrTbl.end(), 0);

	// One pass over the symbol list: fill each symbol entry, append its
	// NUL-terminated name to the string table, mark the default version
	// and add the symbol to the ELF hash table.
	while(aItr != aEnd) {
		String aSymName("");
		aSym = *aItr;
		aSymName = aSym->SymbolName();

		//set symbol info..
		// st_name is the offset of this name within the string table.
		iElfDynSym[aIdx].st_name = iDSOSymNameStrTbl.size();

		iDSOSymNameStrTbl.insert(iDSOSymNameStrTbl.end(), aSymName.begin(), aSymName.end() );
		iDSOSymNameStrTbl.insert(iDSOSymNameStrTbl.end(), 0);

		SetSymolFields( aSym, &iElfDynSym[aIdx], aIdx);

		//set version table info...
		iVersionTbl[aIdx] = DEFAULT_VERSION;
		AddToHashTable(aSym->SymbolName(), aIdx);

		aItr++;aIdx++;
	}

	CreateVersionTable();

	//Fill section headers...
	CreateSections();

	//Copy dyn entries..
	CreateDynamicEntries();

	//create code section data - this has the ordinal numbers...
	CreateProgHeader();
}
int main(int argc, char* argv[]) { // check command line arguments // Check that there are three input parameters. if (argc != 4) { printf("Please input three parameters: seed URL, directory, and max depth.\n"); return 1; } // Check that the seed url has proper domain (old-www). if (strncmp(argv[1], URL_PREFIX, 15) != 0) { printf("The seed URL domain must be old-www.\n"); return 1; } // Check that the directory already exists. struct stat st; if (stat(argv[2], &st) == 0 && S_ISDIR(st.st_mode)); else { // If the directory does not exist, terminate the program. printf("The directory %s cannot be found. Please enter an existing directory.\n", argv[2]); return 1; } // Check that the directory path does not have a '/' at the end for ease in writing filenames. if (argv[2][strlen(argv[2]) - 1] == '/') { printf("Please do not add '/' at the end of the directory path.\n"); return 1; } // Check the third argument. // Loop through each letter of the first argument and check that it is indeed a number. for (int i = 0; i < strlen(argv[3]); i++) { if (!isdigit((int)argv[3][i])) { printf("Please input a valid number for the depth.\n"); return 1; } } sscanf(argv[3], "%d", &depth); // Store the argument as an integer. // Check that the depth specified does not exceed max depth. if (depth > MAX) { printf("Search depth cannot exceed MAX depth of 4. Please enter a valid depth.\n"); return 1; } // init curl curl_global_init(CURL_GLOBAL_ALL); // setup seed page WebPage *page = calloc(1, sizeof(WebPage)); page->url = (char *)malloc(strlen(argv[1]) + 1); MALLOC_CHECK(stderr, page->url); // Check that memory was allocated. strcpy(page->url, argv[1]); // Copy the seed url to page->url. // get seed webpage if (!GetWebPage(page)) { printf("The seed URL is invald. Please enter a valid seed URL.\n"); FreeWebMemory(page); return 1; } // Normalize the seed url. 
if (!NormalizeURL(page->url)) { printf("Seed URL cannot be normalized.\n"); FreeWebMemory(page); return 1; } // write seed file strcpy(path, argv[2]); // Let var path contain the directory path. WriteFile(page, path, pageID); // add seed page to hashtable if (!AddToHashTable(page->url)) { FreeWebMemory(page); return 1; } // Initialize URLList. if (!InitList()) { FreeWebMemory(page); return 1; } // extract urls from seed page if (!CrawlPage(page)) { FreeHashTable(); // Free all memory dynamically allocated to the hash table. FreeWebMemory(page); return 1; } // while there are urls to crawl while (URLList.tail != NULL) { // get next webpage from list WebPage *next = PopList(); // write page file pageID++; if (!WriteFile(next, argv[2], pageID)) { // Check that the WriteFile worked. FreeWebMemory(next); return 1; } // extract urls from webpage and then cleanup. CrawlPage(next); FreeWebMemory(next); } // Memory cleanup. FreeHashTable(); // Free memory dynamically allocated to the hash table. FreeWebMemory(page); // Free memory dynamically allocated to the Webpage variable. // cleanup curl curl_global_cleanup(); return 0; }
// Function to crawl a given webpage for links. int CrawlPage(WebPage *wp) { char *result; // variable to hold the url. int pos = 0; // position in each html page. WebPage *newPage; // New webpage. // Check that the depth does not exceed the depth passed. if (wp->depth >= depth) { return 0; } printf("\n"); printf("[crawler]: Crawling - %s\n", wp->url); // Print the url being curled. printf("\n"); // Loop through each html page to get all its urls. while ((pos = GetNextURL(wp->html, pos, wp->url, &result)) >= 0) { // Check that the url has proper domain (old-www). if (strncmp(result, URL_PREFIX, strlen(URL_PREFIX)) != 0) { free(result); continue; } // Normalize the url. if (!NormalizeURL(result)) { free(result); continue; } // Check that the url isn't already in the hash table. if (!InHashTable(result)) { AddToHashTable(result); // Add the url to the hash table. // Setup new page for each url. newPage = calloc(1, sizeof(WebPage)); newPage->depth = wp->depth + 1; newPage->url = (char *)malloc(strlen(result) + 1); if (!newPage->url) { // Check that memory was allocated. continue; } strcpy(newPage->url, result); // Get html for each url. if (!GetWebPage(newPage)) { FreeWebMemory(newPage); free(result); continue; } printf("[crawler]: Parser found link - %s\n", result); // Add to the list of webpages to be visited. if (!AppendList(newPage)) { free(result); return 0; } } free(result); } return 1; }
int main(int argc, char* argv[]) { //check argument number if (argc < 3 || argc > 4) { printf("too many or too little arguments, please try again"); exit(0); } //check directory validity if (!IsDir(argv[1])) { printf("invalid directory, please try again"); exit(0); } //Initialize variables and index int docId; int pos; char *doc; char **filenames = NULL; int num_files = 0; HashTable *WordsFound = calloc(1, sizeof(HashTable)); num_files = GetFilenamesInDir(argv[1], &filenames); //check whether the folder has files if (num_files < 0) { printf("failed to get any filenames"); exit(0); } //iterate through each file in the directory for (int i = 0; i < num_files; i++) { //check that the file is in the correct format (title is a number) int filechecker = 0; for (int c = 0; c < strlen(filenames[i]); c++) { if (!isdigit(filenames[i][c])) { filechecker = 1; } } if (filechecker == 1) { continue; } //Load the document char *word; char file[100]; strcpy(file, argv[1]); strcat(file, filenames[i]); doc = LoadDocument(file); docId = GetDocumentId(filenames[i]); free(filenames[i]); pos = 0; //Iterate through each word in the html file (doc) while ((pos = GetNextWord(doc, pos, &word)) > 0) { NormalizeWord(word); if (InHashTable(word, WordsFound) == 0) { AddToHashTable(word, WordsFound); UpdateHashTable(word, docId, WordsFound); } else { UpdateHashTable(word, docId, WordsFound); free(word); } } free(doc); } free(filenames); SaveIndexToFile(argv[2], WordsFound); //Save the index to the file specified FreeHashTable(WordsFound); //only proceed if there was a third argument specified. If so, reload the index form the file you just created if (argc == 4) { HashTable *ReloadedIndex = ReadFile(argv[2]); SaveIndexToFile(argv[3], ReloadedIndex); FreeHashTable(ReloadedIndex); } return 0; }