Example #1
//Reads an inverted index file and recreates the data structure it represents
HashTable *ReadFile(char *file) {
	HashTable *reloadedIndex = calloc(1, sizeof(HashTable));			//allocate new index
	FILE *fp;

	fp = fopen(file, "r");								//open the input file
	if (fp == NULL) {								//bail out if the file cannot be opened
		free(reloadedIndex);
		return NULL;
	}
	char *line = calloc(100000, sizeof(char));

	//read the file line by line, parsing each line for the word, docids and freqs
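	//each line of the index file is assumed (format inferred from the parsing below) to look like:
	//  word doccount docid freq [docid freq ...]
	//e.g. "dog 2 3 5 7 1" means "dog" occurs 5 times in document 3 and once in document 7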
	while (fgets(line, 100000, fp) != NULL) {
		strtok(line, "\n");							//strip the trailing newline in place
		char *token;
		char *word = calloc(100, sizeof(char));	
		token = strtok(line, " ");						//first token is the word
		if (token == NULL) {							//skip blank lines
			free(word);
			continue;
		}
		strcpy(word, token);
		token = strtok(NULL, " ");						//skip the doccount
		token = strtok(NULL, " ");						//first docid
		
		//once word has been parsed and doccount has been skipped, start reading the docids and freqs until the end of the line
		while (token != NULL) {
			int doc_id = atoi(token);
			token = strtok(NULL, " ");
			if (token == NULL) {						//malformed line: docid with no freq
				break;
			}
			int freq = atoi(token);
			int i = 0;
			
			//update the index once per occurrence so the stored frequency is rebuilt
			while (i < freq) {
				if (InHashTable(word, reloadedIndex) == 0) {
					AddToHashTable(word, reloadedIndex);
				}
				UpdateHashTable(word, doc_id, reloadedIndex);
				i++;
			}
			token = strtok(NULL, " ");
		}
	}
	free(line);
	fclose(fp);	
	return reloadedIndex;	
}
Example #2
/**
This function initializes the Elf members
@internalComponent
@released
*/
void ElfProducer::InitElfContents() {

	iElfHeader		= new Elf32_Ehdr;
	iSections		= new Elf32_Shdr[MAX_SECTIONS+1];

	iElfDynSym		= new Elf32_Sym[iNSymbols];
	iVersionTbl		= new Elf32_Half[iNSymbols];
	iVersionDef		= new Elf32_Verdef[2];
	iDSODaux		= new Elf32_Verdaux[2];

	iProgHeader		 = new Elf32_Phdr[2];
	iCodeSectionData = new PLUINT32[iNSymbols];	

	iHashTbl = new Elf32_HashTable;
	
	//premeditated
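	//roughly one bucket for every three symbols; the remainder term keeps the
	//bucket count non-zero when there are fewer than three symbols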
	iHashTbl->nBuckets = (iNSymbols /3) + (iNSymbols % 0x3);
	
	iHashTbl->nChains = iNSymbols;

	iDSOBuckets = new Elf32_Sword[iHashTbl->nBuckets];
	iDSOChains = new Elf32_Sword[iHashTbl->nChains];

	Elf32_Sword	aNullPtr = 0;

	memset(iDSOBuckets, aNullPtr, sizeof(Elf32_Sword)*iHashTbl->nBuckets);
	memset(iDSOChains,  aNullPtr, sizeof(Elf32_Sword)*iHashTbl->nChains);
	memset(iCodeSectionData,  0, sizeof(PLUINT32)*iNSymbols);

	CreateElfHeader();

	SymbolList::iterator aItr = iSymbolsList->begin();
	SymbolList::iterator aEnd = iSymbolsList->end();
	Symbol		*aSym;
	PLUINT32	aIdx = 1;

	memset( &iElfDynSym[0], 0, sizeof(Elf32_Sym));
	iDSOSymNameStrTbl.insert(iDSOSymNameStrTbl.end(), 0);
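	//ELF reserves symbol index 0 (STN_UNDEF) and string-table offset 0 (the empty string),
	//which is why both get a null entry here and real symbols start at aIdx = 1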

	while(aItr != aEnd) {
		String		aSymName("");
		aSym = *aItr;
		aSymName = aSym->SymbolName();
		//set symbol info..
		iElfDynSym[aIdx].st_name = iDSOSymNameStrTbl.size();

		iDSOSymNameStrTbl.insert(iDSOSymNameStrTbl.end(), aSymName.begin(), aSymName.end() );
		iDSOSymNameStrTbl.insert(iDSOSymNameStrTbl.end(), 0);

		SetSymolFields( aSym, &iElfDynSym[aIdx], aIdx);

		//set version table info...
		iVersionTbl[aIdx] = DEFAULT_VERSION;
		AddToHashTable(aSym->SymbolName(), aIdx);
		aItr++;aIdx++;
	}

	CreateVersionTable();
	
	//Fill section headers...
	CreateSections();

	//Copy dyn entries..
	CreateDynamicEntries();

	//create code section data - this has the ordinal numbers...
	CreateProgHeader();
}
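The AddToHashTable(aSym->SymbolName(), aIdx) call above fills iDSOBuckets and iDSOChains, but its body is not part of this example. A minimal sketch of what such a routine typically does, assuming the standard SysV ELF .hash layout that the nBuckets/nChains setup above implies (the helper names and parameter list here are illustrative, not taken from this source):

//Classic SysV ELF hash over a symbol name (the standard algorithm for .hash sections).
static unsigned long ElfHash(const char *aName) {
	unsigned long h = 0, g;
	while (*aName) {
		h = (h << 4) + (unsigned char)*aName++;
		if ((g = h & 0xf0000000) != 0)
			h ^= g >> 24;
		h &= ~g;
	}
	return h;
}

//Hypothetical sketch: link symbol aSymIdx into zero-initialized bucket/chain arrays sized as above.
static void HashTableInsertSketch(Elf32_Sword *aBuckets, Elf32_Sword *aChains,
                                  PLUINT32 aNBuckets, const char *aName, PLUINT32 aSymIdx) {
	unsigned long aBucket = ElfHash(aName) % aNBuckets;
	if (aBuckets[aBucket] == 0) {
		aBuckets[aBucket] = aSymIdx;			//first symbol hashed to this bucket
	} else {
		Elf32_Sword aIdx = aBuckets[aBucket];
		while (aChains[aIdx] != 0)			//walk the chain to its last entry
			aIdx = aChains[aIdx];
		aChains[aIdx] = aSymIdx;			//append the new symbol index
	}
}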
Example #3
int main(int argc, char* argv[])
{
    // check command line arguments
	
	// Check that there are three input parameters.
	if (argc != 4) {
		printf("Please input three parameters: seed URL, directory, and max depth.\n");
		return 1;
	}
	
	// Check that the seed url has proper domain (old-www).
	if (strncmp(argv[1], URL_PREFIX, 15) != 0) {
		printf("The seed URL domain must be old-www.\n");
		return 1;
	}
	
	// Check that the directory already exists.
	struct stat st;
	if (!(stat(argv[2], &st) == 0 && S_ISDIR(st.st_mode))) {
		// If the directory does not exist, terminate the program.
		printf("The directory %s cannot be found. Please enter an existing directory.\n", argv[2]);
		return 1;
	}
	
	// Check that the directory path does not have a '/' at the end for ease in writing filenames.
	if (argv[2][strlen(argv[2]) - 1] == '/') {
		printf("Please do not add '/' at the end of the directory path.\n");
		return 1;
	}
	
	// Check the third argument.
	// Loop through each character of the depth argument and check that it is indeed a digit.
	for (size_t i = 0; i < strlen(argv[3]); i++) {
		if (!isdigit((unsigned char)argv[3][i])) {
			printf("Please input a valid number for the depth.\n");
			return 1;
		}
	}
   	
  	sscanf(argv[3], "%d", &depth); // Store the argument as an integer.
	
	// Check that the depth specified does not exceed max depth.
	if (depth > MAX) {
		printf("Search depth cannot exceed the MAX depth of %d. Please enter a valid depth.\n", MAX);
		return 1;
	}
	
    	// init curl
    	curl_global_init(CURL_GLOBAL_ALL);

    	// setup seed page
	WebPage *page = calloc(1, sizeof(WebPage));
	page->url = (char *)malloc(strlen(argv[1]) + 1);
	MALLOC_CHECK(stderr, page->url); // Check that memory was allocated.
	strcpy(page->url, argv[1]); // Copy the seed url to page->url.

    	// get seed webpage
	if (!GetWebPage(page)) {
		printf("The seed URL is invald. Please enter a valid seed URL.\n");
		FreeWebMemory(page);
		return 1;
	}
	
	// Normalize the seed url.
	if (!NormalizeURL(page->url)) {
		printf("Seed URL cannot be normalized.\n");
		FreeWebMemory(page);
		return 1;
	}
	
    	// write seed file
    	strcpy(path, argv[2]); // Let var path contain the directory path.
    	WriteFile(page, path, pageID);
	
    	// add seed page to hashtable
    	if (!AddToHashTable(page->url)) {
		FreeWebMemory(page);
    		return 1;
   	 }
	
	// Initialize URLList.
	if (!InitList()) {
		FreeWebMemory(page);
		return 1;
	}
	
    	// extract urls from seed page
    	if (!CrawlPage(page)) {
    		FreeHashTable(); // Free all memory dynamically allocated to the hash table.
		FreeWebMemory(page);
    		return 1;
    	}

	// while there are urls to crawl
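	// URLList is assumed to be a FIFO queue (AppendList at the tail, PopList from the head),
	// so pages are crawled roughly in breadth-first order up to the requested depth.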
    	while (URLList.tail != NULL) {

    		// get next webpage from list
    		WebPage *next = PopList();

        	// write page file
        	pageID++;
		if (!WriteFile(next, argv[2], pageID)) { // Check that the WriteFile worked.
			FreeWebMemory(next);
			return 1;
		}

        	// extract urls from webpage and then cleanup.
    		CrawlPage(next);
		FreeWebMemory(next);
    	}
    	// Memory cleanup.
	FreeHashTable(); // Free memory dynamically allocated to the hash table.
	FreeWebMemory(page); // Free memory dynamically allocated to the Webpage variable.
	
    	// cleanup curl
    	curl_global_cleanup();
    
    	return 0;
}
Example #4
// Function to crawl a given webpage for links.
int CrawlPage(WebPage *wp) {
	
	char *result; // variable to hold the url.
    	int pos = 0; // position in each html page.
    	WebPage *newPage; // New webpage.
    
    	// Check that the depth does not exceed the depth passed.
    	if (wp->depth >= depth) {
    		return 0;
    	}
    
    	printf("\n");
    	printf("[crawler]: Crawling - %s\n", wp->url); // Print the url being curled.
    	printf("\n");
    
    	// Loop through each html page to get all its urls.
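    	// GetNextURL is assumed to return the position just past the link it found (negative once
    	// no links remain) and to hand back a freshly allocated url in result, which is why every
    	// path below frees result before moving on.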
    	while ((pos = GetNextURL(wp->html, pos, wp->url, &result)) >= 0) {
    
    		// Check that the url has proper domain (old-www).
		if (strncmp(result, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
			free(result);
			continue;
		}
 		
		// Normalize the url.
    		if (!NormalizeURL(result)) {
    			free(result);
    			continue;
    		}
    	
    		// Check that the url isn't already in the hash table.
    		if (!InHashTable(result)) {
    			AddToHashTable(result); // Add the url to the hash table.
    		
    			// Setup new page for each url.
			newPage = calloc(1, sizeof(WebPage));
			newPage->depth = wp->depth + 1;
			newPage->url = (char *)malloc(strlen(result) + 1);
			if (!newPage->url) { // Check that memory was allocated.
				free(newPage);
				free(result);
				continue;
			}
			strcpy(newPage->url, result);

			// Get html for each url.
			if (!GetWebPage(newPage)) {
				FreeWebMemory(newPage);
				free(result);
				continue;
			}
			
			printf("[crawler]: Parser found link - %s\n", result);
			
			// Add to the list of webpages to be visited.
			if (!AppendList(newPage)) {
				FreeWebMemory(newPage);
				free(result);
				return 0;
			}
    		}
    		free(result);
    	}
	return 1;
}
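Examples #3 and #4 dereference page->url, wp->html, and wp->depth but never show the WebPage definition. A minimal declaration consistent with that usage (field order, and whether the real struct carries anything more, is a guess):

typedef struct WebPage {
	char *url;	// page URL, heap-allocated by the caller
	char *html;	// page contents, filled in by GetWebPage
	int depth;	// crawl depth relative to the seed page
} WebPage;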
Example #5
int main(int argc, char* argv[]) {
	//check argument number
	if (argc < 3 || argc > 4) {
		printf("too many or too few arguments, please try again\n");
		exit(0);
	}
	
	//check directory validity
	if (!IsDir(argv[1])) {
		printf("invalid directory, please try again");
		exit(0);
	}
	
	//Initialize variables and index
	int docId;
	int pos;
	char *doc;
	char **filenames = NULL;
	int num_files = 0;
	HashTable *WordsFound = calloc(1, sizeof(HashTable));
	num_files = GetFilenamesInDir(argv[1], &filenames);

	//check that the directory listing succeeded and the folder has files
	if (num_files <= 0) {
		printf("failed to get any filenames\n");
		exit(0);
	}

	//iterate through each file in the directory
	for (int i = 0; i < num_files; i++) {
		
		//check that the file is in the correct format (title is a number)
		int filechecker = 0;
		for (size_t c = 0; c < strlen(filenames[i]); c++) {
			if (!isdigit((unsigned char)filenames[i][c])) {
				filechecker = 1;
			}
		}
		if (filechecker == 1) {
			free(filenames[i]);				//skip non-numeric filenames without leaking the name
			continue;
		}

		//Load the document
		char *word;
		char file[100];
		snprintf(file, sizeof(file), "%s%s", argv[1], filenames[i]);	//build the path without risking overflow
		doc = LoadDocument(file);
		docId = GetDocumentId(filenames[i]);
		free(filenames[i]);
		
		pos = 0;
		//Iterate through each word in the html file (doc)
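		//GetNextWord is assumed to allocate word; when the word is new the hash table keeps
		//that pointer, otherwise the duplicate copy is freed below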
		while ((pos = GetNextWord(doc, pos, &word)) > 0) {
			NormalizeWord(word);
			if (InHashTable(word, WordsFound) == 0) {
				AddToHashTable(word, WordsFound);
				UpdateHashTable(word, docId, WordsFound);
			}
			else {
				UpdateHashTable(word, docId, WordsFound);
				free(word);
			}
		}
		free(doc);
	}	
	free(filenames);
	SaveIndexToFile(argv[2], WordsFound);				//Save the index to the file specified
	FreeHashTable(WordsFound);

	//only proceed if a third argument was specified. If so, reload the index from the file you just created
	if (argc == 4) {
		HashTable *ReloadedIndex = ReadFile(argv[2]);
		SaveIndexToFile(argv[3], ReloadedIndex);
		FreeHashTable(ReloadedIndex);
	}
	return 0;
}