/*----------------------------------------------------------------------
|   PLT_DeviceData::GetIconUrl
+---------------------------------------------------------------------*/
NPT_String
PLT_DeviceData::GetIconUrl(const char* mimetype, 
                           NPT_Int32   maxsize, 
                           NPT_Int32   maxdepth)
{
    PLT_DeviceIcon icon;

    for (NPT_Cardinal i=0; i<m_Icons.GetItemCount(); i++) {
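        // skip icons that do not satisfy the caller's optional
        // mimetype / maxsize / maxdepth constraints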
        if ((mimetype && m_Icons[i].m_MimeType != mimetype) ||
            (maxsize  && m_Icons[i].m_Width > maxsize)      ||
            (maxsize  && m_Icons[i].m_Height > maxsize)     ||
            (maxdepth && m_Icons[i].m_Depth > maxdepth))
            continue;

        // pick the biggest and best resolution we can
        if (icon.m_Width  >= m_Icons[i].m_Width  ||
            icon.m_Height >= m_Icons[i].m_Height ||
            icon.m_Depth  >= m_Icons[i].m_Depth  ||
            m_Icons[i].m_UrlPath.IsEmpty())
            continue;

        icon = m_Icons[i];
    }

    if (icon.m_UrlPath == "") return "";

    return NormalizeURL(icon.m_UrlPath).ToString();
}
Example #2
void parseHTML(WebPage *currpage, List *currlist, HashTable *currhash, int initdepth){
  int position = 0;
  char *newurl = NULL;
  char *end;
  // Parse through the HTML file and get every URL
  while((position = GetNextURL(currpage->html, position, currpage->url, &newurl)) > 0){
    // Normalize the URL; if it is bad, free it right away
    if (NormalizeURL(newurl)){
      // Strip internal references (anything after '#')
      if((end = strchr(newurl,'#'))){
        *end = '\0';
      }
      // Check that the URL has the defined domain
      if(strstr(newurl,URL_PREFIX) != NULL){
        if(insertHashTable(currhash,newurl) == 0){
          // The URL was successfully added to the hashtable, so it is new:
          // create a webpage for it and add it to our List
          char *dummyurl = (char *)malloc(strlen(newurl)+1);
          strcpy(dummyurl,newurl);
          WebPage *newpage = createWebPage(dummyurl, initdepth);
          addtolist(currlist,newpage);
          free(newurl);
        } else free(newurl);
      } else free(newurl);
    } else free(newurl);
  }
}
Example #3
/***********************************************************************
 * IsInternalURL - see web.h for interface description.
 */
bool IsInternalURL(char *url)
{
  if (NormalizeURL(url)) {
    if (strncmp(url, INTERNAL_URL_PREFIX, strlen(INTERNAL_URL_PREFIX)) == 0)
      return true;
    else 
      return false;
  } else
    return false;
}
Example #4
int CheckURL(char* url){
	// Reject a URL that is shorter than the required prefix
	// or that does not start with it.
	if(strlen(url) < strlen(URL_PREFIX) ||
	   strncmp(url, URL_PREFIX, strlen(URL_PREFIX)) != 0){
		return -1;
	}
	NormalizeURL(url);
	seedURL = url;
	
	return 0;
}
Example #5
void EbookController::OnClickedLink(int pageNo, DrawInstr *link)
{
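    // the clicked link target, converted from the page's UTF-8 HTML to a wide string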
    ScopedMem<WCHAR> url(str::conv::FromHtmlUtf8(link->str.s, link->str.len));
    if (url::IsAbsolute(url)) {
        EbookTocDest dest(nullptr, url);
        cb->GotoLink(&dest);
        return;
    }

    if (Doc_Epub == doc.Type() && pages && (size_t)pageNo <= pages->Count()) {
        // normalize the URL by combining it with the chapter's base path
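        // search backwards from the clicked page for the chapter's pagebreak anchor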
        for (int j = pageNo; j > 0; j--) {
            HtmlPage *p = pages->At(j - 1);
            // <pagebreak src="..." page_marker /> is usually the second instruction on a page
            for (size_t k = 0; k < std::min((size_t)2, p->instructions.Count()); k++) {
                DrawInstr& di = p->instructions.At(k);
                if (InstrAnchor == di.type && str::StartsWith(di.str.s + di.str.len, "\" page_marker />")) {
                    ScopedMem<char> basePath(str::DupN(di.str.s, di.str.len));
                    ScopedMem<char> relPath(ResolveHtmlEntities(link->str.s, link->str.len));
                    ScopedMem<char> absPath(NormalizeURL(relPath, basePath));
                    url.Set(str::conv::FromUtf8(absPath));
                    j = 0; // done
                    break;
                }
            }
        }
    }

    int idx = ResolvePageAnchor(url);
    if (-1 == idx && str::FindChar(url, '%')) {
        url::DecodeInPlace(url);
        idx = ResolvePageAnchor(url);
    }
    if (idx != -1) {
        EbookTocDest dest(nullptr, idx);
        cb->GotoLink(&dest);
    }
}
Example #6
void CrawlPage(WebPage webpage){
	char* nexturl= NULL;
	int lastpos = 0;
	int depth = webpage.depth + 1;
	
	if(depth > maxWebPageDepth) return;
	
	printf("\n\n[crawler]: Crawling - %s\n\n",webpage.url);
	
	while((lastpos = GetNextURL(webpage.html, lastpos, webpage.url, &nexturl))>0){
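		// normalize each extracted URL and process it only if CheckURL accepts it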
		NormalizeURL(nexturl);
		if(!CheckURL(nexturl)){
			// setup new page
			struct WebPage* newwebpage = (WebPage*)calloc(1,sizeof(WebPage));
			newwebpage->url = (char*)calloc(strlen(nexturl)+1, sizeof(char));
			strcpy(newwebpage->url,nexturl);
			newwebpage->depth = depth;
			
			// get new webpage
    		if(GetWebPage(newwebpage)){
    			if(HashTableInsert(nexturl)){											 // If not found in hash table, add to hash table
    				printf("[crawler]: Parser found new link - %s\n",nexturl);
    				struct ListNode* listentry = (ListNode*)calloc(1,sizeof(ListNode));							
    				listentry->page = newwebpage;									     // then add to list
					WebPageList->tail = InsertNode(WebPageList->tail,listentry);
					
	   				WriteFile(*newwebpage, filenum++); 									 // then write file
    			} else{
    				CleanUpPage(newwebpage);
    			}
			} else {
				// fetch failed: free the page allocated for this URL
				CleanUpPage(newwebpage);
			}
		}
		free(nexturl);
		nexturl = NULL;
		// Sleep for a second 
		sleep(INTERVAL_PER_FETCH);
	}
}
Example #7
File: crawler.c  Project: pratapl/COSC50
int main(int argc, char* argv[])
{
    
    //Check for the number of arguments
    if(argc != 4){
	    printf("Invalid Input Argument\n");
	    printHelp();
        exit(1);
    } 
   
    //directory file path
    int dirSize = strlen(argv[2]);
    char dir[dirSize + 1];
    dir[0] = '\0';
    strcat(dir, argv[2]);

    int urlSize = strlen(argv[1]);
    char inputURL[urlSize + 1];
    inputURL[0] = '\0';
    strcat(inputURL, argv[1]);

    //Get the max depth number.
    int inputDepth = atoi(argv[3]);

    //Check if correct depth is provided.
    if(inputDepth > 4 || inputDepth < 0){
        printf("Invalid [depth]\n");
        printHelp();
        exit(1);
    }
    //Check for URL validity 
    if(!strstr(inputURL,URL_PREFIX)){
 	    printf("Invalid input [seed url]\n");
        printHelp();
	    exit(1);
    }
    //check for directory location validity
    DIR* directory = opendir(dir);
    if(directory){
	    closedir(directory);
    }
    else if(ENOENT == errno){
	    printf("Directory does not exist\n");
	    printHelp();
        exit(1);
    }
    else{
	    printf("Directory can't be opened\n");
        printHelp();
	    exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
    WebPage* seedWebPage = calloc(1, sizeof(WebPage));//Memory allocation for seed webpage
    seedWebPage->url = calloc((strlen(inputURL) + 1), sizeof(char));//Memory allocation to the seedURL
    seedWebPage->url[0] = '\0';
    strcat(seedWebPage->url, inputURL);
    seedWebPage->depth = 0;
    seedWebPage->html = NULL;
    
    //Initialize data structures
    HashTable* visitedURLHash = initHashTable();
    List* webPageList = initializeList();
    webPageList->head->page = seedWebPage;  
 
    //get seed webpage.
    if(GetWebPage(seedWebPage)){	
        // write seed file
        FILE *fPointer;
        char* pathVar1 = pathToDirectory(dir, fileNumber);
        fPointer = fopen(pathVar1, "w");
        free(pathVar1);	
        writeHTMLtoFile(fPointer, seedWebPage);
        fclose(fPointer);
        
        if(inputDepth == 0){
            curl_global_cleanup();
            free(seedWebPage->html);
            free(seedWebPage->url);
            free(seedWebPage);

            //free webPageList and hashtable
            free(webPageList);
            for(int i = 0; i < MAX_HASH_SLOT; i++){
                free(visitedURLHash->table[i]->url);
                free(visitedURLHash->table[i]);
            }
            free(visitedURLHash);
            return 0;
        }   
        fileNumber += 1;
        depth += 1;
        HashTableInsert(visitedURLHash, seedWebPage->url); //mark as visited
        
        // extract urls from seed page
        char * result;
        int pos = 0;
        while((pos = GetNextURL(seedWebPage->html, pos, seedWebPage->url, &result))>0){

            if(NormalizeURL(result) && strstr(result,URL_PREFIX)){
                strtok(result, "#");
                //If not in hashtable, add it to the hashtable and add it to the webPageList.
                if(HashTableLookup(visitedURLHash, result) == 0){
                    HashTableInsert(visitedURLHash, result);
                    AppendList(webPageList, webPageInit(result, depth));
                }
            }
            free(result);
        }
        if(webPageList->head->next->next == NULL){  //seed redirect case
            webPageList->head->next->page->depth = 0;
            fileNumber = 1;		
        }
        tempWebPage = PopList(webPageList); // Get rid of visited seedPage
    }
    else{	
        curl_global_cleanup();
        tempWebPage = PopList(webPageList);
        free(seedWebPage->html);
        free(seedWebPage->url);
        free(seedWebPage);
        //free(tempWebPage);
        free(webPageList);
        for(int i = 0; i < MAX_HASH_SLOT; i++){
            free(visitedURLHash->table[i]->url);
            free(visitedURLHash->table[i]);
        }
        free(visitedURLHash);
        exit(1);
    }

    
    //while there are urls to crawl
    while(webPageList->head != NULL && webPageList->tail != NULL){
        // get webpage for url
        tempWebPage = PopList(webPageList);
        if(GetWebPage(tempWebPage)){ 
            // write page file
            char* pathVar = pathToDirectory(dir, fileNumber);
            FILE *fPointer = fopen(pathVar, "w");
            free(pathVar);
            printf("Found link: %s\n",tempWebPage->url);
            writeHTMLtoFile(fPointer, tempWebPage);
            fclose(fPointer);
            fileNumber += 1;
                
            if((tempWebPage->depth + 1) <= inputDepth ){
                char * resultTemp;
                int posTemp = 0;
                while((posTemp = GetNextURL(tempWebPage->html, posTemp, tempWebPage->url, &resultTemp))>0){
                    
                    if( NormalizeURL(resultTemp) && strstr(resultTemp,URL_PREFIX) ){
                        strtok(resultTemp, "#");
                        //insert to the hashtable and the webPageList if not already present
                        if(HashTableLookup(visitedURLHash, resultTemp) == 0){
                            HashTableInsert(visitedURLHash, resultTemp);
                            AppendList(webPageList, webPageInit(resultTemp, tempWebPage->depth+1));
                        }
                    }
                    free(resultTemp);
                }
            }
        
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        else{
            free(tempWebPage->url);
            free(tempWebPage->html);
            free(tempWebPage);
        }
        sleep(INTERVAL_PER_FETCH);
    }
    // cleanup curl
    curl_global_cleanup();
    free(seedWebPage->url);
    free(seedWebPage->html);
    free(seedWebPage);
    free(webPageList);

    //free the hashtable
    for(int i = 0; i < MAX_HASH_SLOT; i++){
        if(visitedURLHash->table[i]->url != NULL){
            HashTableNode* currNode = visitedURLHash->table[i];
            while(currNode->next != NULL){
                HashTableNode* tempNode = currNode;
                currNode = currNode->next;
                free(tempNode->url);
                free(tempNode);
            }
            free(currNode->url);
            free(currNode);		
        }
        else{	
            free(visitedURLHash->table[i]);
        }
    }
    free(visitedURLHash);
    return 0;
}
Example #8
int main(int argc, char* argv[])
{
    // check command line arguments
	
	// Check that there are three input parameters.
	if (argc != 4) {
		printf("Please input three parameters: seed URL, directory, and max depth.\n");
		return 1;
	}
	
	// Check that the seed url has proper domain (old-www).
	if (strncmp(argv[1], URL_PREFIX, 15) != 0) {
		printf("The seed URL domain must be old-www.\n");
		return 1;
	}
	
	// Check that the directory already exists.
	struct stat st;
	if (stat(argv[2], &st) != 0 || !S_ISDIR(st.st_mode)) {
		// If the directory does not exist, terminate the program.
		printf("The directory %s cannot be found. Please enter an existing directory.\n", argv[2]);
		return 1;
	}
	
	// Check that the directory path does not have a '/' at the end for ease in writing filenames.
	if (argv[2][strlen(argv[2]) - 1] == '/') {
		printf("Please do not add '/' at the end of the directory path.\n");
		return 1;
	}
	
	// Check the third argument.
	// Loop through each letter of the third argument and check that it is indeed a number.
	for (int i = 0; i < strlen(argv[3]); i++) {
		if (!isdigit((int)argv[3][i])) {
 			printf("Please input a valid number for the depth.\n");
   			return 1;
   		}
	}
   	
  	sscanf(argv[3], "%d", &depth); // Store the argument as an integer.
	
	// Check that the depth specified does not exceed max depth.
	if (depth > MAX) {
		printf("Search depth cannot exceed MAX depth of 4. Please enter a valid depth.\n");
		return 1;
	}
	
    	// init curl
    	curl_global_init(CURL_GLOBAL_ALL);

    	// setup seed page
	WebPage *page = calloc(1, sizeof(WebPage));
	page->url = (char *)malloc(strlen(argv[1]) + 1);
	MALLOC_CHECK(stderr, page->url); // Check that memory was allocated.
	strcpy(page->url, argv[1]); // Copy the seed url to page->url.

    	// get seed webpage
	if (!GetWebPage(page)) {
		printf("The seed URL is invald. Please enter a valid seed URL.\n");
		FreeWebMemory(page);
		return 1;
	}
	
	// Normalize the seed url.
	if (!NormalizeURL(page->url)) {
		printf("Seed URL cannot be normalized.\n");
		FreeWebMemory(page);
		return 1;
	}
	
    	// write seed file
    	strcpy(path, argv[2]); // Let var path contain the directory path.
    	WriteFile(page, path, pageID);
	
    	// add seed page to hashtable
    	if (!AddToHashTable(page->url)) {
		FreeWebMemory(page);
    		return 1;
   	 }
	
	// Initialize URLList.
	if (!InitList()) {
		FreeWebMemory(page);
		return 1;
	}
	
    	// extract urls from seed page
    	if (!CrawlPage(page)) {
    		FreeHashTable(); // Free all memory dynamically allocated to the hash table.
		FreeWebMemory(page);
    		return 1;
    	}

	// while there are urls to crawl
    	while (URLList.tail != NULL) {

    		// get next webpage from list
    		WebPage *next = PopList();

        	// write page file
        	pageID++;
		if (!WriteFile(next, argv[2], pageID)) { // Check that the WriteFile worked.
			FreeWebMemory(next);
			return 1;
		}

        	// extract urls from webpage and then cleanup.
    		CrawlPage(next);
		FreeWebMemory(next);
    	}
    	// Memory cleanup.
	FreeHashTable(); // Free memory dynamically allocated to the hash table.
	FreeWebMemory(page); // Free memory dynamically allocated to the Webpage variable.
	
    	// cleanup curl
    	curl_global_cleanup();
    
    	return 0;
}
Example #9
// Function to crawl a given webpage for links.
int CrawlPage(WebPage *wp) {
	
	char *result; // variable to hold the url.
    	int pos = 0; // position in each html page.
    	WebPage *newPage; // New webpage.
    
    	// Check that the depth does not exceed the depth passed.
    	if (wp->depth >= depth) {
    		return 0;
    	}
    
    	printf("\n");
    	printf("[crawler]: Crawling - %s\n", wp->url); // Print the url being curled.
    	printf("\n");
    
    	// Loop through each html page to get all its urls.
    	while ((pos = GetNextURL(wp->html, pos, wp->url, &result)) >= 0) {
    
    		// Check that the url has proper domain (old-www).
		if (strncmp(result, URL_PREFIX, strlen(URL_PREFIX)) != 0) {
			free(result);
			continue;
		}
 		
		// Normalize the url.
    		if (!NormalizeURL(result)) {
    			free(result);
    			continue;
    		}
    	
    		// Check that the url isn't already in the hash table.
    		if (!InHashTable(result)) {
    			AddToHashTable(result); // Add the url to the hash table.
    		
    			// Setup new page for each url.
			newPage = calloc(1, sizeof(WebPage));
			newPage->depth = wp->depth + 1;
			newPage->url = (char *)malloc(strlen(result) + 1);
			if (!newPage->url) { // Check that memory was allocated.
				free(newPage);
				free(result);
				continue;
			}
			strcpy(newPage->url, result);

			// Get html for each url.
			if (!GetWebPage(newPage)) {
				FreeWebMemory(newPage);
				free(result);
				continue;
			}
			
			printf("[crawler]: Parser found link - %s\n", result);
			
			// Add to the list of webpages to be visited.
			if (!AppendList(newPage)) {
				FreeWebMemory(newPage);
				free(result);
				return 0;
			}
    		}
    		free(result);
    	}
	return 1;
}
// main crawler function
int main(int argc, char* argv[]) {

    // local variables
    FILE *fp; // file pointer for html files
    char *nextURL; // pointer to the next URL found on the seed page
    char *newURL; // pointer to the next URL in the while loop

    // check command line arguments
    if (argc != 4) {
        printf("Incorrect number of arguments provided.");
        exit(1);
    }
    // check that the second argument is a directory
    if (stat(argv[2],&statbuffer) != 0 || !S_ISDIR(statbuffer.st_mode)) {
        printf("Error, you did not supply a valid directory");
        exit(1);
    }

    // get arguments
    char *seedURL = argv[1];
    int filename_len = strlen(argv[2])+21;

    // get the directory
    char*filename = calloc(filename_len,sizeof(char));

    // check the maxDepth
    int value = is_numeric(argv[3]);
    if (value != 0) {
        sscanf(argv[3],"%i",&maxDepth);
    }
    else {
        printf("Error! maxDepth must be a number");
        exit(1);
    }

    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // initialize data structures/variables

    // initialize hashtable
    HashTable *table = malloc(sizeof(HashTable));
    memset(table,0,sizeof(HashTable));   // zero the entire table structure

    // initialize linked list
    List *WebPageList;
    WebPageList = createList();

    // setup seed page

    // get seed webpage
    // if it fails, report and exit
    if (NormalizeURL(seedURL) == 0) {
        printf("Error, bad URL");
        exit(1);
    }
    // write seed file

    // create WebPage object by allocating memory
    WebPage *seedPage = malloc(sizeof(WebPage));

    // assign values to each part of the struct
    seedPage->url = seedURL;
    seedPage->html = NULL;
    seedPage->html_len = 0;
    seedPage->depth = 0;

    // try to get the webpage up to MAX_TRY times
    if (!GetWebPage(seedPage)) {
        for (tries = 0; tries < MAX_TRY; tries++) {
            if (GetWebPage(seedPage)) {
                break;
            }
        }
    }

    // write html contents to a file "1" in the given directory
    sprintf(filename,"%s/%d",argv[2],1);
    fp = fopen(filename,"w");
    fputs(seedURL,fp);
    fputs("\n",fp);
    fprintf(fp,"%d\n",seedPage->depth);
    fputs(seedPage->html,fp);

    // close the file and wipe the filename
    fclose(fp);
    memset(filename,'\0',filename_len);

    // add seed page to hashtable
    add(table,seedURL);

    // extract urls from seed page

    // while there are still URLs in the seed page's html
    while ((pos = GetNextURL(seedPage->html,pos,seedPage->url,&nextURL)) > 0) {

        // only visiting them if it wouldn't exceed maxDepth
        if ((seedPage->depth+1) > maxDepth) {
            free(seedPage);
            exit(1);
        }

        // ensure it's a valid url
        if (NormalizeURL(nextURL) != 0) {

            // also check if its in the right domain
            if (strncmp(URL_PREFIX,nextURL,strlen(URL_PREFIX)) == 0) {

                // if it is added to the hashtable it is a unique URL that
                // hasn't been visited before, add it to the linked list
                // of URLs to visit
                if (add(table,nextURL)) {
                    // create a new webpage object
                    WebPage *pages = malloc(sizeof(WebPage));
                    pages->url = nextURL;
                    pages->html = NULL;
                    pages->html_len = 0;
                    pages->depth = 1;

                    // try to get the webpage up until the MAX_TRY
                    tries = 0;
                    if (!GetWebPage(pages)) {
                        for (tries = 0; tries < MAX_TRY; tries++) {
                            if (GetWebPage(pages)) {
                                break;
                            }
                        }
                    }

                    // add it to the linked list
                    addToEnd(WebPageList,pages);
                }
            }
        }
    }

    // while there are urls to crawl
    while (WebPageList->head != NULL) {
        // get next url from list
        WebPage *nextPage = removeFromFront(WebPageList);

        // try to get the webpage up until the MAX_TRY
        tries = 0;
        if (!GetWebPage(nextPage)) {
            for (tries = 0; tries < MAX_TRY; tries++) {
                if (GetWebPage(nextPage)) {
                    break;
                }
            }
        }

        // write page file
        sprintf(filename,"%s/%d",argv[2],docNum);
        fp = fopen(filename,"w");
        fputs(nextPage->url,fp);
        fputs("\n",fp);
        fprintf(fp,"%d\n",nextPage->depth);
        fputs(nextPage->html,fp);

        // close the file and wipe the filename (to be used next time)
        fclose(fp);
        memset(filename,'\0',filename_len);

        // increment the doc num
        docNum++;

        // check if visiting the URLs on this page will exceed maxDepth
        if ((nextPage->depth+1) > maxDepth) {
            free(nextPage);
            continue;
        }
        pos = 0;
        // iterate through all the URLs on the page
        while ((pos = GetNextURL(nextPage->html,pos,nextPage->url,&newURL))>0) {
            // check to ensure that the URLs are the proper format
            if (NormalizeURL(newURL) != 0 ) {
                // check to ensure that they are in the right domain
                if (strncmp(URL_PREFIX,newURL,strlen(URL_PREFIX)) == 0) {
                    // making sure to only add new ones to the list
                    if (add(table,newURL) != 0) {
                        // create a new WebPage object
                        WebPage *page = malloc(sizeof(WebPage));
                        page->url = newURL;
                        page->html = NULL;
                        page->html_len = 0;
                        page->depth = nextPage->depth + 1;

                        // try to get the webpage up until the MAX_TRY
                        tries = 0;
                        if (!GetWebPage(page)) {
                            for (tries = 0; tries < MAX_TRY; tries++) {
                                if (GetWebPage(page)) {
                                    break;
                                }
                            }
                        }

                        // add the page to the linked list
                        addToEnd(WebPageList,page);
                    }
                }
            }
        }
        // Sleep for a bit to avoid annoying the target
        sleep(INTERVAL_PER_FETCH);

        // Free resources
        free(nextPage);

    }

    // cleanup curl
    curl_global_cleanup();

    // free resources
    // free hashtable
    for (int i = 0; i < MAX_HASH_SLOT; i++) {
        HashTableNode *freer = table->table[i];
        HashTableNode *tempHash = NULL;
        while (freer != NULL) {
            tempHash = freer;
            freer = freer->next;
            free(tempHash);
        }
    }
    free(table);

    // free linked list
    free(WebPageList);

    // free WebPage and filename pointer
    free(seedPage);
    free(filename);
    return 0;
}
Example #11
int main(int argc, char* argv[])
{	
	filenum = 1;
	
    // check command line arguments
	if(argc != 4){		// check number of arguments
		fprintf(stderr,"Error: Number of input argument needs to be exactly 3\n");
		return -1;
	}else if (CheckDirectory(argv[2])){		// check if directory exist
		return -1;
	}else if(CheckDepth(argv[3])){			// check depth
		return -1;
	}else if(CheckURL(argv[1])){			// check url
		fprintf(stderr,"Error: Invalid URL. Can only crawl URL with URL prefix %s\n",URL_PREFIX);
		return -1;
	}
	
    // init curl
    curl_global_init(CURL_GLOBAL_ALL);

    // setup seed page
	seedPage = (WebPage*)calloc(1,sizeof(WebPage));
	NormalizeURL(seedURL);
    seedPage->url = (char*)calloc(strlen(seedURL)+1,sizeof(char));
	strcpy(seedPage->url,seedURL);
	seedPage->depth = 0;
	
    // get seed webpage
    if(!GetWebPage(seedPage)){				// clean up and exit if url is invalid
    	fprintf(stderr,"Error: Invalid URL\n");
    	CleanUpPage(seedPage);
		curl_global_cleanup();
		return -1;
	}

    // write seed file
	WriteFile(*seedPage, filenum++);
	
	// Exit if maxWebPageDepth = 0
	if(maxWebPageDepth == 0){				// clean up and exit if max webpage depth is 0
		printf("\n[crawler]: Crawling - %s\n\n",seedPage->url);
		printf("1 page has been crawled \n\n");
		CleanUpPage(seedPage);	
		CleanUpHash(URLsVisited);	
		curl_global_cleanup();
		return 0;
	}	
	
    // add seed page to hashtable
   	InitialiseHashTable(URLsVisited);
    HashTableInsert(seedURL);

    // add seed node to list
    WebPageList = (List*)calloc(1,sizeof(List));
    struct ListNode* seednode = (ListNode*)calloc(1,sizeof(ListNode));
    seednode->page = seedPage;
    WebPageList->head = seednode;
    WebPageList->tail = seednode;
    
    // extract urls from seed page
    CrawlPage(*seedPage);
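    // the seed has now been crawled, so drop it from the work list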
    WebPageList->head = RemoveNode(WebPageList->head);
    
    // while there are urls to crawl
   	while(WebPageList->head != NULL && WebPageList->head->page->depth < maxWebPageDepth){
        // get next url from list, get webpage for url, write page file and extract urls from webpage
		CrawlPage(*(WebPageList->head->page));
		WebPageList->head = RemoveNode(WebPageList->head);
	}
	
	// cleanup memory
	CleanUpList(WebPageList);
	CleanUpHash(URLsVisited);

    // cleanup curl
    curl_global_cleanup();
	
	printf("\n\n %d webpages have been crawled\n\n", filenum-1);
    return 0;
}
Example #12
File: css.c  Project: ArcScofield/Amaya
/*----------------------------------------------------------------------
  AttrMediaChanged: the user has created, removed, or modified a Media
  attribute
  ----------------------------------------------------------------------*/
void AttrMediaChanged (NotifyAttribute *event)
{
  ElementType         elType;
  Element             el;
  Document            doc;
  Attribute           attr;
  AttributeType       attrType;
  CSSInfoPtr          css;
  CSSmedia            media;
  PInfoPtr            pInfo;
  DisplayMode         dispMode;
  char                completeURL[MAX_LENGTH];
  char                tempname[MAX_LENGTH];
  char               *name2;
  int                 length;

  el = event->element;
  doc = event->document;
  attr = event->attribute;
  elType = TtaGetElementType (el);
  /* get the new media value */
  length = TtaGetTextAttributeLength (attr);
  name2 = (char *)TtaGetMemory (length + 1);
  TtaGiveTextAttributeValue (attr, name2, &length);
  media = CheckMediaCSS (name2);
  TtaFreeMemory (name2);
  /* get the CSS URI */
  attrType.AttrSSchema = elType.ElSSchema;
  attrType.AttrTypeNum = HTML_ATTR_HREF_;
  attr = TtaGetAttribute (el, attrType);
  if (attr &&
      /* don't manage a document used by make book */
      (DocumentMeta[doc] == NULL ||
       DocumentMeta[doc]->method != CE_MAKEBOOK))
    {
      length = TtaGetTextAttributeLength (attr);
      name2 = (char *)TtaGetMemory (length + 1);
      TtaGiveTextAttributeValue (attr, name2, &length);
      /* load the stylesheet file found here ! */
      NormalizeURL (name2, doc, completeURL, tempname, NULL);
      TtaFreeMemory (name2);
      /* get the right CSS context */ 
      css = SearchCSS (doc, completeURL, el, &pInfo);
    }
  else
    /* get the right CSS context */ 
    css = SearchCSS (doc, NULL, el, &pInfo);
  if (css && pInfo)
    {
      /* avoid too many redisplays */
      dispMode = TtaGetDisplayMode (doc);
      /* something changed and we are not printing */
      if (media == CSS_ALL || media == CSS_SCREEN)
        {
          if (dispMode != NoComputedDisplay)
            TtaSetDisplayMode (doc, NoComputedDisplay);
          LoadStyleSheet (completeURL, doc, el, NULL, NULL, media,
                          pInfo->PiCategory == CSS_USER_STYLE);
          /* restore the display mode */
          if (dispMode != NoComputedDisplay)
            TtaSetDisplayMode (doc, dispMode);
        }
      else
        {
          if (media == CSS_PRINT || media == CSS_OTHER)
            {
              if (dispMode != NoComputedDisplay)
                TtaSetDisplayMode (doc, NoComputedDisplay);
              UnlinkCSS (css, doc, el, TRUE, FALSE, TRUE);
              /* restore the display mode */
              if (dispMode != NoComputedDisplay)
                TtaSetDisplayMode (doc, dispMode);
            }
          /* only update the CSS media info */
          pInfo->PiMedia = media;
        }
    }
}