C++ (Cpp) parseHTML примеры использования

Пример #1

0

Показать файл

Файл: main.c Проект: galexcode/cheerio

/*
 * Reads the HTML file named by argv[1] into a NUL-terminated buffer, hands
 * it to parseHTML() and walks the resulting item list, printing the name of
 * every item whose type indexes into element[], followed by its inner HTML
 * and class attribute when those are set.
 *
 * Returns 1 when the argument is missing, the file cannot be opened or
 * sized, memory cannot be allocated, or parseHTML() yields no page.
 */
int main(int argc, char *argv[]) {
	FILE *fh;
	long fsz;
	size_t nread;
	char *data;
	page *dom;

	if(argc < 2) {
		fprintf(stderr, "usage: %s <html-file>\n", argv[0]);
		return 1;
	}
	fh = fopen(argv[1], "r");
	if(fh == NULL) {
		perror(argv[1]);
		return 1;
	}
	/* Seek to the end to learn the file size */
	if(fseek(fh, 0L, SEEK_END) != 0 || (fsz = ftell(fh)) < 0) {
		perror(argv[1]);
		fclose(fh);
		return 1;
	}
	data = (char *)malloc((size_t)fsz + 1);
	if(data == NULL) {
		perror("malloc");
		fclose(fh);
		return 1;
	}
	fseek(fh, 0L, SEEK_SET);
	/* Terminate at the number of bytes actually read: a short read must
	 * not leave uninitialised bytes in front of the terminator. */
	nread = fread(data, 1, (size_t)fsz, fh);
	data[nread] = '\0';
	fclose(fh);
	dom = parseHTML(data);
	if(dom == NULL) {
		fprintf(stderr, "%s: could not parse HTML\n", argv[1]);
		free(data);
		return 1;
	}
	dom->curr = dom->il;
	for(;dom->curr != NULL; dom->curr = dom->curr->nitem) {
		/* Only types that index into element[] have a printable name */
		if(dom->curr->type >= 0 && dom->curr->type < elements) {
			printf("%s\n", element[dom->curr->type]);
			if(dom->curr->a != NULL) {
				if(dom->curr->a->innerhtml != NULL)
					printf("\tINNERHTML:%s\n", dom->curr->a->innerhtml);
				/* Guard the field that is printed (the original re-tested innerhtml) */
				if(dom->curr->a->class != NULL)
					printf("\tCLASS:%s\n", dom->curr->a->class);
			}
			printf("\n");
		}
	}

Пример #2

0

Показать файл

Файл: tget.c Проект: hahnakane/junkcode

/*
 * Fills tget's buffer and runs parseHTML() on it.
 *
 * Directories and regular files are both loaded through tgetFill(); any
 * other file type yields -1 without parsing. The fill is bracketed by
 * rpmswEnter()/rpmswExit() on tget->gop, and tgetFini() always runs before
 * returning. Returns a negative tgetFill() result as-is, otherwise the
 * result of parseHTML().
 */
static int readFile(rpmtget tget)
{
    int rc = -1;

    /* The status of the bookkeeping calls is deliberately ignored. */
    (void) tgetInit(tget, 8 * BUFSIZ);
    (void) rpmswEnter(tget->gop, 0);

    if (S_ISDIR(tget->sb.st_mode) || S_ISREG(tget->sb.st_mode))
	rc = tgetFill(tget);

    (void) rpmswExit(tget->gop, tget->nbuf);

    /* Parse only when the fill step succeeded. */
    if (rc >= 0)
	rc = parseHTML(tget);

    (void) tgetFini(tget);
    return rc;
}

Пример #3

0

Показать файл

Файл: Browser.cpp Проект: brewercg/CPSC2430

/**
 * Loads the HTML file `filename`, registers it as link 0, parses it and
 * makes page 0 the displayed text.
 *
 * The file is read line by line and re-joined with '\n' after every line.
 * If the file cannot be opened an error is printed to cout and the browser
 * state is left untouched.
 *
 * @param filename path of the HTML file to load
 */
void Browser::loadHTML(string filename)
{
  ifstream infile(filename.c_str());
  if( !infile.is_open() )
	{
	  cout << "ERROR Loading HTML file!\n";
	  return;
	}

  // Loop on getline() itself rather than on eof(): testing eof() before the
  // read appends a bogus empty line after the last real one and never
  // terminates if the stream fails without reaching end-of-file.
  string line;
  string allText;
  while( getline(infile, line) )
	{
	  allText += line;
	  allText += '\n';
	}
  infile.close();

  // add the current page as the 0-th link
  links[0] = filename;

  // ensure pages is initialized in case parse fails
  pages[0] = "";

  parseHTML(allText);

  displayText = pages[0];
}

Пример #4

0

Показать файл

Файл: Page.cpp Проект: aecio/RI-TP1

/**
 * Builds a Page for `url` from the text of an HTTP response.
 *
 * The response is passed through HTML::decode_entities() and then
 * HTML::single_blank() before being handed to parseHTML().
 */
Page::Page(string url, string httpResponse){
    this->url_ = url;

    // Charset detection / conversion to UTF-8 is currently disabled:
    //		const char* text = httpResponse.c_str();
    //		if( HTML::detect_utf8(text, httpResponse.size()) == false ){
    //			CharsetConverter cc("8859_1", "UTF8");
    //			httpResponse = string(cc.convert(text));
    //		}

    const string decoded = HTML::decode_entities(httpResponse);
    const string normalized = HTML::single_blank(decoded);
    parseHTML(normalized);
}

Пример #5

0

Показать файл

Файл: htmlparser.cpp Проект: ramonelalto/gambas

void GBparseHTML(const char *data, const size_t lendata, GB_ARRAY *array)
{
    size_t nodeCount;
    size_t i = 0;
    Node **nodes = parseHTML(data, lendata, &nodeCount);
    GB.Array.New(array, GB.FindClass("XmlNode"), nodeCount);

    for(i = 0; i < nodeCount; ++i)
    {
        *(reinterpret_cast<void **>((GB.Array.Get(*array, i)))) =  XML.XMLNode_GetGBObject(nodes[i]);
        GB.Ref(nodes[i]->GBObject);
    }

    free(nodes);
}

Пример #6

0

Показать файл

Файл: jni.cpp Проект: kailu/java-lxml

/**
 * JNI entry point: evaluates a list of XPath expressions against an HTML page.
 *
 * The page is parsed with parseHTML(); for every expression in `xpaths` the
 * matching nodes are obtained through getNodeSet() and the text of each node
 * is collected with getTextInNode().
 *
 * @param env    JNI environment of the calling thread
 * @param jobj   receiver object (unused)
 * @param page   HTML source
 * @param xpaths array of XPath expression strings
 * @return an Object[] whose i-th element is a String[] holding the results
 *         of the i-th expression (empty for a null or non-matching
 *         expression); an empty Object[] when the page could not be parsed;
 *         NULL when a JNI allocation failed (a Java exception is then
 *         pending).
 */
JNIEXPORT jobjectArray JNICALL Java_com_ireeed_XPathApplier_htmlEvalXPaths(JNIEnv *env, jobject jobj, jstring page, jobjectArray xpaths){
    jboolean isCopy;
    const char *data = env->GetStringUTFChars(page,&isCopy);
    if(!data){
	// The characters could not be obtained; strlen(NULL) would crash
	return NULL;
    }

    xmlDocPtr doc = parseHTML(data,strlen(data));
    //inform jvm to release the allocated data, no matter copied is true or false
    env->ReleaseStringUTFChars(page,data);

    std::vector<std::vector<std::string> > ret;

    if(doc){
	const jsize xpathNum = env->GetArrayLength(xpaths);
	ret.reserve(xpathNum);

	for(jsize index = 0; index < xpathNum; index++){

	    jstring expr = (jstring)env->GetObjectArrayElement(xpaths,index);

	    // A null array element yields an empty result instead of a crash
	    const char* xpath = expr ? env->GetStringUTFChars(expr, &isCopy) : NULL;

	    std::vector<std::string> output;
	    if(xpath){
		xmlXPathObjectPtr result = getNodeSet(doc,xpath);
		if(result){
		    // nodesetval is NULL when the expression did not yield a node set
		    xmlNodeSetPtr nodeset = result->nodesetval;
		    const int nodeCount = nodeset ? nodeset->nodeNr : 0;
		    for(int i = 0; i < nodeCount; i++){
			std::string record;
			getTextInNode(nodeset->nodeTab[i],record);

			output.push_back(record);
		    }
		    xmlXPathFreeObject(result);
		}
		//inform jvm to release the allocated data, no matter copied is true or false
		env->ReleaseStringUTFChars(expr,xpath);
	    }
	    ret.push_back(output);

	    // Drop the per-iteration local reference so that long xpath
	    // arrays cannot exhaust the local reference table
	    if(expr){
		env->DeleteLocalRef(expr);
	    }
	}

	xmlFreeDoc(doc);
    }


    /*
      construct std::vector<std::vector<std::string> > back to java type(array of array)
      construct the final result in order according to the order of given xpaths
    */
    jclass objectCls = env->FindClass("java/lang/Object");
    jclass stringCls = env->FindClass("java/lang/String");
    jstring emptyStr = env->NewStringUTF("");
    if(!objectCls || !stringCls || !emptyStr){
	return NULL;
    }

    // Zero-length String[] used as the initial value of every outer slot
    jobjectArray placeholder = env->NewObjectArray(0,stringCls,emptyStr);

    jobjectArray retVal = env->NewObjectArray(static_cast<jsize>(ret.size()),objectCls,placeholder);
    if(!retVal){
	return NULL;
    }
    for(size_t i = 0; i < ret.size(); i++){
	const std::vector<std::string> &elem = ret[i];
	jobjectArray row = env->NewObjectArray(static_cast<jsize>(elem.size()),stringCls,emptyStr);
	if(!row){
	    return NULL;
	}
	for(size_t j = 0; j < elem.size(); j++){
	    //create java string
	    jstring field = env->NewStringUTF(elem[j].c_str());
	    env->SetObjectArrayElement(row,static_cast<jsize>(j),field);
	    if(field){
		env->DeleteLocalRef(field);
	    }
	}
	env->SetObjectArrayElement(retVal,static_cast<jsize>(i),row);
	env->DeleteLocalRef(row);
    }

    return retVal;
}

Пример #7

0

Показать файл

Файл: htmltable.c Проект: Chaduke/bah.mod

/* make_html_label:
 * Parse lp->text as an HTML label for obj, size it and store the result in
 * lp->u.html and lp->dimen.
 * Returns non-zero if there was a problem parsing the HTML; when the parse
 * yields no label at all, the object name is used as a plain text label.
 */
int make_html_label(graph_t *g, textlabel_t * lp, void *obj)
{
    int rv;
    int halfwd, halfht;
    box bb;
    htmllabel_t *lbl;
    htmlenv_t env;
    int kind;

    /* Font defaults inherited from the text label */
    env.finfo.color = lp->fontcolor;
    env.finfo.name = lp->fontname;
    env.finfo.size = lp->fontsize;

    /* Remember the object and the graph it belongs to */
    env.obj = obj;
    kind = agobjkind(obj);
    if (kind == AGGRAPH)
	env.g = ((Agraph_t *) obj)->root;
    else if (kind == AGNODE)
	env.g = ((Agnode_t *) obj)->graph;
    else if (kind == AGEDGE)
	env.g = ((Agedge_t *) obj)->head->graph;

    lbl = parseHTML(lp->text, &rv, GD_charset(env.g));
    if (lbl == NULL) {
	/* No label came back from the parser: fall back to plain text */
	agxbuf namebuf;
	unsigned char storage[SMALLBUF];

	agxbinit(&namebuf, SMALLBUF, storage);
	lp->html = FALSE;
	lp->text = strdup(nameOf(obj, &namebuf));
	size_label(env.g, lp);
	agxbfree(&namebuf);
	return rv;
    }

    /* Size the label and centre its box on the origin */
    if (lbl->kind != HTML_TBL) {
	rv |= size_html_txt(g, lbl->u.txt, &env);
	halfwd = (lbl->u.txt->box.UR.x + 1) / 2;
	halfht = (lbl->u.txt->box.UR.y + 1) / 2;
	bb = boxof(-halfwd, -halfht, halfwd, halfht);
	lbl->u.txt->box = bb;
    } else {
	lbl->u.tbl->data.pencolor = getPenColor(obj);
	rv |= size_html_tbl(g, lbl->u.tbl, NULL, &env);
	halfwd = (lbl->u.tbl->data.box.UR.x + 1) / 2;
	halfht = (lbl->u.tbl->data.box.UR.y + 1) / 2;
	bb = boxof(-halfwd, -halfht, halfwd, halfht);
	pos_html_tbl(lbl->u.tbl, bb, BOTTOM | RIGHT | TOP | LEFT);
    }
    lp->dimen.x = bb.UR.x - bb.LL.x;
    lp->dimen.y = bb.UR.y - bb.LL.y;

    lp->u.html = lbl;

    /* For a table, replace the label text because it may be used for
     * the title and alt fields in image maps.
     */
    if (lbl->kind == HTML_TBL) {
	free (lp->text);
	lp->text = strdup ("<TABLE>");
    }

    return rv;
}

Пример #8

0

Показать файл

Файл: crawler.c Проект: GrantAmann/TinySearchEngine

/* ========================================================================== */
/*
 * Crawler entry point.
 *
 * argv[1] is the seed URL, argv[2] the directory handed to createfile()
 * and argv[3] the maximum crawl depth. The seed page is fetched, written
 * out and (when the depth allows) scanned for links; pages are then popped
 * from the URL list, fetched, written and scanned until the list is empty.
 * Exits with status 1 when argcheck() rejects the arguments or the seed
 * page cannot be fetched.
 */
int main(int argc, char* argv[]) {
  int nextfile = 1;    /* number given to the next file written */
  int seeddepth = 1;   /* depth assigned to links found on the seed page */

  /* validate the command line before touching argv[1..3] */
  if (argcheck(argc, argv) == 1) {
    exit(1);
  }

  char *seedurl = argv[1];
  char *outdir = argv[2];
  int maxdepth = atoi(argv[3]);

  /* hashtable and url list used by the crawl */
  HashTable *seen = initializeHashTable();
  List *queue = initializelist();

  /* init curl */
  curl_global_init(CURL_GLOBAL_ALL);

  /* fetch the seed page; give up when it cannot be retrieved */
  WebPage *seed = createWebPage(seedurl, 0);
  if (!GetWebPage(seed)) {
    printf("The url that you entered was invalid.  Please try again.");
    free(seed->html);
    free(seed);
    exit(1);
  }

  /* write the seed file and record its url in the hashtable */
  createfile(seed, outdir, nextfile);
  nextfile += 1;
  insertHashTable(seen, seed->url);

  /* extract urls from the seed page */
  if (maxdepth > 0) {
    parseHTML(seed, queue, seen, seeddepth);
  }

  /* keep going while there are urls to crawl */
  while (queue->head) {
    WebPage *page = listpop(queue);
    int pagedepth = page->depth;

    /* pages that cannot be fetched are skipped and simply released */
    if (GetWebPage(page)) {
      createfile(page, outdir, nextfile);
      nextfile += 1;
      /* only follow links while below the requested depth */
      if (pagedepth < maxdepth) {
        parseHTML(page, queue, seen, pagedepth + 1);
      }
    }

    free(page->html);
    free(page->url);
    free(page);
    sleep(SLEEPTIME);
  }

  /* release the seed page and the crawl structures, then clean up curl */
  free(seed->html);
  free(seed);
  freeHashTable(seen);
  freelist(queue);
  curl_global_cleanup();
  return 0;
}

Пример #9

0

Показать файл

Файл: HTMLSyntax.cpp Проект: sanyaade-webdev/core-Components

/**
 * Computes completion suggestions for the insertion point on a line.
 *
 * The text of line inLineNumber, truncated at inPos, is lexed with
 * parseHTML(). The tokens are replayed on a private copy of the previous
 * line's syntax state, so the document's own line state is not modified.
 * If the last token seen is an end-tag opener, the most recently opened
 * tag is suggested as the closer.
 *
 * @param inDocument      document being edited
 * @param inLineNumber    line containing the insertion point
 * @param inPos           position on the line at which the text is truncated
 * @param outSuggestions  receives the suggestion, if any
 * @param outStartOffset  not written by this implementation
 * @param inAll           not used by this implementation
 */
void VHTMLSyntax::GetSuggestions( ICodeEditorDocument* inDocument, sLONG inLineNumber, sLONG inPos, ITipInfoArray *outSuggestions, sLONG& outStartOffset, bool inAll )
{
	// Get the text for the line up to the point of insertion, and we'll lex that to see if we can come up
	// with some rational suggestions for the user.
	VString xstr;
	inDocument->GetLine( inLineNumber, xstr );
	xstr.Truncate( inPos );

	// NOTE(review): lexinput is not released in this function -- confirm who owns the
	// buffer returned by CreateUTF8String.
	char *lexinput = CreateUTF8String( xstr );
	struct htmlLexeme *list = parseHTML( lexinput );

	// Scratch line params for tracking state information (the original initialiser
	// assigned the variable to itself: "p = p = new ...")
	VLineSyntaxParams *currentLineParams = new VLineSyntaxParams();
	if (inLineNumber > 0) {
		// We're starting where we left off on the previous line
		currentLineParams->CopyState( static_cast< VLineSyntaxParams * >( inDocument->GetLineSyntaxParams( inLineNumber - 1 ) ) );
	}

	// Given the list of HTML tokens, let's walk over the list and try to make some sense
	// of them.  Walk over the list one token at a time, and see if we can make sense of
	// what we've got.  This is going to be awfully similar to the way we do things in the
	// SetLine method, except that we're not actually updating the line state for the current
	// line.  Instead, we're working on a copy of the existing information.
	//
	// lastTokenProcessed records the kind of the most recent token:
	//   0 = anything else, 1 = tag opener, 2 = end-tag opener, 3 = keyword
	struct htmlLexeme *cur = list;
	int lastTokenProcessed = 0;
	while (cur) {
		if (kKeyword == cur->fStyle) {
			lastTokenProcessed = 3;

			// Keywords a bit trickier than you might think because we need to be sure they're actually part of a
			// tag.  If the user types something like: <b>This table rocks</b>, we only want to highlight the b in the
			// begin and end tag, and not the "table" in the user's text.  To deal with this, we have an "in tag" flag
			// that basically turns keyword highlighting on and off.
			if (currentLineParams->IsProcessingTag()) {
				// If we're processing an opening tag, then we want to push the keyword onto the tag stack.  But if we're
				// processing a closing tag, then we want to pop the last keyword off the tag stack and try to match it up
				// to what we just processed.  If they match, we're golden.  If not, the tags are simply mismatched.
				VString tagName;
				xstr.GetSubString( cur->fOffset + 1, cur->fLength, tagName );

				if (currentLineParams->IsProcessingStartTag()) {
					currentLineParams->PushTag( tagName );

					// Note that we are no longer processing the start of a tag.  This allows us to handle attributes
					// separately from the tag itself.
					currentLineParams->SetIsProcessingStartTag( false );
				} else {
					VString lastTag;
					currentLineParams->PopTag( lastTag );

					if (!lastTag.EqualTo( tagName, false )) {
						// The tags don't match, so we're just going to ignore the issue
						// TODO: do something more sensible here
					}
				}
			}
		} else if (kTagOpen == cur->fStyle || kEndTagOpen == cur->fStyle) {
			lastTokenProcessed = (kTagOpen == cur->fStyle) ? 1 : 2;

			currentLineParams->SetIsProcessingTag( true );
			currentLineParams->SetIsProcessingStartTag( kTagOpen == cur->fStyle );
		} else if (kTagClose == cur->fStyle || kTagSelfClose == cur->fStyle) {
			lastTokenProcessed = 0;

			currentLineParams->SetIsProcessingTag( false );

			// If we just handled a self-closing tag (like <br />), then we want to pop it from the stack
			// TODO: some tags can't have matching pairs, like <br>, so even if it's not self-closing, we want
			// to pop it off the tag stack.  Handle that here
			if (kTagSelfClose == cur->fStyle) {
				VString toss;
				currentLineParams->PopTag( toss );
			}
		} else {
			lastTokenProcessed = 0;
		}

		cur = cur->fNext;
	}

	if (lastTokenProcessed == 1) {
		// We processed a tag opener, but no keyword for the tag.  So let's make a bunch of suggestions!
		// (nothing is suggested for this case yet)
	} else if (lastTokenProcessed == 2) {
		// We processed a tag closer, but no keyword for the tag.  Grab the last opened tag from the list
		// and suggest it as the closer
		VString suggestion;
		currentLineParams->LastTag( suggestion );
		outSuggestions->AddTip( new VCodeEditorTipInfo( inDocument, suggestion, htmlcolorShadow[ keyword_col ] ) );
	}

	delete currentLineParams;
	FreeLexemeList( list );
}

Пример #10

0

Показать файл

Файл: HTMLSyntax.cpp Проект: sanyaade-webdev/core-Components

/**
 * Re-lexes one line of the document and applies syntax colouring to it.
 *
 * The line's VLineSyntaxParams are created on demand, seeded from the
 * previous line's state, and updated while the tokens returned by
 * parseHTML() are walked. Tags opened or closed on this line without a
 * partner on the same line are recorded on the line params. When the
 * line's "ends inside an open comment" state differs from what it was on
 * entry, the following line is processed recursively.
 *
 * @param inDocument    document whose line is being styled
 * @param inLineNumber  index of the line to process
 * @param inLoading     only forwarded to the recursive call
 */
void VHTMLSyntax::SetLine( ICodeEditorDocument *inDocument, sLONG inLineNumber, bool inLoading )
{
#if 0
	// Disabled alternative implementation based on HTMLParser
	VString source;
	inDocument->GetLine( inLineNumber, source );
	
	HTMLParser parser;
	HTMLParser::State *state = NULL;
	HTMLParser::State *prevLineState = NULL;
	if (inLineNumber > 0)	prevLineState = GetStateForLine( inDocument, inLineNumber - 1 );
	ParsingCookie *cookie = new ParsingCookie( inDocument, inLineNumber );
	parser.Parse( source, prevLineState, &state, this, (const void *)cookie );
	SetStateForLine( inDocument, inLineNumber, state );
	cookie->Release();
#else
	// Get the params for the current line so that we can set them up properly
	VLineSyntaxParams *currentLineParams = static_cast< VLineSyntaxParams * >( inDocument->GetLineSyntaxParams( inLineNumber ) );
	if (!currentLineParams) {
		currentLineParams = new VLineSyntaxParams();
		inDocument->AssignLineSyntaxParams( inLineNumber, currentLineParams );
	}
	// Remember the comment state on entry; it is compared at the end to decide
	// whether the following lines need to be re-coloured
	bool previousOpenCommentState = currentLineParams->IsOpenComment();

	// We also want the params for the preceding line, in case we're the continuation of
	// a comment.
	VLineSyntaxParams *previousLineParams = NULL;
	if (inLineNumber > 0) {
		previousLineParams = static_cast< VLineSyntaxParams * >( inDocument->GetLineSyntaxParams( inLineNumber - 1 ) );
	}

	VString xstr;
	inDocument->GetLine(inLineNumber,xstr);
	inDocument->SetLineStyle(inLineNumber,0,xstr.GetLength(),0);		//initiate the line

	// NOTE(review): lexinput is not released in this function -- confirm who owns the
	// buffer returned by CreateUTF8String.
	char *lexinput = CreateUTF8String( xstr );
	struct htmlLexeme *list = parseHTML( lexinput );

	// If we used to be in comment continuation mode, the assumption is that we're still in
	// comment continuation mode.  We'll switch this off if the comment ends though.
	// (previousLineParams is NULL for the first line.)
	currentLineParams->CopyState( previousLineParams );

	// We are going to keep track of which open and close tags we've seen on the line.  This allows
	// us to determine which unmatched open and close tags exist so we can associate that data with
	// the line.  As we find open tags, we'll push them onto the open tag list.  As we find close tags,
	// we will scan the open tag list and *remove* any that match.  If there's no match, then we'll add
	// the tag to the close list.
	std::vector< VString > openList, closeList;

	// Given the list of HTML tokens, let's walk over the list and try to make some sense
	// of them.  Walk over the list one token at a time, and see if we can make sense of
	// what we've got.
	struct htmlLexeme *cur = list;
	while (cur) {
		// There are only three types of comments we need to worry about.  Full comments,
		// open comments and close comments.  We'll get a token representing any one of the
		// three.  However, we need to pay special attention to multi-line comments, since
		// they won't lex out entirely correct.  If the previous line was part of an open
		// comment, then we want to keep walking over the tokens, marking them as part of
		// the comment, until we run out of tokens, or we find a kCommentClose token.
		if (currentLineParams->IsOpenComment()) {
			if (kCommentClose == cur->fStyle) {
				// We found the end of the comment, so we can highlight it appropriately,
				// and go back to our regularly scheduled lexing
				inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ comment_col ] );

				// We're also done being a part of the comment continuation train
				currentLineParams->SetIsOpenComment( false );
			} else {
				// This is just another part of the comment
				inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ comment_col ] );
			}
			// Advance
			cur = cur->fNext;
			continue;
		}
		if (kCompleteComment == cur->fStyle) {
			// A complete comment is the easiest of the three cases.  Just highlight it
			inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ comment_col ] );
		} else if (kCommentOpen == cur->fStyle) {
			// An open comment must be the last token in the list
			xbox_assert( !cur->fNext );

			// We want to highlight from here to the end of the line
			inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ comment_col ] );
			// We also want to flag that this line ends with an open comment
			currentLineParams->SetIsOpenComment( true );
		} else if (kCommentClose == cur->fStyle) {
			// If we got a close comment token, then something's off.  That means the user put in a close comment
			// token, but they never opened it.  It is styled with the scommentend_col colour rather than as a comment.
			inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ scommentend_col ] );
		} else if (kString == cur->fStyle) {
			inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ string_col ] );
		} else if (kKeyword == cur->fStyle) {
			// Keywords a bit trickier than you might think because we need to be sure they're actually part of a
			// tag.  If the user types something like: <b>This table rocks</b>, we only want to highlight the b in the
			// begin and end tag, and not the "table" in the user's text.  To deal with this, we have an "in tag" flag
			// that basically turns keyword highlighting on and off.
			if (currentLineParams->IsProcessingTag()) {
				inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ keyword_col ] );

				// If we're processing an opening tag, then we want to push the keyword onto the tag stack.  But if we're
				// processing a closing tag, then we want to pop the last keyword off the tag stack and try to match it up
				// to what we just processed.  If they match, we're golden.  If not, the tags are simply mismatched.
				VString tagName;
				xstr.GetSubString( cur->fOffset + 1, cur->fLength, tagName );

				if (currentLineParams->IsProcessingStartTag()) {
					// Tags for which IsTagWithoutClose() is true are never reported as unmatched opens
					if (!IsTagWithoutClose( tagName )) {
						openList.push_back( tagName );
					}
					currentLineParams->PushTag( tagName );

					// Note that we are no longer processing the start of a tag.  This allows us to handle attributes
					// separately from the tag itself.
					currentLineParams->SetIsProcessingStartTag( false );
				} else {
					// Check to see if this closed tag is on the open list.  If it is, we want to remove it from the
					// list.  Otherwise, we want to add it to the close list.  Only the first match is removed: the
					// loop stops as soon as bAddToClose becomes false.
					bool bAddToClose = true;
					for (std::vector< VString >::iterator iter = openList.begin(); bAddToClose && iter != openList.end();) {
						if (tagName.EqualTo( *iter, false )) {
							iter = openList.erase( iter );
							bAddToClose = false;
						} else {
							++iter;
						}
					}
					if (bAddToClose)	closeList.push_back( tagName );

					VString lastTag;
					currentLineParams->PopTag( lastTag );

					if (!lastTag.EqualTo( tagName, false )) {
						// The tags don't match, so we're just going to ignore the issue
						// TODO: do something more sensible here
					}
				}
			}
		} else if (kNumber == cur->fStyle) {
			inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ allnum_col ] );
		} else if (kTagOpen == cur->fStyle || kEndTagOpen == cur->fStyle) {
			// Entering a tag: remember whether it is a start tag or an end tag
			currentLineParams->SetIsProcessingTag( true );
			currentLineParams->SetIsProcessingStartTag( kTagOpen == cur->fStyle );
		} else if (kTagClose == cur->fStyle || kTagSelfClose == cur->fStyle) {
			currentLineParams->SetIsProcessingTag( false );

			// If we just handled a self-closing tag (like <br />), or a tag for which IsTagWithoutClose()
			// is true, then we want to pop it from the stack
			VString lastTag;
			currentLineParams->LastTag( lastTag );
			if (kTagSelfClose == cur->fStyle || IsTagWithoutClose( lastTag )) {
				VString toss;
				currentLineParams->PopTag( toss );

				// We also do not want to add it to our list of open tags for the line, since it's self-closed.
				// The loop exits right after the erase, so the iterator is not advanced past it.
				for (std::vector< VString >::iterator iter = openList.begin(); iter != openList.end(); ++iter) {
					if (lastTag.EqualTo( *iter, false )) {
						iter = openList.erase( iter );
						break;
					}
				}
			}
		}

		cur = cur->fNext;
	}
	FreeLexemeList( list );

	// Now that we have an open and a close list, we want to associate them with the line.
	for (std::vector< VString >::iterator iter = openList.begin(); iter != openList.end(); ++iter) {
		currentLineParams->AddUnmatchedOpenTag( *iter );
	}
	for (std::vector< VString >::iterator iter = closeList.begin(); iter != closeList.end(); ++iter) {
		currentLineParams->AddUnmatchedCloseTag( *iter );
	}

	// There are two cases we really need to care about.  If the line now ends in
	// an open comment (and didn't used to), we want to colorize down the document.
	// Also, if the line no longer ends in an open comment (but used to), we want to
	// colorize down the document.  In either case, we want to keep colorizing subsequent
	// lines until the comment is ended or the end of the document is reached.
	if ((!previousOpenCommentState && currentLineParams->IsOpenComment() ||		// Now ends with open comment, didn't used to
		previousOpenCommentState && !currentLineParams->IsOpenComment()) &&		// Used to end with an open comment, but no longer does
		inLineNumber + 1 < inDocument->GetNbLines()) {
		SetLine( inDocument, inLineNumber + 1, inLoading );
	}
#endif // old code
}

Пример #11

0

Показать файл

Файл: htmltable.c Проект: TidyHuang/vizgems

/* make_html_label:
 * Parse lp->text as an HTML label for obj, size it and store the result in
 * lp->u.html and lp->dimen.
 * Returns non-zero if there was a problem parsing the HTML; when the parse
 * yields no label at all, the object name (converted to UTF-8) is used as
 * a simple text label.
 */
int make_html_label(void *obj, textlabel_t * lp)
{
    int rv;
    double halfwd, halfht;
    boxf bb;
    graph_t *root;
    htmllabel_t *lbl;
    htmlenv_t env;
    char *converted;

    /* Remember the object and the graph it belongs to */
    env.obj = obj;
    switch (agobjkind(obj)) {
#ifdef WITH_CGRAPH
    case AGRAPH:
#else
    case AGGRAPH:
#endif
        env.g = ((Agraph_t *) obj)->root;
        break;
    case AGNODE:
        env.g = agraphof(((Agnode_t *) obj));
        break;
    case AGEDGE:
        env.g = agraphof(aghead (((Agedge_t *) obj)));
        break;
    }
    root = env.g->root;

    /* Font defaults inherited from the text label */
    env.finfo.color = lp->fontcolor;
    env.finfo.name = lp->fontname;
    env.finfo.size = lp->fontsize;

    lbl = parseHTML(lp->text, &rv, GD_charset(env.g));
    if (lbl == NULL) {
	/* No label came back from the parser: fall back to simple text */
	agxbuf namebuf;
	unsigned char storage[SMALLBUF];

	agxbinit(&namebuf, SMALLBUF, storage);
	lp->html = FALSE;
	lp->text = strdup(nameOf(obj, &namebuf));
	/* Latin-1 is converted directly; every other charset is handled as UTF-8 */
	if (lp->charset == CHAR_LATIN1)
	    converted = latin1ToUTF8(lp->text);
	else
	    converted = htmlEntityUTF8(lp->text, env.g);
	free(lp->text);
	lp->text = converted;
	make_simple_label(root, lp);
	agxbfree(&namebuf);
	return rv;
    }

    /* Size the label and centre its box on the origin */
    if (lbl->kind != HTML_TBL) {
	rv |= size_html_txt(root, lbl->u.txt, &env);
	halfwd = (lbl->u.txt->box.UR.x + 1) / 2;
	halfht = (lbl->u.txt->box.UR.y + 1) / 2;
	bb = boxfof(-halfwd, -halfht, halfwd, halfht);
	lbl->u.txt->box = bb;
    } else {
	/* Use the object's pen colour only when the table has none of its own */
	if (! lbl->u.tbl->data.pencolor && getPenColor(obj))
	    lbl->u.tbl->data.pencolor = strdup(getPenColor(obj));
	rv |= size_html_tbl(root, lbl->u.tbl, NULL, &env);
	halfwd = (lbl->u.tbl->data.box.UR.x + 1) / 2;
	halfht = (lbl->u.tbl->data.box.UR.y + 1) / 2;
	bb = boxfof(-halfwd, -halfht, halfwd, halfht);
	pos_html_tbl(lbl->u.tbl, bb, BOTTOM | RIGHT | TOP | LEFT);
    }
    lp->dimen.x = bb.UR.x - bb.LL.x;
    lp->dimen.y = bb.UR.y - bb.LL.y;

    lp->u.html = lbl;

    /* For a table, replace the label text because it may be used for
     * the title and alt fields in image maps.
     */
    if (lbl->kind == HTML_TBL) {
	free (lp->text);
	lp->text = strdup ("<TABLE>");
    }

    return rv;
}

Пример #12

0

Показать файл

Файл: indexer.c Проект: somebodyschelsea/search-engine

/*
 * Indexer entry point.
 *
 * Usage: <program> target_dir output_file [input_file rewritten_file]
 *
 * Every regular file in target_dir is read, split into words with
 * parseHTML(), and each word is added to the inverted index under the
 * document id obtained from the file name with atoi(). The index is then
 * written to output_file. With the two extra arguments (testing mode) the
 * index is additionally read back from input_file and re-written to
 * rewritten_file.
 *
 * Returns 0 on success and 1 on a usage or directory error.
 */
int main(int argc, char *argv[])
{
// size in bytes of the scratch buffer that receives each word
	const size_t word_buf_size = 500*sizeof(char);

// INPUT VARIABLES
	char* program;
	char* target_dir;
	char* output_file_name;
	char* input_file_name = NULL;
	char* rewritten_file_name = NULL;

// determines which mode the program is running in
	int indexer_test_flag;

// overall data structure
	INVERTED_INDEX* index;

// these variables handle the scandir results and pulling information from files
	int numfiles = 0;
	struct dirent **files;
	char* file_name;
	char* file_contents;

// this variable is used for parsing the HTML
	int file_pos;

// variables used for WordNode (word == key) and DocumentNode (doc_id)
	char* word;
	int doc_id;

	indexer_test_flag = 0; // default is basic functionality

	program = argv[0];

// if incorrect number of arguments
	if(argc != 3 && argc != 5)
	{
		fprintf(stderr, "%s: The indexer requires either 2 (a target directory and output file name) or 4 (a target directory, output file name, input file name, and a rewritten file name\n", program);

		return 1;
	}

	target_dir = argv[1];
	output_file_name = argv[2];

// if 5 arguments --> TESTING MODE
	if(argc == 5)
	{
		indexer_test_flag = 1;
		input_file_name = argv[3];
		rewritten_file_name = argv[4];
	}

// if the target directory doesn't exist
	if(!directoryExists(target_dir))
	{
		fprintf(stderr, "%s: Invalid target directory %s\n", program, target_dir);
		return 1;
	}

	numfiles = getFileList(target_dir, &files);

// if there are no files in the target directory
	if(numfiles <= 0)
	{
		// the original format string ended in 'n instead of a newline
		fprintf(stderr, "%s: Error with target directory %s\n", program, target_dir);
		return 1;
	}

// the file names in "files" are opened relative to the current directory,
// so failing to enter target_dir must not go unnoticed
	if(chdir(target_dir) != 0)
	{
		fprintf(stderr, "%s: Unable to enter target directory %s\n", program, target_dir);
		return 1;
	}

	index = initializeDict();

// this for loop goes through each file in "files", pulls each word out of the HTML, and updates the index data structure
	for(int i=0; i < numfiles; i++)
	{
		file_name = files[i]->d_name;

// if it's a regular file (to avoid . and .. files)
		if(regularFile(file_name))
		{
			file_contents = readFile(file_name);
			file_pos = 0;
			doc_id = atoi(file_name);

// just in case a 404 wasn't caught by the crawler
			if(file_contents != NULL)
			{
				word = malloc(word_buf_size);
				MALLOC_CHECK(word);
				BZERO(word, word_buf_size);

// parseHTML returns the index in file_contents where it stopped parsing (-1 when done), while writing the next word into "word"
				while((file_pos = parseHTML(file_contents, word, file_pos)) != -1)
				{
					updateIndex(word, doc_id, index);

					// the buffer was released right after this call before, so
					// updateIndex cannot keep the pointer: clear and reuse it
					// instead of freeing and re-allocating for every word
					BZERO(word, word_buf_size);
				}

				free(word);
			}

			free(file_contents);
		}

		free(files[i]);
	}

	free(files);

// outputs to a file
	saveFile(index, output_file_name);
	cleanIndex(index);

// if it's in testing mode
	if(indexer_test_flag)
	{
		INVERTED_INDEX* newindex;
		newindex = readIndex(input_file_name);
		saveFile(newindex, rewritten_file_name);
		cleanIndex(newindex);
	}

	return 0;
}

Пример #13

0

Показать файл

Файл: videcnv.cpp Проект: OS2World/DEV-CPLUSPLUS-UTIL-V_portable_C--_GUI_Framework

// ======================>>> vTextEditor::paintLine <<<=====================
  void vedTextEditor::paintLine(char* linout, int lineStart,
        int hiStart, int hiLast, long lineNum)
  {
    // Paint one line with per-character attributes (syntax highlighting).
    //   linout:          the text to draw, tabs already expanded
    //   lineStart:       first column to draw (horizontal scrolling)
    //   hiStart, hiLast: columns [hiStart, hiLast) get the highlight attribute
    //   lineNum:         the real number of this line in the buffer
    // Attributes are layered in this order: normal colour, syntax colours,
    // selection highlight, breakpoint marker, current-PC marker.

    ChrAttr attrs[MAX_LINE+1];	// one attribute per column
    int wasComment = 0;

    const int lineLength = strlen(linout);
    if (lineLength <= 0)             // nothing to draw
        return;

    // Start with every column in the normal colour
    for (int col = 0 ; col < MAX_LINE+1 ; ++col)
        attrs[col] = stdColor[curColors];

    // Let the parser matching the file type assign syntax colours
    if (GetFileType() == CPP)
      {
        wasComment = parseC(linout,attrs,lineNum,CPP);
      }
    else if (GetFileType() == Java)
      {
        wasComment = parseC(linout,attrs,lineNum,Java);
      }
    else if (GetFileType() == Perl)
      {
        wasComment = parseC(linout,attrs,lineNum,Perl);
      }
    else if (GetFileType() == Fortran)
      {
        wasComment = parseFortran(linout,attrs,lineNum);
      }
    else if (GetFileType() == HTML)
      {
        wasComment = parseHTML(linout,attrs,lineNum);
      }
    else if (GetFileType() == TeX)
      {
        wasComment = parseTeX(linout,attrs,lineNum);
      }
    else if (GetFileType() == gccError || GetFileType() == bccError)
      {
        // Compiler output: colour the rest of the line according to the
        // first non-blank character
        int pos = 0;
        while (isSpace(linout[pos]))
            ++pos;

        const char lead = linout[pos];
        if (lead == '>' || lead == '+')
          {
            if (lead == '+')
                attrs[pos++] = dirColor[curColors];
            for ( ; linout[pos] ; ++pos)
                attrs[pos] = keyColor[curColors];
          }
        else if (lead == '!' || lead == '*')
          {
            if (lead == '*')
                attrs[pos++] = dirColor[curColors];
            for ( ; linout[pos] ; ++pos)
                attrs[pos] = constColor[curColors];
          }
      }

    // The highlighted range overrides the syntax colours
    for (int col = 0 ; linout[col] != 0 ; ++col)
      {
        if (col < hiStart || col >= hiLast)
            continue;
        attrs[col] = getHighlight();
      }

    // A breakpoint on this line recolours the whole line
    if ( ((videApp*)theApp)->isBP(((VCmdWindow*)_parent)->GetFileName(),
		lineNum))
      {
        int col = 0;
        while (linout[col] != 0)
          {
            attrs[col] = ChBlackOnBG | ChYellow;
            ++col;
          }
      }

    // The current-PC line is painted last, so it wins over the breakpoint colour
    if ( (((videApp*)theApp)->getBreakPoints())->
		checkIfCurPC(((VCmdWindow*)_parent)->GetFileName(),
		lineNum))
      {
        int col = 0;
        while (linout[col] != 0)
          {
            attrs[col] = ChBlackOnBG | ChRed;
            ++col;
          }
      }

    // V version 1.23: draw text and attributes in a single call
    DrawAttrsText((const char*)&linout[lineStart], (const ChrAttr*)&attrs[lineStart]);

    // Previous, slower per-character approach kept for reference:
    //    for (int ixx = lineStart ; linout[ixx] != 0 ; ++ixx)
    //	DrawChar(linout[ixx],attrs[ixx]);

  }