int main(int argc, char *argv[]) { FILE *fh = fopen(argv[1], "r"); int fsz = 0; char *data; fseek(fh, 0L, SEEK_END); fsz = ftell(fh); data = (char *)malloc(sizeof(char)*fsz+1); fseek(fh, 0L, SEEK_SET); fread(data, fsz, 1, fh); data[fsz] = '\0'; fclose(fh); page *dom = parseHTML(data); dom->curr = dom->il; for(;dom->curr != NULL; dom->curr = dom->curr->nitem) { if(dom->curr->type >= 0 && dom->curr->type < elements) { printf("%s\n", element[dom->curr->type]); if(dom->curr->a != NULL) { if(dom->curr->a->innerhtml != NULL) printf("\tINNERHTML:%s\n", dom->curr->a->innerhtml); if(dom->curr->a->innerhtml != NULL) printf("\tCLASS:%s\n", dom->curr->a->class); } printf("\n"); } }
/*
 * readFile: initialise `tget`, fill its buffer when the target is a
 * directory or a regular file, and pass it to parseHTML().
 *
 * The fill is bracketed by rpmswEnter()/rpmswExit() on tget->gop, and
 * tgetFini() is always called before returning.
 *
 * Returns the result of parseHTML(), a negative tgetFill() result, or -1
 * when the target is neither a directory nor a regular file.
 */
static int readFile(rpmtget tget)
{
    int rc = -1;    /* stays -1 for unsupported file types */
    int xx;         /* return values deliberately not inspected */

    xx = tgetInit(tget, 8 * BUFSIZ);

    xx = rpmswEnter(tget->gop, 0);
    /* Directories and regular files are filled the same way. */
    if (S_ISDIR(tget->sb.st_mode) || S_ISREG(tget->sb.st_mode))
        rc = tgetFill(tget);
    xx = rpmswExit(tget->gop, tget->nbuf);

    /* Only parse when the fill succeeded. */
    if (rc >= 0)
        rc = parseHTML(tget);

    xx = tgetFini(tget);
    return rc;
}
void Browser::loadHTML(string filename) { string str; string allText = ""; ifstream infile; infile.open(filename.c_str()); if( !infile.is_open() ) { cout << "ERROR Loading HTML file!\n"; return; } while(!infile.eof()) // To get you all the lines. { getline(infile,str); // Saves the line in STRING. allText += str + "\n"; } infile.close(); // add the current page as the 0-th link links[0] = filename; // ensure pages is initialized in case parse fails pages[0] = ""; parseHTML(allText); displayText = pages[0]; }
/**
 * Build a page from its URL and the raw HTTP response text.
 *
 * The URL is stored in url_; the response is passed through
 * HTML::decode_entities() and then HTML::single_blank() before being handed
 * to parseHTML().
 */
Page::Page(string url, string httpResponse)
{
    url_ = url;

    // Charset detection/conversion is currently disabled:
    //   const char* text = httpResponse.c_str();
    //   if( HTML::detect_utf8(text, httpResponse.size()) == false ){
    //       CharsetConverter cc("8859_1", "UTF8");
    //       httpResponse = string(cc.convert(text));
    //   }

    const string decoded = HTML::decode_entities(httpResponse);
    const string normalized = HTML::single_blank(decoded);
    parseHTML(normalized);
}
void GBparseHTML(const char *data, const size_t lendata, GB_ARRAY *array) { size_t nodeCount; size_t i = 0; Node **nodes = parseHTML(data, lendata, &nodeCount); GB.Array.New(array, GB.FindClass("XmlNode"), nodeCount); for(i = 0; i < nodeCount; ++i) { *(reinterpret_cast<void **>((GB.Array.Get(*array, i)))) = XML.XMLNode_GetGBObject(nodes[i]); GB.Ref(nodes[i]->GBObject); } free(nodes); }
/**
 * JNI entry point: parse `page` as HTML and evaluate every XPath expression
 * in `xpaths` against it.
 *
 * @param env     JNI environment
 * @param jobj    receiver object (unused)
 * @param page    HTML document text
 * @param xpaths  array of XPath expression strings
 * @return an Object[] with one String[] per XPath, in the order given, each
 *         holding the text collected by getTextInNode() for every matched
 *         node. The array is empty when the document cannot be parsed.
 *         NULL is returned (with the JVM exception left pending) when the
 *         page characters or the result array cannot be obtained.
 */
JNIEXPORT jobjectArray JNICALL Java_com_ireeed_XPathApplier_htmlEvalXPaths(JNIEnv *env, jobject jobj, jstring page, jobjectArray xpaths)
{
    // GetStringUTFChars returns NULL on failure; strlen(NULL) would crash.
    const char *data = env->GetStringUTFChars(page, NULL);
    if (data == NULL)
        return NULL;

    xmlDocPtr doc = parseHTML(data, strlen(data));
    // inform jvm to release the allocated data, no matter copied is true or false
    env->ReleaseStringUTFChars(page, data);

    std::vector<std::vector<std::string> > ret;

    if (doc) {
        const jsize xpathNum = env->GetArrayLength(xpaths);
        for (jsize index = 0; index < xpathNum; index++) {
            std::vector<std::string> output;

            jstring xpathStr = (jstring)env->GetObjectArrayElement(xpaths, index);
            // A null array element yields an empty result row.
            const char *xpath = xpathStr ? env->GetStringUTFChars(xpathStr, NULL) : NULL;

            if (xpath) {
                xmlXPathObjectPtr result = getNodeSet(doc, xpath);
                if (result) {
                    // nodesetval may be NULL; treat that as "no nodes".
                    xmlNodeSetPtr nodeset = result->nodesetval;
                    const int nodeCount = nodeset ? nodeset->nodeNr : 0;
                    for (int i = 0; i < nodeCount; i++) {
                        std::string record;
                        getTextInNode(nodeset->nodeTab[i], record);
                        output.push_back(record);
                    }
                    xmlXPathFreeObject(result);
                }
                // inform jvm to release the allocated data, no matter copied is true or false
                env->ReleaseStringUTFChars(xpathStr, xpath);
            }

            // Drop the per-iteration local reference so long xpath arrays
            // cannot exhaust the local reference table.
            if (xpathStr)
                env->DeleteLocalRef(xpathStr);

            ret.push_back(output);
        }
        xmlFreeDoc(doc);
    }

    /*
       construct std::vector<std::vector<std::string> > back to java type(array of array)
       construct the final result in order according to the order of given xpaths
    */
    jclass objectCls = env->FindClass("java/lang/Object");
    jclass stringCls = env->FindClass("java/lang/String");
    jstring emptyStr = env->NewStringUTF("");
    jobjectArray emptyRow = env->NewObjectArray(0, stringCls, emptyStr);

    jobjectArray retVal = env->NewObjectArray(static_cast<jsize>(ret.size()), objectCls, emptyRow);
    if (retVal == NULL)
        return NULL;

    for (std::size_t i = 0; i < ret.size(); i++) {
        const std::vector<std::string> &elem = ret[i];

        jobjectArray row = env->NewObjectArray(static_cast<jsize>(elem.size()), stringCls, emptyStr);
        if (row == NULL)
            return NULL;

        for (std::size_t j = 0; j < elem.size(); j++) {
            // create java string
            jstring field = env->NewStringUTF(elem[j].c_str());
            env->SetObjectArrayElement(row, static_cast<jsize>(j), field);
            env->DeleteLocalRef(field);
        }

        env->SetObjectArrayElement(retVal, static_cast<jsize>(i), row);
        env->DeleteLocalRef(row);
    }

    return retVal;
}
/* make_html_label:
 * Parse lp->text as an HTML-like label for obj, size and position the
 * result, and store it in lp->u.html with its dimensions in lp->dimen.
 * Return non-zero if problem parsing HTML. In this case, use object name.
 */
int make_html_label(graph_t *g, textlabel_t * lp, void *obj)
{
    int rv;
    int halfWd, halfHt;
    box bounds;
    htmllabel_t *lbl;
    htmlenv_t env;

    /* Set up the environment: owning object, its graph and the font. */
    env.obj = obj;
    switch (agobjkind(obj)) {
    case AGGRAPH:
        env.g = ((Agraph_t *) obj)->root;
        break;
    case AGNODE:
        env.g = ((Agnode_t *) obj)->graph;
        break;
    case AGEDGE:
        env.g = ((Agedge_t *) obj)->head->graph;
        break;
    }
    env.finfo.size = lp->fontsize;
    env.finfo.name = lp->fontname;
    env.finfo.color = lp->fontcolor;

    lbl = parseHTML(lp->text, &rv, GD_charset(env.g));
    if (!lbl) {
        /* Parse of label failed; revert to simple text label */
        agxbuf xb;
        unsigned char buf[SMALLBUF];

        agxbinit(&xb, SMALLBUF, buf);
        lp->html = FALSE;
        lp->text = strdup(nameOf(obj, &xb));
        size_label(env.g, lp);
        agxbfree(&xb);
        return rv;
    }

    /* Size the label and centre its box on the origin. */
    if (lbl->kind == HTML_TBL) {
        lbl->u.tbl->data.pencolor = getPenColor(obj);
        rv |= size_html_tbl(g, lbl->u.tbl, NULL, &env);
        halfWd = (lbl->u.tbl->data.box.UR.x + 1) / 2;
        halfHt = (lbl->u.tbl->data.box.UR.y + 1) / 2;
        bounds = boxof(-halfWd, -halfHt, halfWd, halfHt);
        pos_html_tbl(lbl->u.tbl, bounds, BOTTOM | RIGHT | TOP | LEFT);
    } else {
        rv |= size_html_txt(g, lbl->u.txt, &env);
        halfWd = (lbl->u.txt->box.UR.x + 1) / 2;
        halfHt = (lbl->u.txt->box.UR.y + 1) / 2;
        bounds = boxof(-halfWd, -halfHt, halfWd, halfHt);
        lbl->u.txt->box = bounds;
    }
    lp->dimen.x = bounds.UR.x - bounds.LL.x;
    lp->dimen.y = bounds.UR.y - bounds.LL.y;

    lp->u.html = lbl;

    /* If the label is a table, replace label text because this may
     * be used for the title and alt fields in image maps.
     */
    if (lbl->kind == HTML_TBL) {
        free(lp->text);
        lp->text = strdup("<TABLE>");
    }

    return rv;
}
/* ========================================================================== */ int main(int argc, char* argv[]) { int filenum=1; int initdepth=1; // check command line arguments if (argcheck(argc,argv) == 1){ exit(1); } char *starturl = argv[1]; char *targetdir = argv[2]; int depth = atoi(argv[3]); //initialize our hashtables and url list HashTable *myhashtable; List *mylist; myhashtable=initializeHashTable(); mylist=initializelist(); // init curl curl_global_init(CURL_GLOBAL_ALL); // setup seed page WebPage *startpage = createWebPage(starturl,0); // get seed webpage. If the url is invalid, quit and send an error message. if ( GetWebPage(startpage) == 0 ){ printf("The url that you entered was invalid. Please try again."); free(startpage->html); free(startpage); exit(1); } // write seed file createfile(startpage,targetdir,filenum); filenum++; // add seed page to hashtable insertHashTable(myhashtable,startpage->url); // extract urls from seed page if( depth > 0 ){ parseHTML(startpage,mylist,myhashtable,initdepth); } // while there are urls to crawl while ( mylist->head != NULL ){ // get next url from list WebPage *nextpage = listpop(mylist); int currdepth = nextpage->depth; // get webpage for url // If the url is invalid, quit and free the memory if (GetWebPage(nextpage) != 0 ){ createfile(nextpage,targetdir,filenum); filenum++; // extract urls from webpage if ( currdepth < depth ){ parseHTML(nextpage, mylist, myhashtable, currdepth+1); } } free(nextpage->html); free(nextpage->url); free(nextpage); sleep(SLEEPTIME); } // cleanup curl free(startpage->html); free(startpage); freeHashTable(myhashtable); freelist(mylist); curl_global_cleanup(); return 0; }
void VHTMLSyntax::GetSuggestions( ICodeEditorDocument* inDocument, sLONG inLineNumber, sLONG inPos, ITipInfoArray *outSuggestions, sLONG& outStartOffset, bool inAll ) { // Get the text for the line up to the point of insertion, and we'll lex that to see if we can come up // with some rational suggestions for the user. VString xstr; inDocument->GetLine( inLineNumber, xstr ); xstr.Truncate( inPos ); char *lexinput = CreateUTF8String( xstr ); struct htmlLexeme *list = parseHTML( lexinput ); // Gin up some line params for tracking state information VLineSyntaxParams *currentLineParams = currentLineParams = new VLineSyntaxParams(); if (inLineNumber > 0) { // We're starting where we left off on the previous line currentLineParams->CopyState( static_cast< VLineSyntaxParams * >( inDocument->GetLineSyntaxParams( inLineNumber - 1 ) ) ); } // Given the list of HTML tokens, let's walk over the list and try to make some sense // of them. Walk over the list one token at a time, and see if we can make sense of // what we've got. This is going to be awfully similar to the way we do things in the // SetLine method, except that we're not actually updating the line state for the current // line. Instead, we're working on a copy of the existing information. struct htmlLexeme *cur = list; int lastTokenProcessed = 0; while (cur) { if (kKeyword == cur->fStyle) { lastTokenProcessed = 3; // Keywords a bit trickier than you might think because we need to be sure they're actually part of a // tag. If the user types something like: <b>This table rocks</b>, we only want to highlight the b in the // begin and end tag, and not the "table" in the user's text. To deal with this, we have an "in tag" flag // that basically turns keyword highlighting on and off. if (currentLineParams->IsProcessingTag()) { // If we're processing an opening tag, then we want to push the keyword onto the tag stack. 
But if we're // processing a closing tag, then we want to pop the last keyword off the tag stack and try to match it up // to what we just processed. If they match, we're golden. If not, we just assume the user's mismatching // their tags because they're an idiot. VString tagName; xstr.GetSubString( cur->fOffset + 1, cur->fLength, tagName ); if (currentLineParams->IsProcessingStartTag()) { currentLineParams->PushTag( tagName ); // Note that we are no longer processing the start of a tag. This allows us to handle attributes // separately from the tag itself. currentLineParams->SetIsProcessingStartTag( false ); } else { VString lastTag; currentLineParams->PopTag( lastTag ); if (!lastTag.EqualTo( tagName, false )) { // The tags don't match, so we're just going to ignore the issue // TODO: do something more sensible here } } } } else if (kTagOpen == cur->fStyle || kEndTagOpen == cur->fStyle) { lastTokenProcessed = (kTagOpen == cur->fStyle) ? 1 : 2; currentLineParams->SetIsProcessingTag( true ); currentLineParams->SetIsProcessingStartTag( kTagOpen == cur->fStyle ); } else if (kTagClose == cur->fStyle || kTagSelfClose == cur->fStyle) { lastTokenProcessed = 0; currentLineParams->SetIsProcessingTag( false ); // If we just handled a self-closing tag (like <br />), then we want to pop it from the stack // TODO: some tags can't have matching pairs, like <br>, so even if it's not self-closing, we want // to pop it off the tag stack. Handle that here if (kTagSelfClose == cur->fStyle) { VString toss; currentLineParams->PopTag( toss ); } } else { lastTokenProcessed = 0; } cur = cur->fNext; } if (lastTokenProcessed == 1) { // We processed a tag opener, but no keyword for the tag. So let's make a bunch of suggestions! } else if (lastTokenProcessed == 2) { // We processed a tag closer, but no keyword for the tag. 
Grab the last opened tag from the list // and suggest it as the closer VString suggestion; currentLineParams->LastTag( suggestion ); outSuggestions->AddTip( new VCodeEditorTipInfo( inDocument, suggestion, htmlcolorShadow[ keyword_col ] ) ); } delete currentLineParams; FreeLexemeList( list ); }
void VHTMLSyntax::SetLine( ICodeEditorDocument *inDocument, sLONG inLineNumber, bool inLoading ) { #if 0 VString source; inDocument->GetLine( inLineNumber, source ); HTMLParser parser; HTMLParser::State *state = NULL; HTMLParser::State *prevLineState = NULL; if (inLineNumber > 0) prevLineState = GetStateForLine( inDocument, inLineNumber - 1 ); ParsingCookie *cookie = new ParsingCookie( inDocument, inLineNumber ); parser.Parse( source, prevLineState, &state, this, (const void *)cookie ); SetStateForLine( inDocument, inLineNumber, state ); cookie->Release(); #else // Get the params for the current line so that we can set them up properly VLineSyntaxParams *currentLineParams = static_cast< VLineSyntaxParams * >( inDocument->GetLineSyntaxParams( inLineNumber ) ); if (!currentLineParams) { currentLineParams = new VLineSyntaxParams(); inDocument->AssignLineSyntaxParams( inLineNumber, currentLineParams ); } bool previousOpenCommentState = currentLineParams->IsOpenComment(); // We also want the params for the preceeding line, in case we're the continuation of // a comment. VLineSyntaxParams *previousLineParams = NULL; if (inLineNumber > 0) { previousLineParams = static_cast< VLineSyntaxParams * >( inDocument->GetLineSyntaxParams( inLineNumber - 1 ) ); } VString xstr; inDocument->GetLine(inLineNumber,xstr); inDocument->SetLineStyle(inLineNumber,0,xstr.GetLength(),0); //initiate the line char *lexinput = CreateUTF8String( xstr ); struct htmlLexeme *list = parseHTML( lexinput ); // If we used to be in comment continuation mode, the assumption is that we're still in // comment continuation mode. We'll switch this off if the comment ends though currentLineParams->CopyState( previousLineParams ); // We are going to keep track of which open and close tags we've seen on the line. This allows // us to determine which unmatched open and close tags exist so we can associate that data with // the line. As we find open tags, we'll push them onto the open tag list. 
As we find close tags, // we will scan the open tag list and *remove* any that match. If there's no match, then we'll add // the tag to the close list. std::vector< VString > openList, closeList; // Given the list of HTML tokens, let's walk over the list and try to make some sense // of them. Walk over the list one token at a time, and see if we can make sense of // what we've got. struct htmlLexeme *cur = list; while (cur) { // There are only three types of comments we need to worry about. Full comments, // open comments and close comments. We'll get a token representing any one of the // three. However, we need to pay special attention to multi-line comments, since // they won't lex out entirely correct. If the previous line was part of an open // comment, then we want to keep walking over the tokens, marking them as part of // the comment, until we run out of tokens, or we find a kCommentClose token. if (currentLineParams->IsOpenComment()) { if (kCommentClose == cur->fStyle) { // We found the end of the comment, so we can highlight it appropriately, // and go back to our regularly scheduled lexing inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ comment_col ] ); // We're also done being a part of the comment continuation train currentLineParams->SetIsOpenComment( false ); } else { // This is just another part of the comment inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ comment_col ] ); } // Advance cur = cur->fNext; continue; } if (kCompleteComment == cur->fStyle) { // A complete comment is the easiest of the three cases. 
Just highlight it inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ comment_col ] ); } else if (kCommentOpen == cur->fStyle) { // An open comment must be the last token in the list xbox_assert( !cur->fNext ); // We want to highlight from here to the end of the line inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ comment_col ] ); // We also want to flag that this line ends with an open comment currentLineParams->SetIsOpenComment( true ); } else if (kCommentClose == cur->fStyle) { // If we got a close comment token, then something's off. That means the user put in a close comment // token, but they never opened it. We're going to ignore that state, and flag this as being normal inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ scommentend_col ] ); } else if (kString == cur->fStyle) { inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ string_col ] ); } else if (kKeyword == cur->fStyle) { // Keywords a bit trickier than you might think because we need to be sure they're actually part of a // tag. If the user types something like: <b>This table rocks</b>, we only want to highlight the b in the // begin and end tag, and not the "table" in the user's text. To deal with this, we have an "in tag" flag // that basically turns keyword highlighting on and off. if (currentLineParams->IsProcessingTag()) { inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ keyword_col ] ); // If we're processing an opening tag, then we want to push the keyword onto the tag stack. But if we're // processing a closing tag, then we want to pop the last keyword off the tag stack and try to match it up // to what we just processed. If they match, we're golden. If not, we just assume the user's mismatching // their tags because they're an idiot. 
VString tagName; xstr.GetSubString( cur->fOffset + 1, cur->fLength, tagName ); if (currentLineParams->IsProcessingStartTag()) { if (!IsTagWithoutClose( tagName )) { openList.push_back( tagName ); } currentLineParams->PushTag( tagName ); // Note that we are no longer processing the start of a tag. This allows us to handle attributes // separately from the tag itself. currentLineParams->SetIsProcessingStartTag( false ); } else { // Check to see if this closed tag is on the open list. If it is, we want to remove it from the // list. Otherwise, we want to add it to the close list. bool bAddToClose = true; for (std::vector< VString >::iterator iter = openList.begin(); bAddToClose && iter != openList.end();) { if (tagName.EqualTo( *iter, false )) { iter = openList.erase( iter ); bAddToClose = false; } else { ++iter; } } if (bAddToClose) closeList.push_back( tagName ); VString lastTag; currentLineParams->PopTag( lastTag ); if (!lastTag.EqualTo( tagName, false )) { // The tags don't match, so we're just going to ignore the issue // TODO: do something more sensible here } } } } else if (kNumber == cur->fStyle) { inDocument->SetLineStyle( inLineNumber, cur->fOffset, cur->fOffset + cur->fLength, htmlcolorShadow[ allnum_col ] ); } else if (kTagOpen == cur->fStyle || kEndTagOpen == cur->fStyle) { currentLineParams->SetIsProcessingTag( true ); currentLineParams->SetIsProcessingStartTag( kTagOpen == cur->fStyle ); } else if (kTagClose == cur->fStyle || kTagSelfClose == cur->fStyle) { currentLineParams->SetIsProcessingTag( false ); // If we just handled a self-closing tag (like <br />), then we want to pop it from the stack VString lastTag; currentLineParams->LastTag( lastTag ); if (kTagSelfClose == cur->fStyle || IsTagWithoutClose( lastTag )) { VString toss; currentLineParams->PopTag( toss ); // We also do not want to add it to our list of open tags for the line, since it's self-closed for (std::vector< VString >::iterator iter = openList.begin(); iter != openList.end(); ++iter) 
{ if (lastTag.EqualTo( *iter, false )) { iter = openList.erase( iter ); break; } } } } cur = cur->fNext; } FreeLexemeList( list ); // Now that we have an open and a close list, we want to associate them with the line. for (std::vector< VString >::iterator iter = openList.begin(); iter != openList.end(); ++iter) { currentLineParams->AddUnmatchedOpenTag( *iter ); } for (std::vector< VString >::iterator iter = closeList.begin(); iter != closeList.end(); ++iter) { currentLineParams->AddUnmatchedCloseTag( *iter ); } // There are two cases we really need to care about. If the line now ends in // an open comment (and didn't used to), we want to colorize down the document. // Also, if the line no longer ends in an open comment (but used to), we want to // colorize down the document. In either case, we want to keep colorizing subsequent // lines until the comment is ended or the end of the document is reached. if ((!previousOpenCommentState && currentLineParams->IsOpenComment() || // Now ends with open comment, didn't used to previousOpenCommentState && !currentLineParams->IsOpenComment()) && // Used to end with an open comment, but no longer does inLineNumber + 1 < inDocument->GetNbLines()) { SetLine( inDocument, inLineNumber + 1, inLoading ); } #endif // old code }
/* make_html_label:
 * Parse lp->text as an HTML-like label for obj, size and position the
 * result, and store it in lp->u.html with its dimensions in lp->dimen.
 * Return non-zero if problem parsing HTML. In this case, use object name.
 */
int make_html_label(void *obj, textlabel_t * lp)
{
    int rv;
    double halfWd, halfHt;
    boxf bounds;
    graph_t *g;
    htmllabel_t *lbl;
    htmlenv_t env;
    char *s;

    /* Set up the environment: owning object, its graph and the font. */
    env.obj = obj;
    switch (agobjkind(obj)) {
#ifdef WITH_CGRAPH
    case AGRAPH:
#else
    case AGGRAPH:
#endif
        env.g = ((Agraph_t *) obj)->root;
        break;
    case AGNODE:
        env.g = agraphof(((Agnode_t *) obj));
        break;
    case AGEDGE:
        env.g = agraphof(aghead(((Agedge_t *) obj)));
        break;
    }
    g = env.g->root;

    env.finfo.size = lp->fontsize;
    env.finfo.name = lp->fontname;
    env.finfo.color = lp->fontcolor;

    lbl = parseHTML(lp->text, &rv, GD_charset(env.g));
    if (!lbl) {
        /* Parse of label failed; revert to simple text label */
        agxbuf xb;
        unsigned char buf[SMALLBUF];

        agxbinit(&xb, SMALLBUF, buf);
        lp->html = FALSE;
        lp->text = strdup(nameOf(obj, &xb));

        /* Convert the fallback text according to the label's charset. */
        if (lp->charset == CHAR_LATIN1)
            s = latin1ToUTF8(lp->text);
        else                    /* UTF8 */
            s = htmlEntityUTF8(lp->text, env.g);
        free(lp->text);
        lp->text = s;

        make_simple_label(g, lp);
        agxbfree(&xb);
        return rv;
    }

    /* Size the label and centre its box on the origin. */
    if (lbl->kind == HTML_TBL) {
        /* Use the object's pen colour only when the table has none of its own. */
        if (!lbl->u.tbl->data.pencolor && getPenColor(obj))
            lbl->u.tbl->data.pencolor = strdup(getPenColor(obj));
        rv |= size_html_tbl(g, lbl->u.tbl, NULL, &env);
        halfWd = (lbl->u.tbl->data.box.UR.x + 1) / 2;
        halfHt = (lbl->u.tbl->data.box.UR.y + 1) / 2;
        bounds = boxfof(-halfWd, -halfHt, halfWd, halfHt);
        pos_html_tbl(lbl->u.tbl, bounds, BOTTOM | RIGHT | TOP | LEFT);
    } else {
        rv |= size_html_txt(g, lbl->u.txt, &env);
        halfWd = (lbl->u.txt->box.UR.x + 1) / 2;
        halfHt = (lbl->u.txt->box.UR.y + 1) / 2;
        bounds = boxfof(-halfWd, -halfHt, halfWd, halfHt);
        lbl->u.txt->box = bounds;
    }
    lp->dimen.x = bounds.UR.x - bounds.LL.x;
    lp->dimen.y = bounds.UR.y - bounds.LL.y;

    lp->u.html = lbl;

    /* If the label is a table, replace label text because this may
     * be used for the title and alt fields in image maps.
     */
    if (lbl->kind == HTML_TBL) {
        free(lp->text);
        lp->text = strdup("<TABLE>");
    }

    return rv;
}
int main(int argc, char *argv[]) { // INPUT VARIABLES char* program; char* target_dir; char* output_file_name; char* input_file_name; char* rewritten_file_name; // determines which mode the program is running in int indexer_test_flag; // overall data structure INVERTED_INDEX* index; // these variables handle the scandir results and pulling information from files int numfiles = 0; struct dirent **files; char* file_name; char* file_contents; // this variable is used for parsing the HTML int file_pos; // variables used for WordNode (word == key) and DocumentNode (doc_id) char* word; int doc_id; indexer_test_flag = 0; // default is basic funcitonality program = argv[0]; // if incorrect number of arguments if(argc != 3 && argc != 5) { fprintf(stderr, "%s: The indexer requires either 2 (a target directory and output file name) or 4 (a target directory, output file name, input file name, and a rewritten file name\n", program); return 1; } target_dir = argv[1]; output_file_name = argv[2]; // if 5 arguments --> TESTING MODE if(argc == 5) { indexer_test_flag = 1; input_file_name = argv[3]; rewritten_file_name = argv[4]; } // if the target directory doesn't exist if(!directoryExists(target_dir)) { fprintf(stderr, "%s: Invalid target directory %s\n", program, target_dir); return 1; } numfiles = getFileList(target_dir, &files); chdir(target_dir); // if there are no files in the target directory if(numfiles <= 0) { fprintf(stderr, "%s: Error with target directory %s'n", program, target_dir); return 1; } index = initializeDict(); // this for loop goes through each file in "files", pulls each word out of the HTML, and updates the index data structure for(int i=0; i < numfiles; i++) { file_name = files[i]->d_name; // if it's a regular file (to avoid . and .. 
files) if(regularFile(file_name)) { file_contents = NULL; file_contents = readFile(file_name); file_pos = 0; doc_id = atoi(file_name); // just in case a 404 wasn't caught by the crawler if(file_contents != NULL) { word = NULL; word = malloc(500*sizeof(char)); MALLOC_CHECK(word); BZERO(word, 500*sizeof(char)); // GetNextWord returns the index in file_contents where it stopped parsing, while assigning a new word to the "word" while((file_pos = parseHTML(file_contents, word, file_pos)) != -1) { updateIndex(word, doc_id, index); free(word); //word = NULL; word = malloc(500*sizeof(char)); MALLOC_CHECK(word); BZERO(word, 500*sizeof(char)); } free(word); } free(file_contents); } free(files[i]); } free(files); // outputs to a file saveFile(index, output_file_name); cleanIndex(index); // if it's in testing mode if(indexer_test_flag) { INVERTED_INDEX* newindex; newindex = readIndex(input_file_name); saveFile(newindex, rewritten_file_name); cleanIndex(newindex); } }
// ======================>>> vTextEditor::paintLine <<<===================== void vedTextEditor::paintLine(char* linout, int lineStart, int hiStart, int hiLast, long lineNum) { // paint a line. // linout: the line to output with tabs converted to spaces, etc. // lineStart: where to begin printing the line (for hoiz. scrolling) // hiStart, hiLast: reverse text attribute // lineNum: the real line number in the buffer this is. // This version overrides the original to handle syntax highlighting ChrAttr attrs[MAX_LINE+1]; // for attributes int wasComment = 0; int linlen = strlen(linout); if (linlen <= 0) // only draw if there! return; for (int ix = 0 ; ix <= MAX_LINE ; ++ix) // assume normal attrs[ix] = stdColor[curColors]; // Parse the line for syntax if (GetFileType() == CPP) // if a C file, parse wasComment = parseC(linout,attrs,lineNum,CPP); else if (GetFileType() == Java) // if a Java file, parse wasComment = parseC(linout,attrs,lineNum,Java); else if (GetFileType() == Perl) // if a Perl file, parse wasComment = parseC(linout,attrs,lineNum,Perl); else if (GetFileType() == Fortran) // if a Fortran file, parse wasComment = parseFortran(linout,attrs,lineNum); else if (GetFileType() == HTML) // if a HTML file, parse wasComment = parseHTML(linout,attrs,lineNum); else if (GetFileType() == TeX) // if a TeX file, parse wasComment = parseTeX(linout,attrs,lineNum); else if (GetFileType() == gccError || GetFileType() == bccError) { int ig = 0; while (isSpace(linout[ig])) ++ig; if (linout[ig] == '>' || linout[ig] == '+') { if (linout[ig] == '+') attrs[ig++] = dirColor[curColors]; while (linout[ig]) attrs[ig++] = keyColor[curColors]; } else if (linout[ig] == '!' 
|| linout[ig] == '*') { if (linout[ig] == '*') attrs[ig++] = dirColor[curColors]; while (linout[ig]) attrs[ig++] = constColor[curColors]; } } // Now fill in highlight attributes for (int ih = 0 ; linout[ih] != 0 ; ++ih) { if (ih >= hiStart && ih < hiLast) attrs[ih] = getHighlight(); // override syntax colors } if ( ((videApp*)theApp)->isBP(((VCmdWindow*)_parent)->GetFileName(), lineNum)) { for (int ij = 0 ; linout[ij] != 0 ; ++ij) { attrs[ij] = ChBlackOnBG | ChYellow; } } if ( (((videApp*)theApp)->getBreakPoints())-> checkIfCurPC(((VCmdWindow*)_parent)->GetFileName(), lineNum)) { for (int ij = 0 ; linout[ij] != 0 ; ++ij) { attrs[ij] = ChBlackOnBG | ChRed; } } // new - V version 1.23 - added DrawAttrsText DrawAttrsText((const char*)&linout[lineStart], (const ChrAttr*)&attrs[lineStart]); // old code - dumb, slow way to do this // for (int ixx = lineStart ; linout[ixx] != 0 ; ++ixx) // DrawChar(linout[ixx],attrs[ixx]); }