/*
 * Extract the next word from the string *sp and advance *sp past it.
 *
 * Leading separators (as reported by is_separator) are skipped.  A single
 * punctuation character (as reported by is_punctuation) is a word on its
 * own; otherwise the word is the longest run of characters that are neither
 * separators nor punctuation.
 *
 * The word is stored in a function-local static buffer, so the returned
 * pointer is only valid until the next call.  A run longer than the buffer
 * is split: the first call returns as much as fits and leaves *sp on the
 * remainder, which is returned by the following call(s).
 *
 * Returns the buffer, or NULL when no word is left in the input.
 */
char *
parser_get_next_word(char **sp)
{
    static char buffer[512];
    /* Last position a word character may occupy; one byte stays free
     * for the terminating NUL. */
    char *const limit = buffer + sizeof buffer - 1;
    char *s = *sp;
    char *p = buffer;

    while (is_separator(*s))
        ++s;

    if (*s != '\0') {
        if (is_punctuation(*s)) {
            *p++ = *s++;
        } else {
            while (p < limit && *s && !is_separator(*s) && !is_punctuation(*s))
                *p++ = *s++;
        }
    }

    *p = '\0';
    *sp = s;
    return buffer[0] ? buffer : NULL;
}
// Advance pos over the next lexical unit of t->label and return the text
// property describing that unit.
//
//   - at the end of the label: pos is left alone, result is tp_normal_rep
//   - one space: tp_blank_rep when punctuation follows, else tp_space_rep
//   - a run of punctuation: tp_normal_rep unless a space follows, in which
//     case tp_period_rep after '.', '!' or '?' and tp_space_rep otherwise
//   - a run of '-': tp_hyph_rep
//   - a run of letters and '<...>' sequences: tp_normal_rep
//   - a run of numeric characters (trailing '.' given back): tp_normal_rep
//   - anything else: a single character, tp_normal_rep
text_property
ucs_text_language_rep::advance (tree t, int& pos) {
  //TODO: replace methods is_punctuation (), is_iso_alpha () and is_numeric (),
  // by equivalents taking into account unicode entities.
  string s= t->label;
  const int end= N(s);
  if (pos == end) return &tp_normal_rep;

  // Exactly one space is consumed per call.
  if (s[pos] == ' ') {
    pos++;
    if ((pos < end) && is_punctuation (s[pos])) return &tp_blank_rep;
    return &tp_space_rep;
  }

  if (is_punctuation (s[pos])) {
    do pos++;
    while ((pos < end) && is_punctuation (s[pos]));
    if ((pos == end) || (s[pos] != ' ')) return &tp_normal_rep;
    // A space follows: the last punctuation character picks the spacing.
    const char last= s[pos-1];
    if ((last == '.') || (last == '!') || (last == '?'))
      return &tp_period_rep;
    return &tp_space_rep;
  }

  if (s[pos] == '-') {
    do pos++;
    while ((pos < end) && (s[pos] == '-'));
    return &tp_hyph_rep;
  }

  if (is_iso_alpha (s[pos]) || (s[pos] == '<')) {
    while (pos < end) {
      if (s[pos] == '<') {
        // Skip up to and including the closing '>' (or to the end).
        while ((pos < end) && (s[pos] != '>')) pos++;
        if (pos < end) pos++;
      }
      else if (is_iso_alpha (s[pos])) pos++;
      else break;
    }
    return &tp_normal_rep;
  }

  if (is_numeric (s[pos])) { // can not be a '.'
    do pos++;
    while ((pos < end) && is_numeric (s[pos]));
    // Give back any '.' that ended up at the tail of the run.
    for (; s[pos-1] == '.'; pos--) {}
    return &tp_normal_rep;
  }

  pos++;
  return &tp_normal_rep;
}
// Advance pos over the next lexical unit of t->label and return the text
// property describing that unit.
//
//   - at the end of the label: pos is left alone, result is tp_normal_rep
//   - one space: tp_blank_rep when punctuation follows, else tp_space_rep
//   - a run of punctuation: tp_normal_rep unless a space follows, in which
//     case tp_period_rep after '.', '!' or '?' and tp_space_rep otherwise
//   - a run of '-': tp_hyph_rep
//   - a run of letters: tp_normal_rep
//   - a run of numeric characters (trailing '.' given back): tp_normal_rep
//   - a '<...>' sequence: tp_normal_rep
//   - anything else: a single character, tp_normal_rep
text_property
text_language_rep::advance (tree t, int& pos) {
  string s= t->label;
  const int end= N(s);
  if (pos == end) return &tp_normal_rep;

  // Exactly one space is consumed per call.
  if (s[pos] == ' ') {
    pos++;
    if ((pos < end) && is_punctuation (s[pos])) return &tp_blank_rep;
    return &tp_space_rep;
  }

  if (is_punctuation (s[pos])) {
    do pos++;
    while ((pos < end) && is_punctuation (s[pos]));
    if ((pos == end) || (s[pos] != ' ')) return &tp_normal_rep;
    // A space follows: the last punctuation character picks the spacing.
    const char last= s[pos-1];
    if ((last == '.') || (last == '!') || (last == '?'))
      return &tp_period_rep;
    return &tp_space_rep;
  }

  if (s[pos] == '-') {
    do pos++;
    while ((pos < end) && (s[pos] == '-'));
    return &tp_hyph_rep;
  }

  if (is_iso_alpha (s[pos])) {
    do pos++;
    while ((pos < end) && is_iso_alpha (s[pos]));
    return &tp_normal_rep;
  }

  if (is_numeric (s[pos])) { // can not be a '.'
    do pos++;
    while ((pos < end) && is_numeric (s[pos]));
    // Give back any '.' that ended up at the tail of the run.
    for (; s[pos-1] == '.'; pos--) {}
    return &tp_normal_rep;
  }

  if (s[pos] == '<') {
    // Skip up to and including the closing '>' (or to the end).
    while ((pos < end) && (s[pos] != '>')) pos++;
    if (pos < end) pos++;
    return &tp_normal_rep;
  }

  pos++;
  return &tp_normal_rep;
}
/**
 * Collect the dictionary words that start at the current text position.
 *
 * Starting at g_pos the candidate is extended two bytes at a time for as
 * long as the next byte has its high bit set and is not punctuation.  Every
 * candidate g_text[g_pos .. pos) that dict_get_word() recognises is stored
 * in words[].  Scanning stops at MAX_WORD_LENGTH + 1 bytes past g_pos, at
 * g_len, or once MAX_NEXT_WORDS entries have been stored.
 *
 * store the results to words
 * return the number of words found in dict
 */
int get_next_words(char *words[MAX_NEXT_WORDS])
{
    int n = 0;                  /* number of words found */
    off_t pos = g_pos;
    off_t end = g_pos + MAX_WORD_LENGTH + 1;

    if (end > g_len)
        end = g_len;

    while (pos < end && (g_text[pos] & 0x80)) {
        /*
         * A high-bit byte with no partner before g_len is a truncated
         * two-byte character: stepping over it would hand a range that
         * runs past the text to is_punctuation() / dict_get_word().
         * (Assumes g_len is the number of valid bytes in g_text.)
         */
        if (pos + 2 > g_len)
            break;
        if (is_punctuation(g_text + pos))
            break;

        pos += 2;
        words[n] = dict_get_word(g_text + g_pos, pos - g_pos);
        if (!words[n])
            continue;

        n++;
        if (n == MAX_NEXT_WORDS) {
            int i;

            DEBUG("MAX_NEXT_WORDS reach\n");
            DEBUG1("%s\n", g_text + g_pos);
            for (i = 0; i < n; i++)
                DEBUG1("%s ", words[i]);
            break;
        }
    }
    return n;
}
// Scan backwards from iter towards first and return an iterator to the
// nearest preceding symbol that should not be skipped:
//   - a symbol whose non-terminal is not punctuation, or
//   - a punctuation symbol whose tail node's first edge itself has tails.
// Returns last when every symbol in [first, iter) was skipped.
//
// Throws std::runtime_error when the symbol's tail index is out of range for
// tails, or when the referenced node has no edges.
Iterator skip_pre_terminals(const hypergraph_type& graph,
                            const Tails& tails,
                            Iterator first,
                            Iterator iter,
                            Iterator last) const
{
  while (iter != first) {
    const Iterator prev = iter - 1;
    const symbol_type& cat = *prev;

    if (! is_punctuation(cat.non_terminal()))
      return prev;

    // cat is punctuation: find the tail it refers to.  A non-positive
    // non-terminal index falls back to the symbol's offset from first.
    int pos = cat.non_terminal_index() - 1;
    if (pos < 0)
      pos = prev - first;

    if (pos >= static_cast<int>(tails.size()))
      throw std::runtime_error("invalid tails");

    if (graph.nodes[tails[pos]].edges.empty())
      throw std::runtime_error("no edges");

    const edge_type& edge = graph.edges[graph.nodes[tails[pos]].edges.front()];

    // we have tail, meaning that cat is NOT pre-terminal
    if (! edge.tails.empty())
      return prev;

    -- iter;
  }

  return last;
}
// Classify a character by a breaking-force value:
//   3 for a space, 2 for punctuation, 0 for letters and digits,
//   1 for everything else.
static int
breaking_force (char c) {
  int force;
  if (c == ' ')
    force= 3;
  else if (is_punctuation (c))
    force= 2;
  else if (is_iso_alpha (c) || is_digit (c))
    force= 0;
  else
    force= 1;
  return force;
}
/*
 * Match the beginning of str against a pattern of character classes.
 *
 * Pattern characters:
 *   'a'  one lowercase ASCII letter
 *   'A'  one uppercase ASCII letter
 *   'i'  one ASCII digit
 *   '.'  one punctuation character (as reported by is_punctuation)
 *   '*'  skips characters of str until the rest of the pattern matches
 * Any other pattern character makes the match fail.
 *
 * Returns 1 when the whole pattern was consumed, 0 otherwise.  Characters
 * left over in str after the pattern is exhausted are ignored.
 *
 * NOTE(review): when the remainder already matches at the current position,
 * '*' still consumes one character of str before matching continues.  That
 * pre-existing behaviour is kept as is.
 */
int parse_match(char *str, char *pattern)
{
    int i = 0;
    int j = 0;

    while (str[i] != 0 && pattern[j] != 0) {
        const char c = str[i];

        switch (pattern[j]) {
        case 'a':
            if (c < 'a' || c > 'z')
                return (0);
            break;
        case 'A':
            if (c < 'A' || c > 'Z')
                return (0);
            break;
        case 'i':
            if (c < '0' || c > '9')
                return (0);
            break;
        case '.':
            /* The subject character is tested here; the previous code
             * looked at pattern[i], i.e. the wrong string and possibly
             * past the end of the pattern. */
            if (!is_punctuation(c))
                return (0);
            break;
        case '*':
            if (!parse_match(str + i, pattern + j + 1)) {
                /* Remainder does not match yet: skip one character of
                 * str and retry with the same '*'. */
                ++i;
                continue;
            }
            break;
        default:
            return (0);
        }

        ++i;
        ++j;
    }
    return (pattern[j] == 0 ? 1 : 0);
}
/// \ingroup waUtility /// \fn string extract_text( const string &text, const int option, const size_t len ) /// 全角半角字符转换并提取正文 /// \param text 源字符串 /// \param option 过滤范围选项,可选值组合有 /// - EXTRACT_ALPHA 过滤字母 /// - EXTRACT_DIGIT 过滤数字 /// - EXTRACT_PUNCT 过滤标点 /// - EXTRACT_SPACE 过滤空白 /// - EXTRACT_HTML 过滤HTML代码 /// - 默认值为EXTRACT_ALL即以上全部 /// \param len 过滤长度,大于0时只截取前len个有效字符,默认为0 /// \return 转换提取结果字符串,若源字符串内容被全部过滤则返回空 string extract_text( const string &text, const int option, const size_t len ) { if ( text=="" || option<=0 ) return text; string converted = sbc_to_dbc( text ); // is HTML if ( option&EXTRACT_HTML ) converted = extract_html( converted ); if ( option == EXTRACT_HTML ) return converted; string extracted; extracted.reserve( text.length() ); for ( unsigned int i=0; i<converted.length(); ++i ) { unsigned char c = converted[i]; if ( isalpha(c) ) c = tolower( c ); // is GBK char if ( !is_punctuation(c) && !isalpha(c) && ((c>=0x81&&c<=0xFE) || (c>=0x40&&c<=0x7E) || (c>=0xA1&&c<=0xFE)) ) extracted += c; // is alpha else if ( option&EXTRACT_ALPHA && isalpha(c) ) continue; // is digit else if ( option&EXTRACT_DIGIT && isdigit(c) ) continue; // is punct else if ( option&EXTRACT_PUNCT && (ispunct(c)||is_punctuation(c)) ) continue; // is space else if ( option&EXTRACT_SPACE && (isspace(c)||isblank(c)) ) continue; // other else extracted += c; // enough if ( len>0 && extracted.length()>=len ) break; } return extracted; }
/*
 * Write sptr to h as a symbol name: an underscore, then the string with
 * every punctuation character (as reported by is_punctuation) replaced by
 * an underscore, then a closing underscore.
 */
void	file_symbol( FILE * h, char * sptr )
{
	const char *	cursor;

	fputc( '_', h );
	for ( cursor = sptr; *cursor != 0; cursor++ )
	{
		int	c = *cursor;
		fputc( is_punctuation( c ) ? '_' : c, h );
	}
	fputc( '_', h );
}
/*
 * Tell whether ch belongs to one of the non-alphabetic character classes
 * enabled in input_type.
 *
 * input_type is a bit mask of inputtype_numeric, inputtype_white_spaces,
 * inputtype_punctuation and inputtype_symbols; a class is only consulted
 * when its bit is set.  Returns TRUE on the first class that accepts ch,
 * FALSE when none does.
 */
BOOL is_valid_nonalphabetic_char( char ch, unsigned short input_type)
{
	BOOL accepted = FALSE;

	if( (input_type & inputtype_numeric) && is_numeric( ch))
		accepted = TRUE;
	else if( (input_type & inputtype_white_spaces) && is_white_space( ch))
		accepted = TRUE;
	else if( (input_type & inputtype_punctuation) && is_punctuation( ch))
		accepted = TRUE;
	else if( (input_type & inputtype_symbols) && is_symbol( ch))
		accepted = TRUE;

	return accepted;
}
/*
 * Generate "<struct_name>.c" from the header "<struct_name>.h".
 *
 * The header is read token by token: punctuation goes to
 * handle_punctuation(), everything else is read with get_token() and passed
 * to handle_token().  Scanning stops at end of input, on an empty token, or
 * as soon as a handler returns non-zero.  Output is written through
 * C.target, framed by file_header() and file_footer().
 *
 * C.gensql is set, after generation, to the value check_cool_cosacs()
 * returned before it.
 *
 * Returns the result of failure() when a file name does not fit its buffer
 * or a file cannot be opened, and 0 otherwise.  A non-zero handler status
 * ends the scan but is not propagated to the caller.
 */
int	process( char * struct_name )
{
	int	c;
	int	holder;
	char	occi_header_filename[512];
	char	filter_filename[512];
	char	tn[512];
	char	token[512];
	FILE *	sh;

	/*
	 * Build the file names with bounded writes.  snprintf() returns the
	 * length it would have needed, so a result >= the buffer size means
	 * truncation; the cast also turns a negative (error) result into a
	 * value that fails the check.
	 * NOTE(review): error code 40 is reused for the over-long name case -
	 * confirm whether a dedicated code is preferred.
	 */
	if (( (size_t) snprintf(occi_header_filename, sizeof occi_header_filename, "%s.h", struct_name) >= sizeof occi_header_filename )
	||  ( (size_t) snprintf(filter_filename, sizeof filter_filename, "%s_occi_filter.h", struct_name) >= sizeof filter_filename )
	||  ( (size_t) snprintf(tn, sizeof tn, "%s.c", struct_name) >= sizeof tn ))
		return( failure(40,"file name too long",struct_name) );

	if (!( sh = fopen( occi_header_filename, "r" )))
		return( failure(40,"file not found",occi_header_filename) );

	if (!( C.target = fopen( tn, "w" )))
	{
		fclose(sh);
		return( failure(46,"creating file",tn) );
	}

	holder = check_cool_cosacs( struct_name );
	file_header( C.target, tn, occi_header_filename, filter_filename);

	while ((c = remove_white_space( sh )))
	{
		if ( is_punctuation(c) )
		{
			if ( handle_punctuation(sh,getch(sh)) )
				break;
		}
		/* One byte is held back from the advertised size so the
		 * terminator written after the last stored character always
		 * stays inside token[]. */
		else if (!get_token(sh,token,(int) sizeof token - 1))
			break;
		else if ( handle_token(sh,token) )
			break;
	}

	file_footer( C.target, tn );
	fclose(C.target);
	fclose(sh);
	C.gensql = holder;
	return( 0 );
}
/*
 * Read the next token from h into token, which has room for tlen bytes.
 *
 * Outside quotes a token ends at the first white-space or punctuation
 * character (pushed back with ungetch()) or when getch() returns 0.
 * A double quote opens a quoted section in which every character,
 * including white space and punctuation, is stored; the matching closing
 * quote ends the token.  The quotes themselves are not stored.
 *
 * At most tlen - 1 characters are stored so that the terminating NUL always
 * fits inside the buffer; longer tokens are cut there and the rest is left
 * for the next call.
 *
 * Returns the number of characters stored (0 for an empty token or when
 * tlen leaves no room for anything).
 */
int	get_token( FILE * h, char * token, int tlen )
{
	int	c;
	int	i=0;
	int	quoting=0;

	/* No room even for the terminator: touch nothing. */
	if ( tlen <= 0 )
		return( 0 );

	while ( i < (tlen - 1) )
	{
		if (!(c = getch(h)))
			break;

		if ( quoting )
		{
			/* The closing quote must be recognised before the
			 * opening-quote test, otherwise it would just re-open
			 * the quoted section and never end the token. */
			if ( c == quoting )
				break;
			*(token+i) = c;
			i++;
		}
		else if ( c == '"' )
		{
			quoting = c;
		}
		else if ( is_white( c ) || is_punctuation( c ) )
		{
			ungetch( c );
			break;
		}
		else
		{
			*(token+i) = c;
			i++;
		}
	}

	*(token+i) = 0;
	return(i);
}
/*
 * Generate "<nptr>.xsd" from the header "<nptr>.h".
 *
 * The header is read token by token: punctuation goes to
 * handle_punctuation(), everything else is read with get_token() and passed
 * to handle_token().  Scanning stops at end of input, on an empty token, or
 * as soon as a handler returns non-zero.  Output is written through
 * C.target, framed by schema_header() and schema_footer().
 *
 * Returns the result of failure() when a file name does not fit its buffer
 * or a file cannot be opened, and 0 otherwise.  A non-zero handler status
 * ends the scan but is not propagated to the caller.
 */
int	schema( char * nptr )
{
	int	c;
	char	sn[512];
	char	tn[512];
	char	token[512];
	FILE *	sh;

	/*
	 * Build the file names with bounded writes.  snprintf() returns the
	 * length it would have needed, so a result >= the buffer size means
	 * truncation; the cast also turns a negative (error) result into a
	 * value that fails the check.
	 * NOTE(review): error code 40 is reused for the over-long name case -
	 * confirm whether a dedicated code is preferred.
	 */
	if (( (size_t) snprintf(sn, sizeof sn, "%s.h", nptr) >= sizeof sn )
	||  ( (size_t) snprintf(tn, sizeof tn, "%s.xsd", nptr) >= sizeof tn ))
		return( failure(40,"file name too long",nptr) );

	if (!( sh = fopen( sn, "r" )))
		return( failure(40,"file not found",sn) );

	if (!( C.target = fopen( tn, "w" )))
	{
		fclose(sh);
		return( failure(46,"creating file",tn) );
	}

	schema_header( C.target, tn, sn );

	while ((c = remove_white_space( sh )))
	{
		if ( is_punctuation(c) )
		{
			if ( handle_punctuation(sh,getch(sh)) )
				break;
		}
		/* One byte is held back from the advertised size so the
		 * terminator written after the last stored character always
		 * stays inside token[]. */
		else if (!get_token(sh,token,(int) sizeof token - 1))
			break;
		else if ( handle_token(sh,token) )
			break;
	}

	schema_footer( C.target, tn );
	fclose(C.target);
	fclose(sh);
	return( 0 );
}
// Tell whether c counts as punctuation for French text: anything
// is_punctuation () accepts, plus the two characters with octal codes
// 23 and 24 (decimal 19 and 20).
// NOTE(review): those two codes are presumably the French quotation marks
// in the font encoding used by this code - confirm.
//
// The parameter used to carry the `register' storage class, which was
// removed from the language in C++17; dropping it does not change the
// signature seen by callers.
inline bool
is_french_punctuation (char c) {
  return is_punctuation (c) || (c=='\23') || (c=='\24');
}