Beispiel #1
0
/*
 * Extract the next word from the string *sp.
 *
 * Leading separators are skipped.  A single punctuation character forms a
 * word on its own; otherwise the word runs up to the next separator,
 * punctuation character or end of string.  On return *sp points just past
 * the consumed input.
 *
 * Returns a pointer to a static buffer holding the NUL-terminated word, or
 * NULL when no word was found.  The buffer is overwritten by the next call.
 * Words longer than the buffer are truncated: the excess characters are
 * consumed but not stored, so the buffer can never overflow.
 */
char *
parser_get_next_word(char **sp)
{
	static char buffer[512];
	char	*s = *sp, *p = buffer;
	/* last position that may hold a character; one byte stays free for '\0' */
	char	*const limit = buffer + sizeof(buffer) - 1;

	while (is_separator(*s))
		++s;

	if (*s != '\0') {
		if (is_punctuation(*s)) {
			/* punctuation is always a one-character word */
			*p++ = *s++;
		} else {
			while (*s && !is_separator(*s) && !is_punctuation(*s)) {
				if (p < limit)
					*p++ = *s;
				++s;
			}
		}
	}

	*p = '\0';
	*sp = s;
	return buffer[0]? buffer : NULL;
}
Beispiel #2
0
// Consume one token of t->label starting at index pos, move pos past it and
// return the address of the text property object chosen for that token.
// When pos already equals the label length, pos is left untouched.
text_property
ucs_text_language_rep::advance (tree t, int& pos) {
  //TODO: replace methods is_punctuation (), is_iso_alpha () and is_numeric (),
  //      by equivalents taking into account unicode entities.
  string s= t->label;
  if (pos == N(s)) return &tp_normal_rep;

  // A single space: "blank" when punctuation follows, "space" otherwise.
  if (s[pos] == ' ') {
    ++pos;
    // while ((pos<N(s)) && (s[pos]==' ')) pos++;
    if ((pos != N(s)) && is_punctuation (s[pos]))
      return &tp_blank_rep;
    return &tp_space_rep;
  }

  // A run of punctuation: the property depends on whether a space follows
  // and on the last punctuation character of the run.
  if (is_punctuation (s[pos])) {
    for (; (pos<N(s)) && is_punctuation (s[pos]); ++pos) {}
    const bool space_follows= (pos != N(s)) && (s[pos] == ' ');
    if (!space_follows) return &tp_normal_rep;
    switch (s[pos-1]) {
    case '.': case '!': case '?':
      return &tp_period_rep;
    default:
      // includes ',', ':', ';', '`' and '\''
      return &tp_space_rep;
    }
  }

  // A run of one or more hyphens.
  if (s[pos] == '-') {
    do {
      ++pos;
    } while ((pos<N(s)) && (s[pos] == '-'));
    return &tp_hyph_rep;
  }

  // A word made of alphabetic characters and "<...>" groups.
  if (is_iso_alpha (s[pos]) || (s[pos] == '<')) {
    while (pos < N(s)) {
      if (s[pos] == '<') {
        // skip up to and including the closing '>' (if there is one)
        for (; (pos<N(s)) && (s[pos] != '>'); ++pos) {}
        if (pos < N(s)) ++pos;
      }
      else if (is_iso_alpha (s[pos]))
        ++pos;
      else
        break;
    }
    return &tp_normal_rep;
  }

  // A number; s[pos] can not be a '.' here (per the original comment).
  if (is_numeric (s[pos])) {
    for (; (pos<N(s)) && is_numeric (s[pos]); ++pos) {}
    // give back any trailing '.' characters
    for (; s[pos-1] == '.'; --pos) {}
    return &tp_normal_rep;
  }

  // Anything else is a one-character token.
  ++pos;
  return &tp_normal_rep;
}
Beispiel #3
0
// Consume one token of t->label starting at index pos, move pos past it and
// return the address of the text property object chosen for that token.
// When pos already equals the label length, pos is left untouched.
text_property
text_language_rep::advance (tree t, int& pos) {
  string s= t->label;
  if (pos == N(s)) return &tp_normal_rep;

  // A single space: "blank" when punctuation follows, "space" otherwise.
  if (s[pos] == ' ') {
    ++pos;
    // while ((pos<N(s)) && (s[pos]==' ')) pos++;
    if ((pos != N(s)) && is_punctuation (s[pos]))
      return &tp_blank_rep;
    return &tp_space_rep;
  }

  // A run of punctuation: the property depends on whether a space follows
  // and on the last punctuation character of the run.
  if (is_punctuation (s[pos])) {
    for (; (pos<N(s)) && is_punctuation (s[pos]); ++pos) {}
    const bool space_follows= (pos != N(s)) && (s[pos] == ' ');
    if (!space_follows) return &tp_normal_rep;
    switch (s[pos-1]) {
    case '.': case '!': case '?':
      return &tp_period_rep;
    default:
      // includes ',', ':', ';', '`' and '\''
      return &tp_space_rep;
    }
  }

  // A run of one or more hyphens.
  if (s[pos] == '-') {
    do {
      ++pos;
    } while ((pos<N(s)) && (s[pos] == '-'));
    return &tp_hyph_rep;
  }

  // A run of alphabetic characters.
  if (is_iso_alpha (s[pos])) {
    for (; (pos<N(s)) && is_iso_alpha (s[pos]); ++pos) {}
    return &tp_normal_rep;
  }

  // A number; s[pos] can not be a '.' here (per the original comment).
  if (is_numeric (s[pos])) {
    for (; (pos<N(s)) && is_numeric (s[pos]); ++pos) {}
    // give back any trailing '.' characters
    for (; s[pos-1] == '.'; --pos) {}
    return &tp_normal_rep;
  }

  // A "<...>" group: skip up to and including the closing '>' (if any).
  if (s[pos] == '<') {
    for (; (pos<N(s)) && (s[pos] != '>'); ++pos) {}
    if (pos < N(s)) ++pos;
    return &tp_normal_rep;
  }

  // Anything else is a one-character token.
  ++pos;
  return &tp_normal_rep;
}
Beispiel #4
0
/**
 * Collect the dictionary words that start at g_pos in g_text.
 *
 * Starting at g_pos, the text is extended two bytes at a time while the
 * current byte has its high bit set and is not punctuation.  Every prefix
 * for which dict_get_word() returns non-NULL is stored in words[].
 *
 * The scan window is limited to MAX_WORD_LENGTH + 1 bytes and to g_len.
 * A character is only consumed when both of its bytes lie inside g_len,
 * so a truncated trailing byte never causes a read past the text.
 *
 * @param words  output array, filled with at most MAX_NEXT_WORDS entries
 * @return the number of words found in dict (entries written to words)
 */
int
get_next_words(char *words[MAX_NEXT_WORDS])
{
	int n = 0;	/* number of words found */
	off_t pos = g_pos;

	off_t end = g_pos + MAX_WORD_LENGTH + 1;
	if (end > g_len)
		end = g_len;
	
	while (pos < end && (g_text[pos] & 0x80)) {
		/* need two bytes for this character; stop on a truncated tail */
		if (pos + 2 > g_len)
			break;

		if (is_punctuation(g_text + pos)) 
			break;
		
		pos += 2;
		if ((words[n] = dict_get_word(g_text + g_pos, pos - g_pos))) {
			n++;
			if (n == MAX_NEXT_WORDS) {
				/* output array is full: log what was collected and stop */
				DEBUG("MAX_NEXT_WORDS reach\n");
				DEBUG1("%s\n", g_text + g_pos);
				int i;
				for (i = 0; i<n; i++)
					DEBUG1("%s ", words[i]);
				break;
			}
		}
	}	
	return n;
}
Beispiel #5
0
      // Scan backwards from iter towards first and return the position of the
      // first symbol that is either not punctuation, or is punctuation whose
      // tail node's first edge itself has tails.  Returns last when every
      // symbol in [first, iter) is skipped.
      // Throws std::runtime_error when the tail index is out of range or the
      // tail node has no edges.
      Iterator skip_pre_terminals(const hypergraph_type& graph, const Tails& tails, Iterator first, Iterator iter, Iterator last) const
      {
        while (iter != first) {
          Iterator prev = iter - 1;
          const symbol_type& cat = *prev;

          if (! is_punctuation(cat.non_terminal()))
            return prev;

          // cat is punctuation: find which tail it refers to.  A negative
          // value after subtracting one falls back to the symbol's offset
          // from first.
          int pos = cat.non_terminal_index() - 1;
          if (pos < 0)
            pos = prev - first;

          if (pos >= static_cast<int>(tails.size()))
            throw std::runtime_error("invalid tails");

          if (graph.nodes[tails[pos]].edges.empty())
            throw std::runtime_error("no edges");

          const edge_type& edge = graph.edges[graph.nodes[tails[pos]].edges.front()];

          // the edge has tails, meaning that cat is NOT pre-terminal
          if (! edge.tails.empty())
            return prev;

          -- iter;
        }

        return last;
      }
Beispiel #6
0
// Rank a character by class: 3 for a space, 2 for punctuation,
// 0 for an alphabetic character or digit, and 1 for anything else.
static int
breaking_force (char c) {
  int force= 1;
  if (c == ' ')
    force= 3;
  else if (is_punctuation (c))
    force= 2;
  else if (is_iso_alpha (c) || is_digit (c))
    force= 0;
  return force;
}
Beispiel #7
0
/*
** Match str against a character-class pattern.
**
** Pattern characters:
**   'a'  the string character must be a lowercase ASCII letter
**   'A'  the string character must be an uppercase ASCII letter
**   'i'  the string character must be an ASCII digit
**   '.'  the string character must satisfy is_punctuation()
**   '*'  string characters are skipped while the rest of the pattern does
**        not match at the current position
** Any other pattern character makes the match fail.
**
** Returns 1 when the whole pattern was consumed, 0 otherwise.
*/
int         parse_match(char *str, char *pattern)
{
    int i;
    int j;
    
    i = 0;
    j = 0;
    while (str[i] != 0 && pattern[j] != 0)
    {
        if (pattern[j] == 'a' && (str[i] > 'z' || str[i] < 'a'))
            return (0);
        else if (pattern[j] == 'A' && (str[i] > 'Z' || str[i] < 'A'))
            return (0);
        else if (pattern[j] == 'i' && (str[i] > '9' || str[i] < '0'))
            return (0);
        /* the class check applies to the input character, not the pattern */
        else if (pattern[j] == '.' && !is_punctuation(str[i]))
            return (0);
        else if (pattern[j] == '*' && !parse_match(str + i, pattern + j + 1))
            ++i;
        else if (pattern[j] != 'a' && pattern[j] != 'A' && pattern[j] != 'i' &&
            pattern[j] != '.' && pattern[j] != '*')
            return (0);
        else
        {
            ++i;
            ++j;
        }
    }
    return (pattern[j] == 0 ? 1 : 0);
}
Beispiel #8
0
/// \ingroup waUtility 
/// \fn string extract_text( const string &text, const int option, const size_t len )
/// Convert full-width characters to half-width and extract the body text
/// \param text source string
/// \param option filter options, a combination of
/// - EXTRACT_ALPHA filter out letters
/// - EXTRACT_DIGIT filter out digits
/// - EXTRACT_PUNCT filter out punctuation
/// - EXTRACT_SPACE filter out whitespace
/// - EXTRACT_HTML filter out HTML code
/// - the default is EXTRACT_ALL, i.e. all of the above
/// \param len filter length; when greater than 0 only the first len valid
/// characters are kept, default is 0
/// \return the converted and extracted string; empty when the whole source
/// content was filtered out
string extract_text( const string &text, const int option, const size_t len ) {
	// nothing to do for empty input or a non-positive option mask
	if ( text.empty() || option<=0 )
		return text;
	
	string converted = sbc_to_dbc( text );
	
	// strip HTML first; if that was the only request we are done
	if ( option&EXTRACT_HTML )
		converted = extract_html( converted );
	if ( option == EXTRACT_HTML )
		return converted;

	string extracted;
	extracted.reserve( text.length() );
	
	for ( unsigned int i=0; i<converted.length(); ++i ) {
		unsigned char c = converted[i];
		if ( isalpha(c) )
			c = tolower( c );
		
		// byte ranges the original labels as "GBK char"
		const bool in_gbk_range =
			(c>=0x81&&c<=0xFE) || (c>=0x40&&c<=0x7E) || (c>=0xA1&&c<=0xFE);
		const bool keep_as_gbk =
			!is_punctuation(c) && !isalpha(c) && in_gbk_range;
		
		if ( !keep_as_gbk ) {
			// drop the byte when it belongs to a class selected in option
			const bool filtered =
				   ( option&EXTRACT_ALPHA && isalpha(c) )
				|| ( option&EXTRACT_DIGIT && isdigit(c) )
				|| ( option&EXTRACT_PUNCT && (ispunct(c)||is_punctuation(c)) )
				|| ( option&EXTRACT_SPACE && (isspace(c)||isblank(c)) );
			if ( filtered )
				continue;
		}
		
		extracted += c;
		
		// stop once the requested number of characters has been collected
		if ( len>0 && extracted.length()>=len )
			break;
	}
	
	return extracted;
}
Beispiel #9
0
/* Write sptr to h wrapped in underscores, with every character for  */
/* which is_punctuation() is true replaced by an underscore.         */
void	file_symbol( FILE * h, char * sptr )
{
	char *	cursor;
	fprintf(h,"_");
	for ( cursor = sptr; *cursor != 0; cursor++ )
	{
		int	c = *cursor;
		if ( !is_punctuation( c ) )
			fprintf(h,"%c",c);
		else	fprintf(h,"_");
	}
	fprintf(h,"_");
}
Beispiel #10
0
/* Return TRUE when ch belongs to at least one character class (numeric,
 * white space, punctuation, symbol) whose flag is set in input_type;
 * FALSE otherwise. */
BOOL is_valid_nonalphabetic_char( char ch, unsigned short input_type)
{
   const int accepted =
         ( (inputtype_numeric      & input_type) && is_numeric( ch))
      || ( (inputtype_white_spaces & input_type) && is_white_space( ch))
      || ( (inputtype_punctuation  & input_type) && is_punctuation( ch))
      || ( (inputtype_symbols      & input_type) && is_symbol( ch));

   return accepted ? TRUE : FALSE;
}
Beispiel #11
0
/*	Read "<struct_name>.h" and generate "<struct_name>.c".		*/
/*									*/
/*	The header is scanned token by token: punctuation is passed to	*/
/*	handle_punctuation(), everything else to handle_token(), until	*/
/*	the input ends or a handler returns non-zero.  The output is	*/
/*	bracketed by file_header() and file_footer().			*/
/*									*/
/*	Returns the result of failure() when a file name does not fit	*/
/*	its buffer or a file cannot be opened, otherwise 0.		*/
/*	NOTE(review): a non-zero handler status stops the scan but is	*/
/*	not propagated to the caller - confirm this is intended.	*/
int	process( char * struct_name )
{
	int	c;
	int	n;
	char	occi_header_filename[512];
	int	holder;
	char	filter_filename[512];
	char	tn[512];
	char 	token[512];
	FILE * sh;

	/* build the file names with bounds checking: an over-long	*/
	/* struct_name must not overflow the fixed-size buffers		*/
	n = snprintf(occi_header_filename, sizeof(occi_header_filename), "%s.h", struct_name);
	if ( n < 0 || n >= (int) sizeof(occi_header_filename) )
		return( failure(40,"file name too long",struct_name) );
	n = snprintf(filter_filename, sizeof(filter_filename), "%s_occi_filter.h", struct_name);
	if ( n < 0 || n >= (int) sizeof(filter_filename) )
		return( failure(40,"file name too long",struct_name) );
	n = snprintf(tn, sizeof(tn), "%s.c", struct_name);
	if ( n < 0 || n >= (int) sizeof(tn) )
		return( failure(40,"file name too long",struct_name) );

	if (!( sh = fopen( occi_header_filename, "r" )))
		return( failure(40,"file not found",occi_header_filename) );
	else if (!( C.target = fopen( tn, "w" )))
	{
		fclose(sh);
		return( failure(46,"creating file",tn) );
	}
	else
	{
		/* value is stored into C.gensql once generation is done */
		holder = check_cool_cosacs( struct_name );
		file_header( C.target, tn, occi_header_filename, filter_filename);
		while ((c = remove_white_space( sh )))
		{
			if ( is_punctuation(c) )
			{
				if ( handle_punctuation(sh,getch(sh)) )
					break;
			}
			else if (!( get_token(sh,token,sizeof(token)) ))
				break;
			else if ( handle_token(sh,token) )
				break;
		}

		file_footer( C.target, tn );
		fclose(C.target);
		fclose(sh);
		C.gensql = holder;
	}		
	return( 0 );
}
Beispiel #12
0
/*	Read one token from h into token (capacity tlen bytes).		*/
/*									*/
/*	Outside quotes the token ends at white space or punctuation,	*/
/*	which is pushed back with ungetch().  A double quote starts a	*/
/*	quoted token that runs up to the matching closing quote; the	*/
/*	quotes themselves are not stored.  Reading also stops when	*/
/*	getch() returns 0 or when the buffer is full.			*/
/*									*/
/*	At most tlen-1 characters are stored so that the terminating	*/
/*	zero always fits inside the buffer.				*/
/*	Returns the number of characters stored (0 if tlen <= 0).	*/
int	get_token( FILE * h, char * token, int tlen )
{
	int	c;
	int	i=0;
	int	quoting=0;

	/* no room even for the terminator: do not touch the buffer	*/
	if ( tlen <= 0 )
		return(0);

	/* leave one byte free for the terminating zero		*/
	while ( i < (tlen-1) )
	{
		if (!(c = getch(h)))
			break;
		else if ( quoting )
		{
			/* inside quotes: only the closing quote ends it	*/
			if ( c == quoting )
			{
				quoting = 0;
				break;
			}
			*(token+i) = c;
			i++;
		}
		else if ( c == '"' )
		{
			quoting = c;
		}
		else if ( is_white( c ) || is_punctuation( c ) )
		{
			/* delimiter belongs to the caller: push it back	*/
			ungetch( c );
			break;
		}
		else
		{
			*(token+i) = c;
			i++;
		}
	}
	*(token+i) = 0;
	return(i);
}
Beispiel #13
0
/*	Read "<nptr>.h" and generate "<nptr>.xsd".			*/
/*									*/
/*	The header is scanned token by token: punctuation is passed to	*/
/*	handle_punctuation(), everything else to handle_token(), until	*/
/*	the input ends or a handler returns non-zero.  The output is	*/
/*	bracketed by schema_header() and schema_footer().		*/
/*									*/
/*	Returns the result of failure() when a file name does not fit	*/
/*	its buffer or a file cannot be opened, otherwise 0.		*/
/*	NOTE(review): a non-zero handler status stops the scan but is	*/
/*	not propagated to the caller - confirm this is intended.	*/
int	schema( char * nptr )
{
	int	c;
	int	n;
	char	sn[512];
	char	tn[512];
	char 	token[512];
	FILE * sh;

	/* build the file names with bounds checking: an over-long	*/
	/* nptr must not overflow the fixed-size buffers		*/
	n = snprintf(sn, sizeof(sn), "%s.h", nptr);
	if ( n < 0 || n >= (int) sizeof(sn) )
		return( failure(40,"file name too long",nptr) );
	n = snprintf(tn, sizeof(tn), "%s.xsd", nptr);
	if ( n < 0 || n >= (int) sizeof(tn) )
		return( failure(40,"file name too long",nptr) );

	if (!( sh = fopen( sn, "r" )))
		return( failure(40,"file not found",sn) );
	else if (!( C.target = fopen( tn, "w" )))
	{
		fclose(sh);
		return( failure(46,"creating file",tn) );
	}
	else
	{
		schema_header( C.target, tn, sn );
		while ((c = remove_white_space( sh )))
		{
			if ( is_punctuation(c) )
			{
				if ( handle_punctuation(sh,getch(sh)) )
					break;
			}
			else if (!( get_token(sh,token,sizeof(token)) ))
				break;
			else if ( handle_token(sh,token) )
				break;
		}

		schema_footer( C.target, tn );
		fclose(C.target);
		fclose(sh);
	}		
	return( 0 );
}
Beispiel #14
0
// True when c is punctuation according to is_punctuation (), or one of the
// two extra characters '\23' and '\24' (octal escapes, i.e. codes 19 and 20).
// NOTE(review): presumably the French guillemets in the font encoding in
// use -- confirm.
// The obsolete `register' storage class was dropped: it is no longer valid
// in C++17 and had no effect on the generated code.
inline bool
is_french_punctuation (char c) {
  return is_punctuation (c) || (c=='\23') || (c=='\24');
}