Пример #1
0
// Classifies the character at position aIndex of mDOMWordText as either a
// word character or a separator.
//
//   aIndex   - position to classify; may equal the text length, in which case
//              the (virtual) character past the end is reported as a separator.
//   aRecurse - when true, conditional punctuation may inspect its two
//              neighbours to decide whether it is part of a word; when false
//              conditional punctuation is always a separator.
CharClass
WordSplitState::ClassifyCharacter(PRInt32 aIndex, PRBool aRecurse) const
{
  const PRInt32 textLength = PRInt32(mDOMWordText.Length());
  NS_ASSERTION(aIndex >= 0 && aIndex <= textLength, "Index out of range");

  // One past the last character behaves like a separator.
  if (aIndex == textLength)
    return CHAR_CLASS_SEPARATOR;

  // Look up the general category first. Letters are word characters, and so
  // are "ignorable" characters such as soft hyphens.
  nsIUGenCategory::nsUGenCategory charCategory =
    mWordUtil->GetCategories()->Get(PRUint32(mDOMWordText[aIndex]));
  if (charCategory == nsIUGenCategory::kLetter ||
      IsIgnorableCharacter(mDOMWordText[aIndex]))
    return CHAR_CLASS_WORD;

  if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
    // Conditional punctuation is a word character only when we are allowed to
    // look around and it has a word character immediately on BOTH sides. The
    // neighbours are classified without recursion so the look-around cannot
    // cascade. The tests short-circuit left to right: left edge, left
    // neighbour, right edge, right neighbour.
    if (aRecurse &&
        aIndex != 0 &&
        ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD &&
        aIndex != textLength - 1 &&
        ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
      return CHAR_CLASS_WORD;

    return CHAR_CLASS_SEPARATOR;
  }

  // Remaining categories: separators, "other", punctuation and symbols split
  // words; everything else is treated as part of a word.
  switch (charCategory) {
    case nsIUGenCategory::kSeparator:
    case nsIUGenCategory::kOther:
    case nsIUGenCategory::kPunctuation:
    case nsIUGenCategory::kSymbol:
      return CHAR_CLASS_SEPARATOR;
    default:
      return CHAR_CLASS_WORD;
  }
}
Пример #2
0
void
WordSplitState::Advance()
{
  NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
  NS_ASSERTION(mDOMWordOffset < (PRInt32)mDOMWordText.Length(),
               "Length beyond end");

  mDOMWordOffset ++;
  if (mDOMWordOffset >= (PRInt32)mDOMWordText.Length())
    mCurCharClass = CHAR_CLASS_END_OF_INPUT;
  else
    mCurCharClass = ClassifyCharacter(mDOMWordOffset, PR_TRUE);
}
Пример #3
0
// Checks whether the text from mDOMWordOffset to the end of mDOMWordText looks
// like a "special" word that should be treated as a single unit: an e-mail
// address or a URL.
//
// Returns the number of characters from mDOMWordOffset to the end of the text
// when a special word is detected, or -1 when the text is nothing special.
PRInt32
WordSplitState::FindSpecialWord()
{
  const PRInt32 textLength = PRInt32(mDOMWordText.Length());
  const PRInt32 remainingLength = textLength - mDOMWordOffset;

  // Search for email addresses. We simply define these as any sequence of
  // characters with an '@' character in the middle. The DOM word is already
  // split on whitespace, so we know that everything to the end is the address.
  //
  // While scanning, also remember where the first colon is; it is used below
  // for URL detection.
  PRInt32 firstColon = -1;
  for (PRInt32 i = mDOMWordOffset; i < textLength; ++i) {
    if (mDOMWordText[i] == '@') {
      // Only accept the '@' if there are unambiguous word characters on each
      // side (the neighbours are classified without look-around, so
      // apostrophes and the like are not disambiguated). This prevents
      // classifying, e.g., "@home" as an email address.
      //
      // Known limitation with inline spell checking: typing "fhsgfh@" marks
      // "fhsgfh" as misspelled; typing one more letter ("fhsgfh@g") turns the
      // whole thing into an address, but the earlier misspelling range is not
      // removed because only the current position is re-checked.
      if (i > 0 &&
          ClassifyCharacter(i - 1, PR_FALSE) == CHAR_CLASS_WORD &&
          i < textLength - 1 &&
          ClassifyCharacter(i + 1, PR_FALSE) == CHAR_CLASS_WORD) {
        return remainingLength;
      }
    } else if (mDOMWordText[i] == ':' && firstColon < 0) {
      firstColon = i;
    }
  }

  // If the first colon is followed by a slash, consider it a URL.
  // This will catch things like asdf://foo.com
  if (firstColon >= 0 && firstColon < textLength - 1 &&
      mDOMWordText[firstColon + 1] == '/') {
    return remainingLength;
  }

  // Check the text before the first colon against some known protocols. It
  // is impossible to check against all protocols, especially since you can
  // plug in new protocols. We also don't want to waste time here checking
  // against a lot of obscure protocols.
  if (firstColon > mDOMWordOffset) {
    nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
                      firstColon - mDOMWordOffset));
    if (protocol.EqualsIgnoreCase("http") ||
        protocol.EqualsIgnoreCase("https") ||
        protocol.EqualsIgnoreCase("news") ||
        protocol.EqualsIgnoreCase("ftp") ||
        protocol.EqualsIgnoreCase("file") ||
        protocol.EqualsIgnoreCase("javascript")) {
      return remainingLength;
    }
  }

  // not anything special
  return -1;
}
Пример #4
0
// Splits the catenated string catedStr into individual item values and makes them the children
// of the array property schemaNS:arrayName in xmpObj.
//
//   xmpObj	   - metadata object whose tree is searched for (or extended with) the array node.
//   schemaNS  - schema namespace of the array property; must not be null.
//   arrayName - name of the array property; must not be null.
//   options   - array form bits to use when the array has to be created, optionally combined with
//				 kXMPUtil_AllowCommas. A zero array form means "accept whatever form exists".
//   catedStr  - the UTF-8 input string; must not be null.
//
// Raises (via XMP_Throw):
//   kXMPErr_BadOptions - after normalization, options has bits outside kXMP_PropArrayFormMask.
//   kXMPErr_BadXPath   - the existing node has no array form or is an alternate array, the
//						  requested form differs from the existing one, or the array could not be
//						  created.
//
// Items already in the array whose value equals a newly separated value are reused; old items that
// are not matched are deleted at the end.
/* class static */ void
XMPUtils::SeparateArrayItems ( XMPMeta *	  xmpObj,
							   XMP_StringPtr  schemaNS,
							   XMP_StringPtr  arrayName,
							   XMP_OptionBits options,
							   XMP_StringPtr  catedStr )
{
	XMP_Assert ( (schemaNS != 0) && (arrayName != 0) && (catedStr != 0) );	// ! Enforced by wrapper.
	
	XMP_VarString itemValue;
	size_t itemStart, itemEnd;	// Byte offsets into catedStr delimiting the current item.
	size_t nextSize, charSize = 0;	// Avoid VS uninit var warnings.
	UniCharKind	  nextKind, charKind = UCK_normal;
	UniCodePoint  nextChar, uniChar = 0;
	
	// Extract "special" option bits, verify and normalize the others.
	
	bool preserveCommas = false;
	if ( options & kXMPUtil_AllowCommas ) {
		preserveCommas = true;
		options ^= kXMPUtil_AllowCommas;	// The bit is known to be set, so the XOR clears it.
	}

	options = VerifySetOptions ( options, 0 );	// Keep a zero value, has special meaning below.
	if ( options & ~kXMP_PropArrayFormMask ) XMP_Throw ( "Options can only provide array form", kXMPErr_BadOptions );
	
	// Find the array node, make sure it is OK. Move the current children aside, to be re-added later if kept.
	
	XMP_ExpandedXPath arrayPath;
	ExpandXPath ( schemaNS, arrayName, &arrayPath );
	XMP_Node * arrayNode = FindNode ( &xmpObj->tree, arrayPath, kXMP_ExistingOnly );
	
	if ( arrayNode != 0 ) {
		// The array exists, make sure the form is compatible. Zero arrayForm means take what exists.
		XMP_OptionBits arrayForm = arrayNode->options & kXMP_PropArrayFormMask;
		if ( (arrayForm == 0) || (arrayForm & kXMP_PropArrayIsAlternate) ) {
			XMP_Throw ( "Named property must be non-alternate array", kXMPErr_BadXPath );
		}
		if ( (options != 0) && (options != arrayForm) ) XMP_Throw ( "Mismatch of specified and existing array form", kXMPErr_BadXPath );	// *** Right error?
	} else {
		// The array does not exist, try to create it.
		arrayNode = FindNode ( &xmpObj->tree, arrayPath, kXMP_CreateNodes, (options | kXMP_PropValueIsArray) );
		if ( arrayNode == 0 ) XMP_Throw ( "Failed to create named array", kXMPErr_BadXPath );
	}

	// Take a copy of the child list and empty the array. From here on oldChildren holds the
	// previous items; an entry is zeroed once its node has been put back into the array.
	XMP_NodeOffspring oldChildren ( arrayNode->children );
	size_t oldChildCount = oldChildren.size();
	arrayNode->children.clear();
	
	// Extract the item values one at a time, until the whole input string is done. Be very careful
	// in the extraction about the string positions. They are essentially byte pointers, while the
	// contents are UTF-8. Adding or subtracting 1 does not necessarily move 1 Unicode character!
	
	size_t endPos = strlen ( catedStr );
	
	itemEnd = 0;
	while ( itemEnd < endPos ) {
		
		// Skip any leading spaces and separation characters. Always skip commas here. They can be
		// kept when within a value, but not when alone between values.
		// The loop step uses the charSize that ClassifyCharacter just reported for this position.
		
		for ( itemStart = itemEnd; itemStart < endPos; itemStart += charSize ) {
			ClassifyCharacter ( catedStr, itemStart, &charKind, &charSize, &uniChar );
			if ( (charKind == UCK_normal) || (charKind == UCK_quote) ) break;
		}
		if ( itemStart >= endPos ) break;	// Only separators were left, there is no further item.
		
		if ( charKind != UCK_quote ) {
		
			// This is not a quoted value. Scan for the end, create an array item from the substring.

			for ( itemEnd = itemStart; itemEnd < endPos; itemEnd += charSize ) {

				ClassifyCharacter ( catedStr, itemEnd, &charKind, &charSize, &uniChar );

				// Normal characters, quotes, and (when preserved) commas are part of the value.
				if ( (charKind == UCK_normal) || (charKind == UCK_quote) ) continue;
				if ( (charKind == UCK_comma) && preserveCommas ) continue;
				if ( charKind != UCK_space ) break;	// Any other non-space kind ends the item.

				// A space stays in the value only when the character after it would also be
				// accepted above. Peek at that character without advancing itemEnd.
				if ( (itemEnd + charSize) >= endPos ) break;	// Anything left?
				ClassifyCharacter ( catedStr, (itemEnd+charSize), &nextKind, &nextSize, &nextChar );
				if ( (nextKind == UCK_normal) || (nextKind == UCK_quote) ) continue;
				if ( (nextKind == UCK_comma) && preserveCommas ) continue;
				break;	// Have multiple spaces, or a space followed by a separator.

			}		

			// Copy the bytes in [itemStart, itemEnd) as the item value.
			itemValue.assign ( catedStr, itemStart, (itemEnd - itemStart) );
		
		} else {
		
			// Accumulate quoted values into a local string, undoubling internal quotes that
			// match the surrounding quotes. Do not undouble "unmatching" quotes.
		
			UniCodePoint openQuote = uniChar;
			UniCodePoint closeQuote = GetClosingQuote ( openQuote );

			itemStart += charSize;	// Skip the opening quote;
			itemValue.erase();
			
			for ( itemEnd = itemStart; itemEnd < endPos; itemEnd += charSize ) {

				ClassifyCharacter ( catedStr, itemEnd, &charKind, &charSize, &uniChar );

				if ( (charKind != UCK_quote) || (! IsSurroundingQuote ( uniChar, openQuote, closeQuote)) ) {
				
					// This is not a matching quote, just append it to the item value.
					itemValue.append ( catedStr, itemEnd, charSize );
					
				} else {
				
					// This is a "matching" quote. Is it doubled, or the final closing quote? Tolerate
					// various edge cases like undoubled opening (non-closing) quotes, or end of input.
					
					if ( (itemEnd + charSize) < endPos ) {
						ClassifyCharacter ( catedStr, itemEnd+charSize, &nextKind, &nextSize, &nextChar );
					} else {
						// At the end of input pretend a zero-length semicolon (0x3B) follows.
						nextKind = UCK_semicolon; nextSize = 0; nextChar = 0x3B;
					}
					
					if ( uniChar == nextChar ) {
						// This is doubled, copy it and skip the double.
						itemValue.append ( catedStr, itemEnd, charSize );
						itemEnd += nextSize;	// Loop will add in charSize.
					} else if ( ! IsClosingingQuote ( uniChar, openQuote, closeQuote ) ) {
						// This is an undoubled, non-closing quote, copy it.
						itemValue.append ( catedStr, itemEnd, charSize );
					} else {
						// This is an undoubled closing quote, skip it and exit the loop.
						itemEnd += charSize;
						break;
					}

				}

			}	// Loop to accumulate the quoted value.
		
		}

		// Add the separated item to the array. Keep a matching old value in case it had separators.
		// The search is a linear scan over the old children that have not been reused yet.
		
		size_t oldChild;
		for ( oldChild = 0; oldChild < oldChildCount; ++oldChild ) {
			if ( (oldChildren[oldChild] != 0) && (itemValue == oldChildren[oldChild]->value) ) break;
		}
		
		XMP_Node * newItem = 0;
		if ( oldChild == oldChildCount ) {
			// No old item has this value, create a new node for it.
			newItem = new XMP_Node ( arrayNode, kXMP_ArrayItemName, itemValue.c_str(), 0 );
		} else {
			newItem = oldChildren[oldChild];
			oldChildren[oldChild] = 0;	// ! Don't match again, let duplicates be seen.
		}
		arrayNode->children.push_back ( newItem );
		
	}	// Loop through all of the returned items.

	// Delete any of the old children that were not kept.
	for ( size_t i = 0; i < oldChildCount; ++i ) {
		if ( oldChildren[i] != 0 ) delete oldChildren[i];
	}
	
}	// SeparateArrayItems
Пример #5
0
// Builds one string from the items of the array property schemaNS:arrayName in xmpObj. Each item
// is passed through ApplyQuotes and consecutive items are joined with the separator string.
//
//   separator - must consist of exactly one semicolon character surrounded by zero or more spaces.
//   quotes	   - one quote character (used to open and close) or an opening/closing pair; the
//				 closing quote must be the one GetClosingQuote returns for the opening quote.
//   options   - only kXMPUtil_AllowCommas is examined here; it is forwarded to ApplyQuotes.
//   catedStr, catedLen - receive a pointer to, and the size of, the shared result string
//				 sCatenatedItems.
//
// The result is empty when the array does not exist or has no items. Raises kXMPErr_BadParam (via
// XMP_Throw) for a bad separator or quote string, when the property is not a non-alternate array,
// or when an item is not simple.
/* class static */ void
XMPUtils::CatenateArrayItems ( const XMPMeta & xmpObj,
							   XMP_StringPtr   schemaNS,
							   XMP_StringPtr   arrayName,
							   XMP_StringPtr   separator,
							   XMP_StringPtr   quotes,
							   XMP_OptionBits  options,
							   XMP_StringPtr * catedStr,
							   XMP_StringLen * catedLen )
{
	XMP_Assert ( (schemaNS != 0) && (arrayName != 0) ); // ! Enforced by wrapper.
	XMP_Assert ( (separator != 0) && (quotes != 0) && (catedStr != 0) && (catedLen != 0) ); // ! Enforced by wrapper.

	const bool allowCommas = ((options & kXMPUtil_AllowCommas) != 0);

	size_t		 charLen;
	UniCharKind	 charKind;
	UniCodePoint currUCP, openQuote, closeQuote;

	// Validate the separator: spaces and exactly one semicolon, in any of the recognized forms.

	const size_t sepLen = strlen ( separator );
	bool haveSemicolon = false;

	for ( size_t sepPos = 0; sepPos < sepLen; sepPos += charLen ) {
		ClassifyCharacter ( separator, sepPos, &charKind, &charLen, &currUCP );
		if ( charKind == UCK_semicolon ) {
			if ( haveSemicolon ) XMP_Throw ( "Separator can have only one semicolon",  kXMPErr_BadParam );
			haveSemicolon = true;
		} else if ( charKind != UCK_space ) {
			XMP_Throw ( "Separator can have only spaces and one semicolon",	 kXMPErr_BadParam );
		}
	}
	if ( ! haveSemicolon ) XMP_Throw ( "Separator must have one semicolon",	 kXMPErr_BadParam );

	// Validate the quotes: either a single quote character, or an open/close pair that fills the
	// whole string.

	const size_t quotesLen = strlen ( quotes );
	ClassifyCharacter ( quotes, 0, &charKind, &charLen, &openQuote );
	if ( charKind != UCK_quote ) XMP_Throw ( "Invalid quoting character", kXMPErr_BadParam );

	if ( charLen != quotesLen ) {
		const size_t closePos = charLen;	// The closing quote starts right after the opening one.
		ClassifyCharacter ( quotes, closePos, &charKind, &charLen, &closeQuote );
		if ( charKind != UCK_quote ) XMP_Throw ( "Invalid quoting character", kXMPErr_BadParam );
		if ( (closePos + charLen) != quotesLen ) XMP_Throw ( "Quoting string too long", kXMPErr_BadParam );
	} else {
		closeQuote = openQuote;
	}
	if ( closeQuote != GetClosingQuote ( openQuote ) ) XMP_Throw ( "Mismatched quote pair", kXMPErr_BadParam );

	// Start from an empty result. It stays empty if the array is missing or has no items.

	sCatenatedItems->erase();

	XMP_ExpandedXPath arrayPath;
	ExpandXPath ( schemaNS, arrayName, &arrayPath );

	const XMP_Node * arrayNode = FindConstNode ( &xmpObj.tree, arrayPath );

	if ( arrayNode != 0 ) {

		const XMP_OptionBits arrayForm = arrayNode->options & kXMP_PropCompositeMask;
		if ( (! (arrayForm & kXMP_PropValueIsArray)) || (arrayForm & kXMP_PropArrayIsAlternate) ) {
			XMP_Throw ( "Named property must be non-alternate array", kXMPErr_BadParam );
		}

		if ( ! arrayNode->children.empty() ) {

			// The first item is quoted in place in the result; every later item is quoted in a
			// temporary and appended after a separator.

			const XMP_Node * firstItem = arrayNode->children[0];
			if ( (firstItem->options & kXMP_PropCompositeMask) != 0 ) XMP_Throw ( "Array items must be simple", kXMPErr_BadParam );
			*sCatenatedItems = firstItem->value;
			ApplyQuotes ( sCatenatedItems, openQuote, closeQuote, allowCommas );

			const size_t itemLim = arrayNode->children.size();
			for ( size_t itemNum = 1; itemNum < itemLim; ++itemNum ) {
				const XMP_Node * laterItem = arrayNode->children[itemNum];
				if ( (laterItem->options & kXMP_PropCompositeMask) != 0 ) XMP_Throw ( "Array items must be simple", kXMPErr_BadParam );
				XMP_VarString quotedValue ( laterItem->value );
				ApplyQuotes ( &quotedValue, openQuote, closeQuote, allowCommas );
				*sCatenatedItems += separator;
				*sCatenatedItems += quotedValue;
			}

		}

	}

	// Always hand back the pointer and length, also for the empty result.
	*catedStr = sCatenatedItems->c_str();
	*catedLen = sCatenatedItems->size();

}	// CatenateArrayItems
Пример #6
0
// Wraps *item in the given quotes, in place, if it would otherwise not survive being catenated
// and separated again. Nothing is changed when no quoting is needed.
//
//   item		 - the UTF-8 value to examine and possibly replace with its quoted form.
//   openQuote	 - code point written before the value.
//   closeQuote	 - code point written after the value.
//   allowCommas - when true a comma inside the value does not by itself force quoting.
//
// Quoting is applied when the value starts with a quote character, or contains a semicolon, a
// control character, two spaces in a row, or (unless allowCommas) a comma. Inside the quoted form
// every quote for which IsSurroundingQuote is true is doubled.
static void
ApplyQuotes ( XMP_VarString * item, UniCodePoint openQuote, UniCodePoint closeQuote, bool allowCommas )
{
	bool	prevSpace	= false;
	size_t	charOffset, charLen;
	UniCharKind		charKind;
	UniCodePoint	uniChar;
	
	// -----------------------------------------------------------------------------------------
	// See if there are any separators in the value. Stop at the first occurrence. This is a bit
	// tricky in order to make typical typing work conveniently. The purpose of applying quotes
	// is to preserve the values when splitting them back apart. That is CatenateArrayItems and
	// SeparateArrayItems must round trip properly. For the most part we only look for
	// separators here. Internal quotes, as in -- Irving "Bud" Jones -- won't cause problems in
	// the separation. An initial quote will though, it will make the value look quoted.

	charOffset = 0;
	ClassifyCharacter ( item->c_str(), charOffset, &charKind, &charLen, &uniChar );
	
	if ( charKind != UCK_quote ) {
	
		// Positions are byte offsets into UTF-8 text, so always step by the size of the
		// character that was just classified.
		for ( charOffset = 0; charOffset < item->size(); charOffset += charLen ) {

			ClassifyCharacter ( item->c_str(), charOffset, &charKind, &charLen, &uniChar );

			if ( charKind == UCK_space ) {
				if ( prevSpace ) break; // Multiple spaces are a separator.
				prevSpace = true;
			} else {
				prevSpace = false;
				if ( (charKind == UCK_semicolon) || (charKind == UCK_control) ) break;
				if ( (charKind == UCK_comma) && (! allowCommas) ) break;
			}

		}
	
	}
	
	// charOffset is now the offset of the character that forces quoting (0 for a leading quote),
	// or it has run off the end of the value, meaning no quoting is needed.
	
	if ( charOffset < item->size() ) {
	
		// --------------------------------------------------------------------------------------
		// Create a quoted copy, doubling any internal quotes that match the outer ones. Internal
		// quotes did not stop the "needs quoting" search, but they do need doubling. So we have
		// to rescan the front of the string for quotes. Handle the special case of U+301D being
		// closed by either U+301E or U+301F.
		
		XMP_VarString	newItem;
		size_t			splitPoint;
		
		// Find the first quote at or before charOffset. Step by whole characters: moving one byte
		// at a time would classify UTF-8 continuation bytes and could leave splitPoint in the
		// middle of a multi-byte character. The character at charOffset itself is known to start
		// inside the string, so the classification at splitPoint == charOffset is safe.
		for ( splitPoint = 0; splitPoint <= charOffset; splitPoint += charLen ) {
			ClassifyCharacter ( item->c_str(), splitPoint, &charKind, &charLen, &uniChar );
			if ( charKind == UCK_quote ) break;
		}
		
		CodePointToUTF8 ( openQuote, newItem );
		newItem.append ( *item, 0, splitPoint );	// Copy the leading "normal" portion.

		// Copy the rest one character at a time, writing each surrounding-style quote twice.
		for ( charOffset = splitPoint; charOffset < item->size(); charOffset += charLen ) {
			ClassifyCharacter ( item->c_str(), charOffset, &charKind, &charLen, &uniChar );
			newItem.append ( *item, charOffset, charLen );
			if ( (charKind == UCK_quote) && IsSurroundingQuote ( uniChar, openQuote, closeQuote ) ) {
				newItem.append ( *item, charOffset, charLen );
			}
		}
		
		XMP_VarString closeStr;
		CodePointToUTF8 ( closeQuote, closeStr );
		newItem.append ( closeStr );
		
		*item = newItem;
	
	}
	
}	// ApplyQuotes