CharClass WordSplitState::ClassifyCharacter(PRInt32 aIndex, PRBool aRecurse) const { NS_ASSERTION(aIndex >= 0 && aIndex <= PRInt32(mDOMWordText.Length()), "Index out of range"); if (aIndex == PRInt32(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR; // this will classify the character, we want to treat "ignorable" characters // such as soft hyphens as word characters. nsIUGenCategory::nsUGenCategory charCategory = mWordUtil->GetCategories()->Get(PRUint32(mDOMWordText[aIndex])); if (charCategory == nsIUGenCategory::kLetter || IsIgnorableCharacter(mDOMWordText[aIndex])) return CHAR_CLASS_WORD; // If conditional punctuation is surrounded immediately on both sides by word // characters it also counts as a word character. if (IsConditionalPunctuation(mDOMWordText[aIndex])) { if (!aRecurse) { // not allowed to look around, this punctuation counts like a separator return CHAR_CLASS_SEPARATOR; } // check the left-hand character if (aIndex == 0) return CHAR_CLASS_SEPARATOR; if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) return CHAR_CLASS_SEPARATOR; // now we know left char is a word-char, check the right-hand character if (aIndex == PRInt32(mDOMWordText.Length()) - 1) return CHAR_CLASS_SEPARATOR; if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD) return CHAR_CLASS_SEPARATOR; // char on either side is a word, this counts as a word return CHAR_CLASS_WORD; } // all other punctuation if (charCategory == nsIUGenCategory::kSeparator || charCategory == nsIUGenCategory::kOther || charCategory == nsIUGenCategory::kPunctuation || charCategory == nsIUGenCategory::kSymbol) return CHAR_CLASS_SEPARATOR; // any other character counts as a word return CHAR_CLASS_WORD; }
void WordSplitState::Advance() { NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index"); NS_ASSERTION(mDOMWordOffset < (PRInt32)mDOMWordText.Length(), "Length beyond end"); mDOMWordOffset ++; if (mDOMWordOffset >= (PRInt32)mDOMWordText.Length()) mCurCharClass = CHAR_CLASS_END_OF_INPUT; else mCurCharClass = ClassifyCharacter(mDOMWordOffset, PR_TRUE); }
PRInt32 WordSplitState::FindSpecialWord() { PRInt32 i; // Search for email addresses. We simply define these as any sequence of // characters with an '@' character in the middle. The DOM word is already // split on whitepace, so we know that everything to the end is the address // // Also look for periods, this tells us if we want to run the URL finder. PRBool foundDot = PR_FALSE; PRInt32 firstColon = -1; for (i = mDOMWordOffset; i < PRInt32(mDOMWordText.Length()); i ++) { if (mDOMWordText[i] == '@') { // only accept this if there are unambiguous word characters (don't bother // recursing to disambiguate apostrophes) on each side. This prevents // classifying, e.g. "@home" as an email address // Use this condition to only accept words with '@' in the middle of // them. It works, but the inlinespellcker doesn't like this. The problem // is that you type "fhsgfh@" that's a misspelled word followed by a // symbol, but when you type another letter "fhsgfh@g" that first word // need to be unmarked misspelled. It doesn't do this. it only checks the // current position for potentially removing a spelling range. if (i > 0 && ClassifyCharacter(i - 1, PR_FALSE) == CHAR_CLASS_WORD && i < (PRInt32)mDOMWordText.Length() - 1 && ClassifyCharacter(i + 1, PR_FALSE) == CHAR_CLASS_WORD) return mDOMWordText.Length() - mDOMWordOffset; } else if (mDOMWordText[i] == '.' && ! foundDot && i > 0 && i < (PRInt32)mDOMWordText.Length() - 1) { // we found a period not at the end, we should check harder for URLs foundDot = PR_TRUE; } else if (mDOMWordText[i] == ':' && firstColon < 0) { firstColon = i; } } // If the first colon is followed by a slash, consider it a URL // This will catch things like asdf://foo.com if (firstColon >= 0 && firstColon < (PRInt32)mDOMWordText.Length() - 1 && mDOMWordText[firstColon + 1] == '/') { return mDOMWordText.Length() - mDOMWordOffset; } // Check the text before the first colon against some known protocols. It // is impossible to check against all protocols, especially since you can // plug in new protocols. We also don't want to waste time here checking // against a lot of obscure protocols. if (firstColon > mDOMWordOffset) { nsString protocol(Substring(mDOMWordText, mDOMWordOffset, firstColon - mDOMWordOffset)); if (protocol.EqualsIgnoreCase("http") || protocol.EqualsIgnoreCase("https") || protocol.EqualsIgnoreCase("news") || protocol.EqualsIgnoreCase("ftp") || protocol.EqualsIgnoreCase("file") || protocol.EqualsIgnoreCase("javascript") || protocol.EqualsIgnoreCase("ftp")) { return mDOMWordText.Length() - mDOMWordOffset; } } // not anything special return -1; }
/* class static */ void XMPUtils::SeparateArrayItems ( XMPMeta * xmpObj, XMP_StringPtr schemaNS, XMP_StringPtr arrayName, XMP_OptionBits options, XMP_StringPtr catedStr ) { XMP_Assert ( (schemaNS != 0) && (arrayName != 0) && (catedStr != 0) ); // ! Enforced by wrapper. XMP_VarString itemValue; size_t itemStart, itemEnd; size_t nextSize, charSize = 0; // Avoid VS uninit var warnings. UniCharKind nextKind, charKind = UCK_normal; UniCodePoint nextChar, uniChar = 0; // Extract "special" option bits, verify and normalize the others. bool preserveCommas = false; if ( options & kXMPUtil_AllowCommas ) { preserveCommas = true; options ^= kXMPUtil_AllowCommas; } options = VerifySetOptions ( options, 0 ); // Keep a zero value, has special meaning below. if ( options & ~kXMP_PropArrayFormMask ) XMP_Throw ( "Options can only provide array form", kXMPErr_BadOptions ); // Find the array node, make sure it is OK. Move the current children aside, to be readded later if kept. XMP_ExpandedXPath arrayPath; ExpandXPath ( schemaNS, arrayName, &arrayPath ); XMP_Node * arrayNode = FindNode ( &xmpObj->tree, arrayPath, kXMP_ExistingOnly ); if ( arrayNode != 0 ) { // The array exists, make sure the form is compatible. Zero arrayForm means take what exists. XMP_OptionBits arrayForm = arrayNode->options & kXMP_PropArrayFormMask; if ( (arrayForm == 0) || (arrayForm & kXMP_PropArrayIsAlternate) ) { XMP_Throw ( "Named property must be non-alternate array", kXMPErr_BadXPath ); } if ( (options != 0) && (options != arrayForm) ) XMP_Throw ( "Mismatch of specified and existing array form", kXMPErr_BadXPath ); // *** Right error? } else { // The array does not exist, try to create it. arrayNode = FindNode ( &xmpObj->tree, arrayPath, kXMP_CreateNodes, (options | kXMP_PropValueIsArray) ); if ( arrayNode == 0 ) XMP_Throw ( "Failed to create named array", kXMPErr_BadXPath ); } XMP_NodeOffspring oldChildren ( arrayNode->children ); size_t oldChildCount = oldChildren.size(); arrayNode->children.clear(); // Extract the item values one at a time, until the whole input string is done. Be very careful // in the extraction about the string positions. They are essentially byte pointers, while the // contents are UTF-8. Adding or subtracting 1 does not necessarily move 1 Unicode character! size_t endPos = strlen ( catedStr ); itemEnd = 0; while ( itemEnd < endPos ) { // Skip any leading spaces and separation characters. Always skip commas here. They can be // kept when within a value, but not when alone between values. for ( itemStart = itemEnd; itemStart < endPos; itemStart += charSize ) { ClassifyCharacter ( catedStr, itemStart, &charKind, &charSize, &uniChar ); if ( (charKind == UCK_normal) || (charKind == UCK_quote) ) break; } if ( itemStart >= endPos ) break; if ( charKind != UCK_quote ) { // This is not a quoted value. Scan for the end, create an array item from the substring. for ( itemEnd = itemStart; itemEnd < endPos; itemEnd += charSize ) { ClassifyCharacter ( catedStr, itemEnd, &charKind, &charSize, &uniChar ); if ( (charKind == UCK_normal) || (charKind == UCK_quote) ) continue; if ( (charKind == UCK_comma) && preserveCommas ) continue; if ( charKind != UCK_space ) break; if ( (itemEnd + charSize) >= endPos ) break; // Anything left? ClassifyCharacter ( catedStr, (itemEnd+charSize), &nextKind, &nextSize, &nextChar ); if ( (nextKind == UCK_normal) || (nextKind == UCK_quote) ) continue; if ( (nextKind == UCK_comma) && preserveCommas ) continue; break; // Have multiple spaces, or a space followed by a separator. } itemValue.assign ( catedStr, itemStart, (itemEnd - itemStart) ); } else { // Accumulate quoted values into a local string, undoubling internal quotes that // match the surrounding quotes. Do not undouble "unmatching" quotes. UniCodePoint openQuote = uniChar; UniCodePoint closeQuote = GetClosingQuote ( openQuote ); itemStart += charSize; // Skip the opening quote; itemValue.erase(); for ( itemEnd = itemStart; itemEnd < endPos; itemEnd += charSize ) { ClassifyCharacter ( catedStr, itemEnd, &charKind, &charSize, &uniChar ); if ( (charKind != UCK_quote) || (! IsSurroundingQuote ( uniChar, openQuote, closeQuote)) ) { // This is not a matching quote, just append it to the item value. itemValue.append ( catedStr, itemEnd, charSize ); } else { // This is a "matching" quote. Is it doubled, or the final closing quote? Tolerate // various edge cases like undoubled opening (non-closing) quotes, or end of input. if ( (itemEnd + charSize) < endPos ) { ClassifyCharacter ( catedStr, itemEnd+charSize, &nextKind, &nextSize, &nextChar ); } else { nextKind = UCK_semicolon; nextSize = 0; nextChar = 0x3B; } if ( uniChar == nextChar ) { // This is doubled, copy it and skip the double. itemValue.append ( catedStr, itemEnd, charSize ); itemEnd += nextSize; // Loop will add in charSize. } else if ( ! IsClosingingQuote ( uniChar, openQuote, closeQuote ) ) { // This is an undoubled, non-closing quote, copy it. itemValue.append ( catedStr, itemEnd, charSize ); } else { // This is an undoubled closing quote, skip it and exit the loop. itemEnd += charSize; break; } } } // Loop to accumulate the quoted value. } // Add the separated item to the array. Keep a matching old value in case it had separators. size_t oldChild; for ( oldChild = 0; oldChild < oldChildCount; ++oldChild ) { if ( (oldChildren[oldChild] != 0) && (itemValue == oldChildren[oldChild]->value) ) break; } XMP_Node * newItem = 0; if ( oldChild == oldChildCount ) { newItem = new XMP_Node ( arrayNode, kXMP_ArrayItemName, itemValue.c_str(), 0 ); } else { newItem = oldChildren[oldChild]; oldChildren[oldChild] = 0; // ! Don't match again, let duplicates be seen. } arrayNode->children.push_back ( newItem ); } // Loop through all of the returned items. // Delete any of the old children that were not kept. for ( size_t i = 0; i < oldChildCount; ++i ) { if ( oldChildren[i] != 0 ) delete oldChildren[i]; } } // SeparateArrayItems
/* class static */ void XMPUtils::CatenateArrayItems ( const XMPMeta & xmpObj, XMP_StringPtr schemaNS, XMP_StringPtr arrayName, XMP_StringPtr separator, XMP_StringPtr quotes, XMP_OptionBits options, XMP_StringPtr * catedStr, XMP_StringLen * catedLen ) { XMP_Assert ( (schemaNS != 0) && (arrayName != 0) ); // ! Enforced by wrapper. XMP_Assert ( (separator != 0) && (quotes != 0) && (catedStr != 0) && (catedLen != 0) ); // ! Enforced by wrapper. size_t strLen, strPos, charLen; UniCharKind charKind; UniCodePoint currUCP, openQuote, closeQuote; const bool allowCommas = ((options & kXMPUtil_AllowCommas) != 0); const XMP_Node * arrayNode = 0; // ! Move up to avoid gcc complaints. XMP_OptionBits arrayForm = 0; const XMP_Node * currItem = 0; // Make sure the separator is OK. It must be one semicolon surrounded by zero or more spaces. // Any of the recognized semicolons or spaces are allowed. strPos = 0; strLen = strlen ( separator ); bool haveSemicolon = false; while ( strPos < strLen ) { ClassifyCharacter ( separator, strPos, &charKind, &charLen, &currUCP ); strPos += charLen; if ( charKind == UCK_semicolon ) { if ( haveSemicolon ) XMP_Throw ( "Separator can have only one semicolon", kXMPErr_BadParam ); haveSemicolon = true; } else if ( charKind != UCK_space ) { XMP_Throw ( "Separator can have only spaces and one semicolon", kXMPErr_BadParam ); } }; if ( ! haveSemicolon ) XMP_Throw ( "Separator must have one semicolon", kXMPErr_BadParam ); // Make sure the open and close quotes are a legitimate pair. strLen = strlen ( quotes ); ClassifyCharacter ( quotes, 0, &charKind, &charLen, &openQuote ); if ( charKind != UCK_quote ) XMP_Throw ( "Invalid quoting character", kXMPErr_BadParam ); if ( charLen == strLen ) { closeQuote = openQuote; } else { strPos = charLen; ClassifyCharacter ( quotes, strPos, &charKind, &charLen, &closeQuote ); if ( charKind != UCK_quote ) XMP_Throw ( "Invalid quoting character", kXMPErr_BadParam ); if ( (strPos + charLen) != strLen ) XMP_Throw ( "Quoting string too long", kXMPErr_BadParam ); } if ( closeQuote != GetClosingQuote ( openQuote ) ) XMP_Throw ( "Mismatched quote pair", kXMPErr_BadParam ); // Return an empty result if the array does not exist, hurl if it isn't the right form. sCatenatedItems->erase(); XMP_ExpandedXPath arrayPath; ExpandXPath ( schemaNS, arrayName, &arrayPath ); arrayNode = FindConstNode ( &xmpObj.tree, arrayPath ); if ( arrayNode == 0 ) goto EXIT; // ! Need to set the output pointer and length. arrayForm = arrayNode->options & kXMP_PropCompositeMask; if ( (! (arrayForm & kXMP_PropValueIsArray)) || (arrayForm & kXMP_PropArrayIsAlternate) ) { XMP_Throw ( "Named property must be non-alternate array", kXMPErr_BadParam ); } if ( arrayNode->children.empty() ) goto EXIT; // ! Need to set the output pointer and length. // Build the result, quoting the array items, adding separators. Hurl if any item isn't simple. // Start the result with the first value, then add the rest with a preceeding separator. currItem = arrayNode->children[0]; if ( (currItem->options & kXMP_PropCompositeMask) != 0 ) XMP_Throw ( "Array items must be simple", kXMPErr_BadParam ); *sCatenatedItems = currItem->value; ApplyQuotes ( sCatenatedItems, openQuote, closeQuote, allowCommas ); for ( size_t itemNum = 1, itemLim = arrayNode->children.size(); itemNum != itemLim; ++itemNum ) { const XMP_Node * currItem = arrayNode->children[itemNum]; if ( (currItem->options & kXMP_PropCompositeMask) != 0 ) XMP_Throw ( "Array items must be simple", kXMPErr_BadParam ); XMP_VarString tempStr ( currItem->value ); ApplyQuotes ( &tempStr, openQuote, closeQuote, allowCommas ); *sCatenatedItems += separator; *sCatenatedItems += tempStr; } EXIT: *catedStr = sCatenatedItems->c_str(); *catedLen = sCatenatedItems->size(); } // CatenateArrayItems
static void ApplyQuotes ( XMP_VarString * item, UniCodePoint openQuote, UniCodePoint closeQuote, bool allowCommas ) { bool prevSpace = false; size_t charOffset, charLen; UniCharKind charKind; UniCodePoint uniChar; // ----------------------------------------------------------------------------------------- // See if there are any separators in the value. Stop at the first occurrance. This is a bit // tricky in order to make typical typing work conveniently. The purpose of applying quotes // is to preserve the values when splitting them back apart. That is CatenateContainerItems // and SeparateContainerItems must round trip properly. For the most part we only look for // separators here. Internal quotes, as in -- Irving "Bud" Jones -- won't cause problems in // the separation. An initial quote will though, it will make the value look quoted. charOffset = 0; ClassifyCharacter ( item->c_str(), charOffset, &charKind, &charLen, &uniChar ); if ( charKind != UCK_quote ) { for ( charOffset = 0; size_t(charOffset) < item->size(); charOffset += charLen ) { ClassifyCharacter ( item->c_str(), charOffset, &charKind, &charLen, &uniChar ); if ( charKind == UCK_space ) { if ( prevSpace ) break; // Multiple spaces are a separator. prevSpace = true; } else { prevSpace = false; if ( (charKind == UCK_semicolon) || (charKind == UCK_control) ) break; if ( (charKind == UCK_comma) && (! allowCommas) ) break; } } } if ( size_t(charOffset) < item->size() ) { // -------------------------------------------------------------------------------------- // Create a quoted copy, doubling any internal quotes that match the outer ones. Internal // quotes did not stop the "needs quoting" search, but they do need doubling. So we have // to rescan the front of the string for quotes. Handle the special case of U+301D being // closed by either U+301E or U+301F. XMP_VarString newItem; size_t splitPoint; for ( splitPoint = 0; splitPoint <= charOffset; ++splitPoint ) { ClassifyCharacter ( item->c_str(), splitPoint, &charKind, &charLen, &uniChar ); if ( charKind == UCK_quote ) break; } CodePointToUTF8 ( openQuote, newItem ); newItem.append ( *item, 0, splitPoint ); // Copy the leading "normal" portion. for ( charOffset = splitPoint; size_t(charOffset) < item->size(); charOffset += charLen ) { ClassifyCharacter ( item->c_str(), charOffset, &charKind, &charLen, &uniChar ); newItem.append ( *item, charOffset, charLen ); if ( (charKind == UCK_quote) && IsSurroundingQuote ( uniChar, openQuote, closeQuote ) ) { newItem.append ( *item, charOffset, charLen ); } } XMP_VarString closeStr; CodePointToUTF8 ( closeQuote, closeStr ); newItem.append ( closeStr ); *item = newItem; } } // ApplyQuotes