Пример #1
0
  /**
   * Looks for the "EnglishDocument" sofa, reads it as a byte stream, echoes the
   * bytes to stdout, and adds one annotation of type `annot` per
   * space-delimited token to that view's index repository.
   *
   * @param rCas                   CAS that holds the "EnglishDocument" view.
   * @param crResultSpecification  not used by this method.
   * @return UIMA_ERR_NONE on success;
   *         UIMA_ERR_USER_ANNOTATOR_COULD_NOT_PROCESS when the sofa data stream
   *         is missing, cannot be opened, or the converted text cannot be made
   *         writable for tokenizing.
   */
  TyErrorId process(CAS & rCas, ResultSpecification const & crResultSpecification) {
    cout << "SofaDataAnnotator: process() begins" << endl;

    /** get the CAS view of the sofa */
    CAS * tcas = rCas.getView("EnglishDocument");
    /** get the handle to the index repository */
    FSIndexRepository & indexRep = tcas->getIndexRepository();

    /** get the default text sofa */
    SofaFS textSofa = tcas->getSofa();

    /** get the handle to the sofa data stream (owned here: deleted on every path below) */
    SofaDataStream * pStream = textSofa.getSofaDataStream();
    if (pStream == NULL) {
      // Defensive: never dereference a missing stream.
      cout << "open failed: no sofa data stream" << endl;
      return (TyErrorId)UIMA_ERR_USER_ANNOTATOR_COULD_NOT_PROCESS;
    }

    /** open the stream */
    const int rc = pStream->open();
    if (rc != 0) {
      cout << "open failed "  << rc << endl;
      delete pStream;   // previously leaked on this early return
      return (TyErrorId)UIMA_ERR_USER_ANNOTATOR_COULD_NOT_PROCESS;
    }

    /** get the total stream size */
    const size_t streamSize = pStream->getTotalStreamSizeInBytes();

    /** read file contents into a buffer (pre-filled with '\n', one extra byte at the end) */
    char * pBuffer = new char[streamSize+1];
    memset(pBuffer, '\n', streamSize+1);
    const int elementsize = 1;
    pStream->read(pBuffer, elementsize, streamSize);

    cout << endl;
    cout.write(pBuffer, streamSize);
    cout << endl;

    /** convert to unicode (the extra trailing '\n' byte is converted as well) */
    UnicodeString ustrInputText(pBuffer, streamSize+1, "utf-8");

    /** the stream and the byte buffer are not needed once the text is converted */
    pStream->close();
    delete pStream;
    delete[] pBuffer;

    /*
     * u_strtok_r() writes NULs into its input and needs NUL-terminated
     * strings. UnicodeString::getBuffer() const guarantees neither write
     * access nor termination, so request a writable buffer with room for the
     * terminator and add it explicitly.
     */
    const int32_t textLength = ustrInputText.length();
    UChar * pInputText = ustrInputText.getBuffer(textLength + 1);
    if (pInputText == NULL) {
      cout << "could not obtain a writable text buffer" << endl;
      return (TyErrorId)UIMA_ERR_USER_ANNOTATOR_COULD_NOT_PROCESS;
    }
    pInputText[textLength] = 0;

    /** find tokens and annotate */
    UnicodeString delim(" ");
    const UChar * pDelim = delim.getTerminatedBuffer();
    UChar * myLocalSaveState = NULL;
    const UChar * pToken = pInputText;

    // Prime the tokenizer; the first token is assumed to start at pInputText.
    u_strtok_r(pInputText, pDelim, &myLocalSaveState);

    // NOTE(review): offsets are kept exactly as in the original code
    // (begin starts at 1, end is start + tokenlength - 2) -- confirm the
    // intended offset convention against the consumers of these annotations.
    int start = 1;
    int nTokens = 0;
    const UChar * pNextToken = NULL;
    while ( (pNextToken = u_strtok_r(NULL, pDelim, &myLocalSaveState)) != NULL ) {
      // distance between token starts: token length plus the delimiters that follow it
      const int tokenlength = static_cast<int>(pNextToken - pToken);
      AnnotationFS annotFS = tcas->createAnnotation(annot, start, start+tokenlength-2);
      indexRep.addFS(annotFS);
      ++nTokens;
      start += tokenlength;
      pToken = pNextToken;
    }

    /* last token: runs to the end of the stream */
    // NOTE(review): streamSize is a byte count; it matches the UTF-16 offset
    // only when the input is plain ASCII -- TODO confirm.
    AnnotationFS annotFS = tcas->createAnnotation(annot, start, streamSize);
    indexRep.addFS(annotFS);
    ++nTokens;
    cout << endl << "   Annotated " << nTokens << " tokens." << endl << endl;

    /** hand the internal buffer back to the UnicodeString */
    ustrInputText.releaseBuffer(textLength);

    cout << "SofaDataAnnotator: process() ends" << endl;
    return (TyErrorId)UIMA_ERR_NONE;
  }
Пример #2
0
  /**
   * Annotates every space-delimited word of the "EnglishDocument" view,
   * builds a word-by-word translation as the document text of a newly created
   * "GermanDocument" view, and links each translated word's annotation to the
   * annotation of its source word.
   *
   * Only the first 99 UChars of the English text are processed, and the
   * accumulated translation is limited to 399 UChars. When the translation
   * buffer is full, processing stops; the word that did not fit keeps its
   * English annotation but gets no German counterpart.
   *
   * @param rCAS                   CAS that holds the "EnglishDocument" view
   *                               and receives the "GermanDocument" view.
   * @param crResultSpecification  not used by this method.
   * @return UIMA_ERR_NONE.
   */
  TyErrorId process(CAS & rCAS, ResultSpecification const & crResultSpecification) {
    // Buffer capacities in UChars, each including the terminating NUL.
    const int kWorkCapacity = 100;
    const int kTranslationCapacity = 400;

    // Look for english document and "translate" to German
    cout << "SofaExampleAnnotator: process() begins" << endl;

    // get English view and its document text
    CAS * engTcas = rCAS.getView("EnglishDocument");
    DocumentFS docFS = engTcas->getDocumentAnnotation();
    UnicodeStringRef engText = docFS.getCoveredText();
    cout << "      English Input: " << engText << endl;

    // Create the output German text Sofa and open CAS view
    CAS * germTcas = rCAS.createView("GermanDocument");

    // Make a modifiable copy of the document for u_strtok_r (100 character
    // limit!). The copy length is bounded by the real text length so nothing
    // past the document is read, and the copy is always NUL-terminated, which
    // u_strncpy alone does not do when it copies the full count.
    UChar uWork[kWorkCapacity];
    int copyLength = static_cast<int>(engText.length());
    if (copyLength > kWorkCapacity - 1) {
      copyLength = kWorkCapacity - 1;
    }
    if (copyLength > 0) {
      u_strncpy(uWork, engText.getBuffer(), copyLength);
    }
    uWork[copyLength] = 0;

    // Setup for translated text
    int germBegin = 0;
    int germEnd = 0;
    UChar translation[kTranslationCapacity];
    translation[0] = 0;

    // get two IR handles for adding annotations to the appropriate view
    FSIndexRepository & engIndexRep = engTcas->getIndexRepository();
    FSIndexRepository & germIndexRep = germTcas->getIndexRepository();

    // Parse the English text: a single space, NUL-terminated, is the delimiter
    const UChar uDelim[] = { 0x0020, 0 };
    UChar * myLocalSaveState = NULL;
    UChar * next = u_strtok_r(uWork, uDelim, &myLocalSaveState);

    while (next) {
      // Create annotation on source text
      const int engBegin = static_cast<int>(next - uWork);
      AnnotationFS engAnnot =
        engTcas->createAnnotation(annot, engBegin, engBegin + u_strlen(next));
      engIndexRep.addFS(engAnnot);

      // Translate word-by-word
      const UChar * gword = translate(next);
      const int gwordLength = u_strlen(gword);

      // A separating space is needed once the translation is non-empty.
      const int separatorLength = (germBegin > 0) ? 1 : 0;

      // Never write past the end of the translation buffer (NUL included).
      if (germBegin + separatorLength + gwordLength >= kTranslationCapacity) {
        cout << "   translation buffer full; remaining words are not translated" << endl;
        break;
      }

      // Accumulate the total translated document
      if (separatorLength > 0) {
        u_strncat(translation, uDelim, 1);
        germBegin += 1;
      }
      u_strcat(translation, gword);

      // Create annotation on output text
      germEnd = germBegin + gwordLength;
      AnnotationFS germAnnot = germTcas->createAnnotation(cross, germBegin, germEnd);
      germIndexRep.addFS(germAnnot);
      // add link to English text
      germAnnot.setFSValue(other, engAnnot);
      germBegin = germEnd;

      next = u_strtok_r(NULL, uDelim, &myLocalSaveState);
    }

    // set documentText with accumulated translation
    germTcas->setDocumentText( translation, u_strlen(translation), true );

    cout << "   German(!) Output: " << germTcas->getDocumentText() << endl;

    cout << "SofaExampleAnnotator: process() ends" << endl;
    return (TyErrorId)UIMA_ERR_NONE;
  }