Example #1
0
int AddConnection( char *s, Alignments &alignments, std::vector<struct _part> &parts )
{
    int n ;
    int i ;
    int cnt = 0 ;
    for ( i = 0 ; s[i] != ':'; ++i )
    {
        n = n * 10 + s[i] - '0' ;
    }

    //printf( "%s\n", s ) ;
    while ( 1 )
    {
        if ( s[i] == '(')
        {
            struct _part np ;
            int tag ;
            int stage = 0 ;
            tag = i + 1 ;
            for ( i = i + 1 ; s[i] != ')' ; ++i )
            {
                if ( s[i] == ' ' )
                {
                    if ( stage == 0 )
                    {
                        s[i] = '\0' ;
                        np.chrId = alignments.GetChromIdFromName( &s[tag] ) ;
                        s[i] = ' ' ;
                        tag = i + 1 ;
                    }
                    else if ( stage == 1 )
                    {
                        tag = i + 1 ;
                    }
                    else if ( stage == 2 )
                    {
                        s[i] = '\0' ;
                        np.contigId = atoi( &s[tag] ) ;
                        //printf( "%d\n", np.contigId ) ;
                        s[i] = ' ' ;
                    }
                    ++stage ;
                }
            }
            np.strand = s[i - 1] ;

            parts.push_back( np ) ;
        }
        else if ( s[i] == '\0' )
            break ;
        ++i ;
    }
    return n ;
}
Example #2
0
bool AlignmentsNode::getValue (const String& strMemberName, String& strValue)
{
    bool bValueSet = false;
    Alignments* pObject = dynamic_cast<Alignments*>(m_pObject);
    if (strMemberName == L"value")
    {
        ValueObject* pValueObj = dynamic_cast<ValueObject*>(m_pObject);
        if (pValueObj)
        {
            if (!pValueObj->isNothing())
            {
                strValue = pValueObj->toString();
                bValueSet = true;
            }
        }
    }
    else if (strMemberName == L"Desc")
    {
        if (pObject->hasValue_Desc())
        {
            strValue = (StringObjectImpl(pObject->getDesc())).toString();
            bValueSet = true;
        }
    }
    else if (strMemberName == L"Name")
    {
        if (pObject->hasValue_Name())
        {
            strValue = (StringObjectImpl(pObject->getName())).toString();
            bValueSet = true;
        }
    }
    else if (strMemberName == L"State")
    {
        if (pObject->hasValue_State())
        {
            strValue = (EnumStateTypeImpl(pObject->getState())).toString();
            bValueSet = true;
        }
    }
    return bValueSet;
}
Example #3
0
bool AlignmentsNode::setValue (const String& strMemberName, const String* pstrValue)
{
    bool bValueSet = false;
    Alignments* pObject = dynamic_cast<Alignments*>(m_pObject);
    if (strMemberName == L"Desc")
    {
        if (!pstrValue)
        {
            pObject->resetValue_Desc();
        }
        else
        {
            pObject->setDesc(StringObjectImpl::parseString(pstrValue->c_str(), pstrValue->length()));
            bValueSet = true;
        }
    }
    if (strMemberName == L"Name")
    {
        if (!pstrValue)
        {
            pObject->resetValue_Name();
        }
        else
        {
            pObject->setName(StringObjectImpl::parseString(pstrValue->c_str(), pstrValue->length()));
            bValueSet = true;
        }
    }
    if (strMemberName == L"State")
    {
        if (!pstrValue)
        {
            pObject->resetValue_State();
        }
        else
        {
            pObject->setState(EnumStateTypeImpl::parseString(pstrValue->c_str(), pstrValue->length()));
            bValueSet = true;
        }
    }
    return bValueSet;
}
Example #4
0
int main( int argc, char *argv[] )
{
    Alignments alignments ;
    Genome genome ;
    std::vector<int> rascafFileId ;

    char line[2048] ;
    char prefix[512] = "rascaf_scaffold" ;
    int rawAssemblyInd = 1 ;
    FILE *rascafFile ;
    bool contigLevel = false ;
    int i ;
    FILE *outputFile ;
    FILE *infoFile ;

    breakN = 1 ;

    if ( argc < 2 )
    {
        fprintf( stderr, "%s", usage ) ;
        exit( 1 ) ;
    }
    for ( i = 1 ; i < argc ; ++i )
    {
        if ( !strcmp( "-o", argv[i] ) )
        {
            strcpy( prefix, argv[i + 1 ] ) ;
            ++i ;
        }
        else if ( !strcmp( "-ms", argv[i] ) )
        {
            minSupport = atoi( argv[i + 1] ) ;
            ++i ;
        }
        else if ( !strcmp( "-ignoreGap", argv[i] ) )
        {
            ignoreGap = true ;
        }
        else if ( !strcmp( "-r", argv[i] ) )
        {
            rascafFileId.push_back( i + 1 ) ;
            ++i ;
        }
        else
        {
            fprintf( stderr, "Unknown option: %s\n", argv[i] ) ;
            exit( EXIT_FAILURE ) ;
        }

    }

    if ( rascafFileId.size() == 0 )
    {
        fprintf( stderr, "Must use -r to specify rascaf output file.\n" ) ;
        exit( EXIT_FAILURE ) ;
    }

    MAX_NEIGHBOR = 1 + rascafFileId.size() ;

    // Get the bam file.
    rascafFile = fopen( argv[ rascafFileId[0] ], "r" ) ;
    while ( fgets( line, sizeof( line ), rascafFile ) != NULL )
    {
        if ( strstr( line, "command line:" ) )
        {
            char *p ;
            char buffer[512] ;

            p = strstr( line, "-breakN" ) ;
            if ( p != NULL )
            {
                p += 7 ;
                while ( *p == ' ' )
                    ++p ;
                for ( i = 0 ; *p && *p != ' ' ; ++p, ++i )
                    buffer[i] = *p ;
                buffer[i] = '\0' ;
                breakN = atoi( buffer ) ;
            }

            p = strstr( line, "-b" ) ;
            if ( p == NULL )
            {
                fprintf( stderr, "Could not find the bam file specified by -b in Rascaf.\n" ) ;
                exit( 1 ) ;
            }
            p += 2 ;
            while ( *p == ' ' )
                ++p ;
            for ( i = 0 ; *p && *p != ' ' ; ++p, ++i )
                buffer[i] = *p ;
            buffer[i] = '\0' ;
            alignments.Open( buffer ) ;

            p = strstr( line, "-f") ;
            if ( p == NULL )
            {
                fprintf( stderr, "Could not find the raw assembly file specified by -f in Rascaf.\n" ) ;
                exit( 1 ) ;
            }
            p += 2 ;
            while ( *p == ' ' )
                ++p ;
            for ( i = 0 ; *p && *p != ' ' && *p != '\n' ; ++p, ++i )
                buffer[i] = *p ;
            buffer[i] = '\0' ;
            fprintf( stderr, "Found raw assembly file: %s\n", buffer ) ;
            genome.Open( alignments, buffer ) ;

            break ;
        }
    }
    fclose( rascafFile ) ;
    // Parse the input.
    for ( unsigned int fid = 0 ; fid < rascafFileId.size() ; ++fid )
    {
        rascafFile = fopen( argv[ rascafFileId[fid] ], "r" ) ;
        bool start = false ;
        int tag ;
        while ( fgets( line, sizeof( line ), rascafFile ) != NULL )
        {
            if ( strstr( line, "command line:" ) )
            {
                start = true ;
                if ( strstr( line, "-f" ) )
                {
                    contigLevel = true ;
                }

                continue ;
            }

            if ( !start )
                continue ;

            if ( !strcmp( line, "WARNINGS:\n" ) )
                break ;

            std::vector<struct _part> nparts  ;
            if ( line[0] >= '0' && line[0] <= '9' )
            {
                AddConnection( line, alignments, nparts ) ;
                connects.push_back( nparts ) ;
                tag = 0 ;
            }
            else if ( line[0] == '\t' || line[0] == ' ' )
            {
                // Break the nparts if the support is too low.
                int num = 0 ;
                for ( i = 0 ; line[i] < '0' || line[i] > '9' ; ++i )
                    ;
                for ( ; line[i] >= '0' && line[i] <= '9' ; ++i )
                    num = num * 10 + line[i] - '0' ;
                ++tag ;
                if ( num < minSupport )
                {
                    nparts = connects.back() ;
                    connects.pop_back() ;
                    int size = nparts.size() ;

                    std::vector<struct _part> newNParts ;
                    for ( i = 0 ; i < tag ; ++i )
                        newNParts.push_back( nparts[i] ) ;
                    if ( newNParts.size() > 1 )
                        connects.push_back( newNParts ) ;

                    newNParts.clear() ;
                    for ( ; i < size ; ++i  )
                        newNParts.push_back( nparts[i] ) ;
                    if ( newNParts.size() > 1 )
                        connects.push_back( newNParts ) ;

                    tag = 0 ;
                }
            }
        }
        fclose( rascafFile ) ;
    }
    if ( contigLevel == false )
    {
        genome.SetIsOpen( contigLevel ) ;
    }

    // Build the graph
    int contigCnt = genome.GetContigCount() ;
    int edgeCnt = 0 ;
    int csize = connects.size() ;

    for ( i = 0 ; i < csize ; ++i )
        edgeCnt += connects[i].size() ;

    ContigGraph contigGraph( contigCnt, contigCnt + edgeCnt ) ;
    for ( i = 0 ; i < contigCnt - 1 ; ++i )
    {
        if ( genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( i + 1 ) )
        {
            contigGraph.AddEdge( i, 1, i + 1, 0 ) ;
        }
    }
    for ( i = 0 ; i < csize ; ++i )
    {
        std::vector<struct _part> &parts = connects[i] ;
        int size = parts.size() ;
        for ( int j = 0 ; j < size - 1 ; ++j )
        {
            struct _part &a = parts[j] ;
            struct _part &b = parts[j + 1] ;

            // Two dummy nodes for each contig. Left is 0, right is 1
            int dummyU = 0 ;
            int dummyV = 0 ;
            if ( a.strand == '+' )
                dummyU = 1 ;
            if ( b.strand == '-' )
                dummyV = 1 ;
            contigGraph.AddEdge( a.contigId, dummyU, b.contigId, dummyV, true ) ;
        }
    }

    // Check the cycles in the contig graph. This may introduces when combining different rascaf outputs.
    int *visitTime = new int[contigCnt] ;
    struct _pair *neighbors = new struct _pair[ MAX_NEIGHBOR ] ;

    bool *isInCycle = new bool[contigCnt] ;
    std::vector<int> cycleNodes ;
    memset( visitTime, -1, sizeof( int ) * contigCnt ) ;
    memset( isInCycle, false, sizeof( bool ) * contigCnt ) ;
    for ( i = 0 ; i < contigCnt ; ++i )
    {
        if ( isInCycle[i] )
            continue ;
        if ( contigGraph.IsInCycle( i, cycleNodes, visitTime ) )
        {
            int cnt = cycleNodes.size() ;
            //printf( "===\n") ;
            for ( int j = 0 ; j < cnt ; ++j )
            {
                //printf( "In cycle %d\n", cycleNodes[j] ) ;
                isInCycle[ cycleNodes[j] ] = true ;
            }
        }
    }
    //exit( 1 ) ;
    // Remove the connected edges involving the nodes in the cycle
    for ( i = 0 ; i < contigCnt ; ++i )
    {
        if ( isInCycle[i] )
        {
            for ( int dummy = 0 ; dummy <= 1 ; ++dummy )
            {
                int ncnt = contigGraph.GetNeighbors( i, dummy, neighbors, MAX_NEIGHBOR ) ;
                for ( int j = 0 ; j < ncnt ; ++j )
                {
                    if ( neighbors[j].a == i + 2 * dummy - 1 && neighbors[j].b != dummy
                            && genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( neighbors[j].a ) )
                        continue ; // the connection created by the raw assembly
                    else
                        contigGraph.RemoveEdge( i, dummy, neighbors[j].a, neighbors[j].b ) ;
                }
            }
        }
    }
    //delete[] isInCycle ;
    //printf( "hi: %d %d\n", __LINE__, contigCnt ) ;
    //printf( "%d %d\n", contigGraph.GetNeighbors( 0, 0, neighbors, MAX_NEIGHBOR ), contigGraph.GetNeighbors( 0, 1, neighbors, MAX_NEIGHBOR ) ) ;
    // Sort the scaffolds from fasta file, so that longer scaffold come first
    int scafCnt = genome.GetChrCount() ;
    struct _pair *scafInfo = new struct _pair[scafCnt] ;
    memset( scafInfo, -1, sizeof( struct _pair) * scafCnt ) ;
    for ( i = 0 ; i < contigCnt ; ++i )
    {
        int chrId = genome.GetChrIdFromContigId( i ) ;
        if ( scafInfo[chrId].a == -1 )
        {
            scafInfo[ chrId ].a = i ;
            scafInfo[ chrId ].b = genome.GetChrLength( chrId ) ;
        }
    }
    qsort( scafInfo, scafCnt, sizeof( struct _pair ), CompScaffold ) ;

    // Merge the branches and build the scaffold
    ContigGraph scaffold( contigCnt, 2 * contigCnt ) ;

    // Use a method similar to topological sort
    bool *used = new bool[contigCnt] ;
    int *degree = new int[2 *contigCnt] ;
    int *danglingVisitTime = new int[contigCnt] ;
    int *counter = new int[contigCnt] ;
    int *visitDummy = new int[ contigCnt ] ;
    int *buffer = new int[contigCnt] ;
    int *buffer2 = new int[contigCnt] ;
    bool *isInQueue = new bool[ contigCnt ] ;
    int *chosen = new int[contigCnt] ;
    int chosenCnt ;

    memset( isInCycle, false, sizeof( bool ) * contigCnt ) ;
    memset( visitTime, -1, sizeof( int ) * contigCnt ) ;
    memset( visitDummy, -1, sizeof( int ) * contigCnt ) ;
    memset( counter, -1, sizeof( int ) * contigCnt ) ;

    // Use those memory to remove triangular cycles
    for ( i = 0 ; i < scafCnt ; ++i )
    {
        int from, to ;
        if ( scafInfo[i].a == -1 )
            continue ;
        genome.GetChrContigRange( genome.GetChrIdFromContigId( scafInfo[i].a ), from, to ) ;
        ForwardSearch( from, 0, i, visitTime, counter, visitDummy, contigGraph ) ;
        chosenCnt = 0 ;
        BackwardSearchForTriangularCycle( to, 1, i, visitTime, counter, visitDummy, contigGraph, chosen, chosenCnt ) ;

        for ( int j = 0 ; j < chosenCnt ; ++j )
        {
            //printf( "%d\n", chosen[j] ) ;
            isInCycle[ chosen[j] ] = true ;
        }
    }

    for ( i = 0 ; i < contigCnt ; ++i )
    {
        if ( isInCycle[i] )
        {
            for ( int dummy = 0 ; dummy <= 1 ; ++dummy )
            {
                int ncnt = contigGraph.GetNeighbors( i, dummy, neighbors, MAX_NEIGHBOR ) ;
                for ( int j = 0 ; j < ncnt ; ++j )
                {
                    if ( neighbors[j].a == i + 2 * dummy - 1 && neighbors[j].b != dummy
                            && genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( neighbors[j].a ) )
                        continue ; // the connection created by the raw assembly
                    else
                        contigGraph.RemoveEdge( i, dummy, neighbors[j].a, neighbors[j].b ) ;
                }
            }
        }
    }


    memset( used, false, sizeof( bool ) * contigCnt ) ;
    memset( visitTime, -1, sizeof( int ) * contigCnt ) ;
    memset( visitDummy, -1, sizeof( int ) * contigCnt ) ;
    memset( danglingVisitTime, -1, sizeof( int ) * contigCnt ) ;
    memset( counter, -1, sizeof( int ) * contigCnt ) ;
    memset( isInQueue, false, sizeof( bool ) * contigCnt ) ;
    ContigGraph newGraph( contigCnt, edgeCnt ) ;

    // Compute the gap size
    int *gapSize = new int[contigCnt] ;
    for ( i = 0 ; i < contigCnt - 1 ; ++i )
    {
        if ( genome.GetChrIdFromContigId( i ) == genome.GetChrIdFromContigId( i + 1 ) )
        {
            struct _contig c1 = genome.GetContigInfo( i ) ;
            struct _contig c2 = genome.GetContigInfo( i + 1 ) ;
            gapSize[i] = c2.start - c1.end - 1 ;
        }
        else
            gapSize[i] = -1 ;
    }

    // Start search
    int ncnt ;
    struct _pair *queue = new struct _pair[ contigCnt ] ;
    int head = 0, tail ;
    int danglingTime = 0 ;

    // Pre-allocate the subgraph.
    ContigGraph subgraph( contigCnt, 3 * contigCnt ) ;
    for ( i = 0 ; i < scafCnt ; ++i )
    {
        //if ( used[144281] == true )
        //	printf( "changed %d %d\n", i, scafInfo[i - 1].a ) ;
        if ( scafInfo[i].a == -1 )
            continue ;
        int from, to ;
        genome.GetChrContigRange( genome.GetChrIdFromContigId( scafInfo[i].a ), from, to ) ;
        //printf( "%d: %d %d %d\n", i, scafInfo[i].b, from, to ) ;
        ForwardSearch( from, 0, i, visitTime, counter, visitDummy, contigGraph ) ;
        chosenCnt = 0 ;
        BackwardSearch( to, 1, i, visitTime, counter, contigGraph, chosen, chosenCnt ) ;

        /*printf( "%s %d (%d %d) %d\n", alignments.GetChromName( genome.GetChrIdFromContigId( scafInfo[i].a ) ), i, from, to, chosenCnt ) ;
        if ( chosenCnt > 1 )
        {
        	printf( "=== " ) ;
        	for ( int j = 0 ; j < chosenCnt ; ++j )
        		printf( "%d ", chosen[j] ) ;
        	printf( "\n" ) ;
        }*/

        for ( int j = 0 ; j < chosenCnt ; ++j )
        {
            ncnt = contigGraph.GetNeighbors( chosen[j], 0, neighbors, MAX_NEIGHBOR ) ;
            //printf( "%d %d %d: %d %d %d\n", j, chosen[j], ncnt, neighbors[0].a, visitTime[ neighbors[0].a ],
            //	counter[neighbors[0].a ] ) ;
            for ( int k = 0 ; k < ncnt ; ++k )
            {
                //if ( i == 639 )
                //	printf( "Neighbor from 0 %d: %d %d\n", k, neighbors[k].a, neighbors[k].b ) ;
                if ( visitTime[ neighbors[k].a ] == 2 * i + 1 && counter[neighbors[k].a ] == 2 )
                {
                    subgraph.AddEdge( chosen[j], 0, neighbors[k].a, neighbors[k].b, true ) ;
                    //printf( "subgraph: (%d %d)=>(%d %d)\n", chosen[j], 0, neighbors[k].a, neighbors[k].b ) ;
                }
            }

            ncnt = contigGraph.GetNeighbors( chosen[j], 1, neighbors, MAX_NEIGHBOR ) ;
            for ( int k = 0 ; k < ncnt ; ++k )
            {
                //if ( i == 639 )
                //	printf( "Neighbor from 1 %d: %d %d\n", k, neighbors[k].a, neighbors[k].b ) ;
                if ( visitTime[ neighbors[k].a ] == 2 * i + 1 && counter[neighbors[k].a ] == 2 )
                {
                    subgraph.AddEdge( chosen[j], 1, neighbors[k].a, neighbors[k].b, true ) ;
                    //printf( "subgraph: (%d %d)=>(%d %d)\n", chosen[j], 1, neighbors[k].a, neighbors[k].b ) ;
                }
            }
        }

        // Initialize the degree counter
        for ( int j = 0 ; j < chosenCnt ; ++j )
        {
            for ( int l = 0 ; l < 2 ; ++l )
            {
                /*if ( i == 6145 )
                {
                	std::vector<struct _pair> neighbors ;
                	ncnt = subgraph.GetNeighbors( chosen[j], l, neighbors ) ;
                	printf( "%d ncnt=%d\n", l, ncnt ) ;
                }*/

                ncnt = subgraph.GetNeighbors( chosen[j], l, neighbors, MAX_NEIGHBOR ) ;
                degree[ 2 * chosen[j] + l ] = ncnt ;
            }
        }

        // "topological" sort
        head = 0 ;
        isInQueue[from] = true ;
        queue[0].a = from ;
        queue[0].b = 0 ;
        tail = 1 ;
        int prevTag = -1 ;
        int *prevAdd = buffer ; // reuse counter to save some memory.
        int *nextAdd = buffer2 ;
        int firstAdd = -1 ;

        while ( head < tail )
        {
            int tailTag = tail ;
            for ( int j = head ; j < tailTag ; ++j )
            {
                nextAdd[j] = -1 ;
                if ( !used[ queue[j].a ] )
                {
                    used[ queue[j].a ] = true ;
                    if ( prevTag != -1 )
                    {
                        scaffold.AddEdge( queue[ prevTag].a, 1 - queue[prevTag].b, queue[j].a, queue[j].b ) ;
                        nextAdd[ prevTag ] = j ;

                        /*if ( i == 639 )
                        	printf( "(%lld %lld)=>(%lld %lld)\n", queue[ prevTag].a, 1 - queue[prevTag].b, queue[j].a, queue[j].b ) ;*/
                    }
                    else
                        firstAdd = j ;

                    prevTag = j ;
                }
                prevAdd[j] = prevTag ; // the most recent(<=) queue id when added to scaffold.

                ncnt = subgraph.GetNeighbors( queue[j].a, 1 - queue[j].b, neighbors, MAX_NEIGHBOR ) ;
                for ( int k = 0 ; k < ncnt ; ++k )
                {
                    --degree[ 2 * neighbors[k].a + neighbors[k].b ] ;
                    if ( degree[ 2 * neighbors[k].a + neighbors[k].b ] == 0 && !isInQueue[neighbors[k].a] )
                    {
                        isInQueue[ neighbors[k].a ] = true ;
                        queue[ tail ] = neighbors[k] ; // Interesting assignment, I think.
                        ++tail ;
                        /*if ( i == 639 )
                        	printf( "pushed in queue: %d\n", neighbors[k].a ) ;*/
                        // Put the consecutive contigs together.
                        struct _pair testNeighbors[ MAX_NEIGHBOR ] ;
                        struct _pair tag ;
                        tag = neighbors[k] ;
                        while ( 1 )
                        {
                            if ( contigGraph.GetNeighbors( tag.a, 1 - tag.b, testNeighbors, MAX_NEIGHBOR ) != 1 )
                                break ;
                            int n = subgraph.GetNeighbors( tag.a, 1 - tag.b, testNeighbors, MAX_NEIGHBOR ) ;
                            if ( n != 1 )
                                break ;
                            //printf( "%d %d\n", n, testNeighbors[0].a ) ;

                            struct _pair backNeighbors[ MAX_NEIGHBOR ] ;
                            if ( contigGraph.GetNeighbors( testNeighbors[0].a, testNeighbors[0].b, backNeighbors, MAX_NEIGHBOR ) != 1 )
                                break ;
                            n = subgraph.GetNeighbors( testNeighbors[0].a, testNeighbors[0].b, backNeighbors, MAX_NEIGHBOR ) ;
                            if ( n != 1 )
                                break ;
                            isInQueue[ testNeighbors[0].a ] = true ;
                            queue[tail] = testNeighbors[0] ;
                            ++tail ;
                            /*if ( i == 639 )
                            	printf( "pushed in queue: %d\n", testNeighbors[0].a ) ;*/
                            tag = testNeighbors[0] ;
                        }
                    }
                }
            }

            head = tailTag ;
        }
        // Remove the effect on the subgraph.
        /*if ( tail != chosenCnt )
        {
        	printf( "WARNING: not matched\n" ) ;
        	exit( 1 ) ;
        }*/
        for ( int j = 0 ; j < tail ; ++j )
        {
            visitDummy[ queue[j].a ] = -1 ;
            counter[ queue[j].a ] = -1 ;
            subgraph.RemoveAdjacentEdges( queue[j].a ) ;
            isInQueue[ queue[j].a ] = false ;
        }
        subgraph.ResetEdgeUsed() ;

        // no point is picked
        if ( prevTag == -1 )
        {
            continue ;
        }

        // Update the gap size
        prevTag = -1 ;
        for ( int j = 0 ; j < tail - 1 ; ++j )
        {
            if ( genome.GetChrIdFromContigId( queue[j].a ) == genome.GetChrIdFromContigId( from ) )
                prevTag = queue[j].a ;
            else if ( prevTag != -1 )
            {
                struct _contig c = genome.GetContigInfo( queue[j].a ) ;
                gapSize[prevTag] -= ( c.end - c.start + 1) ;
            }
        }
        // Add the dangling contigs. Use the fact that the queue holding the contigs in the same order as in the scaffold.
        // 5'->3' dangling
        int *chosenDummy = degree ;
        for ( int j = tail - 1 ; j >= 0 ; --j )
        {
            //if ( j < tail - 1 )
            //	continue ;
            chosenCnt = 0 ;
            //if ( queue[j].a == 0 )
            //	printf( "Dummy: %d %d %d\n", j, queue[j].b, 1 - queue[j].b ) ;
            SearchDangling( queue[j].a, queue[j].b, used, danglingTime, danglingVisitTime, contigGraph, false, chosen, chosenDummy, chosenCnt, genome ) ;
            ++danglingTime ;
            int prevTag = prevAdd[j] ;
            /*if ( queue[j].a == 0 )
            {
            	struct _pair neighbors[5] ;
            	int ncnt = contigGraph.GetNeighbors( queue[j].a, 1 - queue[j].b, neighbors, 5 ) ;
            	printf( "%d %d %d %d: %d %d\n", queue[j].b, chosenCnt, prevTag, ncnt, neighbors[0].a, used[ neighbors[0].a ] ) ;
            }*/
            if ( prevTag == -1 )
                break ;

            // Trim the dangling list
            int k = chosenCnt - 1 ;
            if ( j > 0 && j < tail - 1 )
            {
                for ( k = chosenCnt - 1 ; k >= 1 ; --k )
                    if ( genome.GetChrIdFromContigId( chosen[k] ) != genome.GetChrIdFromContigId( chosen[k - 1] ) )
                        break ;
            }

            // Test the gap size
            int len = 0 ;
            for ( int l = 0 ; l <= k ; ++l )
            {
                struct _contig c = genome.GetContigInfo( chosen[k] ) ;
                len += c.end - c.start + 1 ;
            }

            if ( j < tail - 1 )
            {
                int l ;
                for ( l = j ; l >= 0 ; --l )
                    if ( genome.GetChrIdFromContigId( queue[l].a ) == genome.GetChrIdFromContigId( from ) )
                        break ;

                if ( !ignoreGap && len >= gapSize[ queue[l].a ] + 100 )
                    continue ;
                else
                    gapSize[ queue[l].a ] -= len ;
            }


            for ( ; k >= 0 ; --k )
            {
                used[ chosen[k] ] = true ;
                //printf( "Dangling 1: %d=>%d\n", queue[prevTag].a, chosen[k] ) ;
                scaffold.InsertNode( queue[ prevTag ].a, 1 - queue[ prevTag ].b, chosen[k], chosenDummy[k] ) ;
            }
        }

        // 3'->5' dangling
        for ( int j = 0 ; j < tail ; ++j )
        {
            //if ( j > 0 )
            //	continue ;
            chosenCnt = 0 ;
            SearchDangling( queue[j].a, 1 - queue[j].b, used, danglingTime, danglingVisitTime, contigGraph, false, chosen, chosenDummy, chosenCnt, genome ) ;
            ++danglingTime ;

            int prevTag = prevAdd[j] ;
            int nextTag ;
            if ( prevTag == -1 || j <= firstAdd )
                nextTag = firstAdd ;
            else if ( j == prevTag )
                nextTag = j ;
            else
                nextTag = nextAdd[ prevTag ] ;
            if ( nextTag == -1 )
                break ;
            /*if ( queue[j].a == 37549 )
            {
            	struct _pair neighbors[5] ;
            	int ncnt = contigGraph.GetNeighbors( queue[j].a, queue[j].b, neighbors, 5 ) ;
            	fprintf( stderr, "%d %d %d: %d %d %d: %d %d %d\n", j, queue[j].a, queue[j].b, chosenCnt, nextTag, ncnt, chosen[0], chosenDummy[0], used[ chosen[0] ] ) ;
            }*/

            // trim the danling list
            int k = chosenCnt - 1 ;
            if ( j < tail - 1 && j > 0 )
            {
                for ( k = chosenCnt - 1 ; k >= 1 ; --k )
                    if ( genome.GetChrIdFromContigId( chosen[k] ) != genome.GetChrIdFromContigId( chosen[k - 1] ) )
                        break ;
            }

            // Test the gap size
            int len = 0 ;
            for ( int l = 0 ; l <= k ; ++l )
            {
                struct _contig c = genome.GetContigInfo( chosen[k] ) ;
                len += c.end - c.start + 1 ;
            }

            if ( j > 0 )
            {
                int l ;
                for ( l = j - 1 ; l >= 0 ; --l ) // Notice the j-1 here, because we want the gap strictly before current contig
                    if ( genome.GetChrIdFromContigId( queue[l].a ) == genome.GetChrIdFromContigId( from ) )
                        break ;

                if ( !ignoreGap && len >= gapSize[ queue[l].a ] + 100 )
                    continue ;
                else
                    gapSize[ queue[l].a ] -= len ;
            }

            for ( ; k >= 0 ; --k )
            {
                used[ chosen[k] ] = true ;
                scaffold.InsertNode( queue[nextTag].a, queue[nextTag].b, chosen[k], chosenDummy[k] ) ;
                //printf( "Dangling 2: %d<=%d\n", queue[nextTag].a, chosen[k] ) ;
                //if ( chosen[k] == 10246 )
                //	printf( "hi %d %d %d %d\n", j, queue[j].a, k, chosen[k] ) ;
            }
        }
    }
    //return 0 ;

    // Output the scaffold
    int id = 0 ;
    char infoFileName[512] ;
    char outputFileName[512] ;
    sprintf( infoFileName, "%s.info", prefix ) ;
    sprintf( outputFileName, "%s.fa", prefix ) ;

    outputFile = fopen( outputFileName, "w" ) ;
    infoFile = fopen( infoFileName, "w") ;

    memset( used, false, sizeof( bool ) * contigCnt ) ;
    for ( i = 0 ; i < contigCnt ; ++i )
    {
        //printf( "%d (%s)\n", i, alignments.GetChromName( genome.GetChrIdFromContigId( i ) )  ) ; fflush( stdout ) ;
        /*if ( i == 10246 )
        {
        	std::vector<struct _pair> neighbors ;
        	scaffold.GetNeighbors( i, 0, neighbors ) ;
        	printf( "%u\n", neighbors.size() ) ;
        }*/
        if ( used[i] )
            continue ;
        int ncnt1 = scaffold.GetNeighbors( i, 0, neighbors, MAX_NEIGHBOR ) ;
        int ncnt2 = scaffold.GetNeighbors( i, 1, neighbors, MAX_NEIGHBOR ) ;
        if ( ncnt1 == 0 || ncnt2 == 0 ) // The end of a scaffold
        {
            fprintf( outputFile, ">scaffold_%d\n", id) ;
            fprintf( infoFile, ">scaffold_%d", id ) ;
            ++id ;
            int p = i ;
            int dummyP = 1 ;
            if ( ncnt1 == 0 )
                dummyP = 0 ;

            used[i] = true ;
            genome.PrintContig( outputFile, i, dummyP ) ;
            fprintf( infoFile, " (%s %d %c)", alignments.GetChromName( genome.GetChrIdFromContigId( p ) ), p, dummyP == 0 ? '+' : '-' ) ;
            while ( 1 )
            {
                ncnt = scaffold.GetNeighbors( p, 1 - dummyP, neighbors, MAX_NEIGHBOR ) ;
                if ( ncnt == 0 )
                    break ;
                // ncnt must be 1
                int insertN = 17 ;

                if ( genome.GetChrIdFromContigId( p ) == genome.GetChrIdFromContigId( neighbors[0].a ) )
                {
                    struct _contig cp, cna ;
                    cp = genome.GetContigInfo( p ) ;
                    cna = genome.GetContigInfo( neighbors[0].a ) ;
                    if ( p < neighbors[0].a )
                        insertN = cna.start - cp.end - 1 ;
                    else if ( p > neighbors[0].a )
                        insertN = cp.start - cna.end - 1 ;

                }

                p = neighbors[0].a ;
                dummyP = neighbors[0].b ;
                for ( int j = 0 ; j < insertN ; ++j )
                    fprintf( outputFile, "N" ) ;
                used[p] = true ;
                genome.PrintContig( outputFile, p, dummyP ) ;

                fprintf( infoFile, " (%s %d %c)", alignments.GetChromName( genome.GetChrIdFromContigId( p ) ), p, dummyP == 0 ? '+' : '-' ) ;
            }
            fprintf( outputFile, "\n" ) ;
            fprintf( infoFile, "\n" ) ;
        }
    }

    for ( i = 0 ; i < contigCnt ; ++i )
        if ( !used[i] )
        {
            fprintf( stderr, "Unreported contig %d.\n", i ) ;
        }
    fclose( outputFile ) ;
    fclose( infoFile ) ;

    delete[] buffer ;
    delete[] buffer2 ;
    delete[] chosen ;
    delete[] queue ;
    delete[] counter ;
    delete[] visitTime ;
    delete[] used ;
    delete[] scafInfo ;
    delete[] isInQueue ;
    delete[] gapSize ;

    //fclose( rascafFile ) ;
    return 0 ;
}
void Manager::OutputNBestList(OutputCollector *collector,
                              const KBestExtractor::KBestVec &nBestList,
                              long translationId) const
{
  const StaticData &staticData = StaticData::Instance();

  const std::vector<FactorType> &outputFactorOrder =
    staticData.GetOutputFactorOrder();

  std::ostringstream out;

  if (collector->OutputIsCout()) {
    // Set precision only if we're writing the n-best list to cout.  This is to
    // preserve existing behaviour, but should probably be done either way.
    FixPrecision(out);
  }

  bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest();
  bool PrintNBestTrees = staticData.PrintNBestTrees();

  for (KBestExtractor::KBestVec::const_iterator p = nBestList.begin();
      p != nBestList.end(); ++p) {
    const KBestExtractor::Derivation &derivation = **p;

    // get the derivation's target-side yield
    Phrase outputPhrase = KBestExtractor::GetOutputPhrase(derivation);

    // delete <s> and </s>
    UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
        "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
    outputPhrase.RemoveWord(0);
    outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);

    // print the translation ID, surface factors, and scores
    out << translationId << " ||| ";
    OutputSurface(out, outputPhrase, outputFactorOrder, false);
    out << " ||| ";
    OutputAllFeatureScores(derivation.scoreBreakdown, out);
    out << " ||| " << derivation.score;

    // optionally, print word alignments
    if (includeWordAlignment) {
      out << " ||| ";
      Alignments align;
      OutputAlignmentNBest(align, derivation, 0);
      for (Alignments::const_iterator q = align.begin(); q != align.end();
          ++q) {
        out << q->first << "-" << q->second << " ";
      }
    }

    // optionally, print tree
    if (PrintNBestTrees) {
      TreePointer tree = KBestExtractor::GetOutputTree(derivation);
      out << " ||| " << tree->GetString();
    }

    out << std::endl;
  }

  assert(collector);
  collector->Write(translationId, out.str());
}
std::size_t Manager::OutputAlignmentNBest(
    Alignments &retAlign,
    const KBestExtractor::Derivation &derivation,
    std::size_t startTarget) const
{
  const SHyperedge &shyperedge = derivation.edge->shyperedge;

  std::size_t totalTargetSize = 0;
  std::size_t startSource = shyperedge.head->pvertex->span.GetStartPos();

  const TargetPhrase &tp = *(shyperedge.translation);

  std::size_t thisSourceSize = CalcSourceSize(derivation);

  // position of each terminal word in translation rule, irrespective of
  // alignment if non-term, number is undefined
  std::vector<std::size_t> sourceOffsets(thisSourceSize, 0);
  std::vector<std::size_t> targetOffsets(tp.GetSize(), 0);

  const AlignmentInfo &aiNonTerm = shyperedge.translation->GetAlignNonTerm();
  std::vector<std::size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
  const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd =
      aiNonTerm.GetNonTermIndexMap();

  UTIL_THROW_IF2(sourceInd2pos.size() != derivation.subderivations.size(),
      "Error");

  std::size_t targetInd = 0;
  for (std::size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
    if (tp.GetWord(targetPos).IsNonTerminal()) {
      UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
      std::size_t sourceInd = targetPos2SourceInd[targetPos];
      std::size_t sourcePos = sourceInd2pos[sourceInd];

      const KBestExtractor::Derivation &subderivation =
        *derivation.subderivations[sourceInd];

      // calc source size
      std::size_t sourceSize =
        subderivation.edge->head->svertex.pvertex->span.GetNumWordsCovered();
      sourceOffsets[sourcePos] = sourceSize;

      // calc target size.
      // Recursively look thru child hypos
      std::size_t currStartTarget = startTarget + totalTargetSize;
      std::size_t targetSize = OutputAlignmentNBest(retAlign, subderivation,
          currStartTarget);
      targetOffsets[targetPos] = targetSize;

      totalTargetSize += targetSize;
      ++targetInd;
    } else {
      ++totalTargetSize;
    }
  }

  // convert position within translation rule to absolute position within
  // source sentence / output sentence
  ShiftOffsets(sourceOffsets, startSource);
  ShiftOffsets(targetOffsets, startTarget);

  // get alignments from this hypo
  const AlignmentInfo &aiTerm = shyperedge.translation->GetAlignTerm();

  // add to output arg, offsetting by source & target
  AlignmentInfo::const_iterator iter;
  for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
    const std::pair<std::size_t, std::size_t> &align = *iter;
    std::size_t relSource = align.first;
    std::size_t relTarget = align.second;
    std::size_t absSource = sourceOffsets[relSource];
    std::size_t absTarget = targetOffsets[relTarget];

    std::pair<std::size_t, std::size_t> alignPoint(absSource, absTarget);
    std::pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
    UTIL_THROW_IF2(!ret.second, "Error");
  }

  return totalTargetSize;
}
Example #7
0
int main( int argc, char *argv[] )
{
	int i ;
	int ret ;

	Alignments alignments ;
	Alignments clippedAlignments ;
	Blocks blocks ;
	Genome genome ;
	char *genomeFile = NULL ;
	
	if ( argc < 2 )
	{
		printf( "%s", usage ) ;
		exit( 0 ) ;
	}

	minimumSupport = 2 ;
	minimumEffectiveLength = 200 ;
	kmerSize = 23 ;
	breakN = 1 ;
	minContigSize = 200 ;
	prefix = NULL ;
	VERBOSE = false ;
	outputConnectionSequence = false ;
	aggressiveMode = false ;

	for ( i = 1 ; i < argc ; ++i )
	{
		if ( !strcmp( "-b", argv[i] ) )
		{
			alignments.Open( argv[i + 1]) ;
			++i ;
		}
		else if ( !strcmp( "-o", argv[i] ) )
		{
			prefix = argv[i + 1] ;
			++i ;
		}
		else if ( !strcmp( "-f", argv[i] ) )
		{
			genomeFile = argv[i + 1] ;
			++i ;
		}
		else if ( !strcmp( "-ms", argv[i] ) )
		{
			minimumSupport = atoi( argv[i + 1] ) ;
			++i ;
		}
		else if ( !strcmp( "-ml", argv[i] ) )
		{
			minimumEffectiveLength = atoi( argv[i + 1] ) ;
			++i ;
		}
		else if ( !strcmp( "-k", argv[i] ) )
		{
			kmerSize = atoi( argv[i + 1] ) ;
			++i ;
		}
		else if ( !strcmp( "-breakN", argv[i] ) )
		{
			breakN = atoi( argv[i + 1] ) ;
			++i ;
		}
		else if ( !strcmp( "-minContigSize", argv[i] ) )
		{
			minContigSize = atoi( argv[i + 1] ) ;
			++i ;
		}
		else if ( !strcmp( "-v", argv[i] ) )
		{
			VERBOSE = true ;
		}
		else if ( !strcmp( "-cs", argv[i] ) )
		{
			outputConnectionSequence = true ;
		}
		/*else if ( !strcmp( "-aggressive", argv[i] ) )
		{
			aggressiveMode = true ;
		}*/
		else if ( !strcmp( "-bc", argv[i] ) )
		{
			// So far, assume the input is from BWA mem
			clippedAlignments.Open( argv[i + 1] ) ;
			clippedAlignments.SetAllowSupplementary( true ) ;
			++i ;
		}
		else
		{
			fprintf( stderr, "Unknown parameter: %s\n", argv[i] ) ;
			exit( 1 ) ;
		}
	}

	if ( !alignments.IsOpened() )
	{
		printf( "Must use -b to specify the bam file." ) ;
		return 0 ;
	}

	if ( prefix != NULL )
	{
		char buffer[255] ;
		sprintf( buffer, "%s.out", prefix ) ;
		fpOut = fopen( buffer, "w" ) ;
	}
	else
	{
		char buffer[255] ;
		prefix = strdup( "rascaf" ) ;
		sprintf( buffer, "%s.out", prefix ) ;
		fpOut = fopen( buffer, "w" ) ;
	}
	
	if ( genomeFile != NULL )
	{
		genome.Open( alignments, genomeFile ) ;
		alignments.Rewind() ;
	}

	if ( outputConnectionSequence == true && genomeFile == NULL )
	{
		fprintf( stderr, "Must use -f to specify assembly file when using -cs\n" ) ;	
		exit( EXIT_FAILURE ) ;
	}
	// 74619
	//printf( "%c\n", genome.GetNucleotide( 74619, 4 ) ) ;
	//exit(0) ;
	// Build the graph
	ret = blocks.BuildExonBlocks( alignments, genome ) ;
	alignments.Rewind() ;
	fprintf( stderr, "Found %d exon blocks.\n", ret ) ;
	if ( clippedAlignments.IsOpened() )
	{
		fprintf( stderr, "Extend exon blocks with clipped alignments.\n" ) ;
		Blocks extendBlocks ;
		extendBlocks.BuildExonBlocks( clippedAlignments, genome ) ;
		clippedAlignments.Rewind() ;

		ret = blocks.ExtendExonBlocks( extendBlocks ) ;
		fprintf( stderr, "Found %d exon blocks after extension.\n", ret ) ;
	}

	blocks.GetAlignmentsInfo( alignments ) ;
	alignments.Rewind() ;

	ret = blocks.BuildGeneBlocks( alignments, genome ) ;
	alignments.Rewind() ;
	fprintf( stderr, "Found %d gene blocks.\n", ret ) ;
	
	blocks.BuildGeneBlockGraph( alignments ) ;
	if ( clippedAlignments.IsOpened() )
	{
		blocks.AddGeneBlockGraphByClippedAlignments( clippedAlignments ) ; 
	}
	
	// Cleaning
	blocks.CleanGeneBlockGraph( alignments, genome ) ;

	// Scaffolding
	Scaffold scaffold( blocks, genome ) ;
	//scaffold.Init( blocks ) ;
	int componentCnt = scaffold.BuildComponent() ;
	fprintf( stderr, "Found %d non-trivial gene block components.\n", componentCnt ) ;
	// Possible for parallelization
	for ( i = 0 ; i < componentCnt ; ++i )
	{
		scaffold.ScaffoldComponent( i ) ;
	}
	
	scaffold.ScaffoldGenome() ;
	
	// Output the command line
	fprintf( fpOut, "command line: " ) ;
	char *fullpath = (char *)malloc( sizeof( char ) * 4096 ) ;
	for ( i = 0 ; i < argc ; ++i )
	{
		char c = ' ' ;
		if ( i == argc - 1 )
			c = '\n' ;
		if ( i > 0 && !strcmp( argv[i - 1], "-b" ) )
		{
			if ( realpath( argv[i], fullpath ) == NULL )
			{
				fprintf( stderr, "Failed to resolve the path of file %s.\n", argv[i] ) ;
				exit( 1 ) ;
			}
			fprintf( fpOut, "%s%c", fullpath, c ) ;
		}
		else if ( i > 0 && !strcmp( argv[i - 1], "-f" ) )
		{
			if ( realpath( argv[i], fullpath ) == NULL )
			{
				fprintf( stderr, "Failed to resolve the path of file %s.\n", argv[i] ) ;
				exit( 1 ) ;
			}
			fprintf( fpOut, "%s%c", fullpath, c ) ;
		}
		else
			fprintf( fpOut, "%s%c", argv[i], c ) ;
	}
	free( fullpath ) ;
	scaffold.Output( fpOut, alignments ) ;
	return 0 ;
}
Example #8
0
int main( int argc, char *argv[] )
{
	int i, j, k ;	
	FILE *fp ;
	char buffer[10100] ;
	std::vector<std::string> fields ;
	int binSize = 50 ;
	const char *backboneName = NULL ;

	// the compatilibity between the alignment and allele
	// The likelihood of this read from this allele
	struct _compatible **compatibility ; 
	int **snpAllele ; // whether a snp showed up in the allele.

	Alignments alignments ;
	alignments.Open( argv[2] ) ;

	for ( i = 3 ; i < argc ; ++i )
	{
		if ( !strcmp( argv[i], "-b" ) )
		{
			backboneName = argv[i + 1] ;
			alignments.OnlyChrom( backboneName ) ;
			++i ;
		}
		else
		{
			fprintf( stderr, "Unknown argument %s.\n", argv[i] ) ;
			exit( 1 ) ;
		}
	}

	// Parse the files associate with the snps
	// Firstly, read in the snp list have the information
	sprintf( buffer, "%s.snp", argv[1] ) ;
	fp = fopen( buffer, "r" ) ;	
	k = 0 ;
	while ( fgets( buffer, sizeof( buffer ), fp ) )
	{
		Split( buffer, '\t', fields ) ;
		if ( backboneName && strcmp( fields[2].c_str(), backboneName ) )
			continue ;
		snpNameToId[ fields[0] ] = k ;
		struct _snpInfo info ;
		info.type = fields[1][0] ;
		if ( info.type == 'd' )
		{
			info.position = atoi( fields[3].c_str() ) ;
			info.length = atoi( fields[4].c_str() ) ;
		}
		else if ( info.type == 'i' )
		{
			info.position = atoi( fields[3].c_str() ) ;
			info.length = strlen( fields[4].c_str() ) ;
		}
		else
		{
			info.position = atoi( fields[3].c_str()	) ; // notice that the snp file is 0-based index.
			info.length = 1 ;
			info.nucleotide = fields[4][0] ;
		}
		snpInfo.push_back( info ) ;
		std::vector< int > tmpList ;
		snpLink.push_back( tmpList ) ;
		for ( int p = 0 ; p < info.length ; ++p )
		{
			if ( info.type != 'i' || p == 0 )
			{
				if ( positionToSnp.find( info.position + p ) == positionToSnp.end() )
				{
					positionToSnp[ info.position + p] = tmpList ;
				}
				positionToSnp[ info.position + p].push_back( k ) ;
			}
		}
		++k ;
	} 
	fclose( fp ) ;
	// Read in the link file. Determine the id of alleles and the association
	// of alleles and snps.
	// TODO: obtain the length of each allele and take the length into account in the statistical model
	// Add the id for the backbound
	int backboneLength = 0 ;
	sprintf( buffer, "%s_backbone.fa", argv[1] ) ;
	fp = fopen( buffer, "r" ) ;
	/*for ( i = 1 ; buffer[i] && buffer[i] != ' ' && buffer[i] != '\n' ; ++i )
		;
	buffer[i] = '\0' ;
	std::string backboneName( buffer + 1 ) ;
	alleleNameToId[ backboneName ] = 0 ;
	alleleIdToName.push_back( backboneName ) ;*/
	bool start = false ;
	while ( fgets( buffer, sizeof( buffer ), fp ) )
	{
		if ( buffer[0] == '>' )
		{
			for ( i = 1 ; buffer[i] && buffer[i] != ' ' && buffer[i] != '\n' ; ++i )
				;
			buffer[i] = '\0' ;
			if ( !strcmp( backboneName, buffer + 1 ) )
			{
				start = true ;
			}
			else if ( start )
				break ;
		}
		if ( start && buffer[0] != '>' )
		{
			int len = strlen( buffer ) ;
			if ( buffer[len - 1 ] == '\n' )
				backboneLength += len - 1 ;
			else
				backboneLength += len ;
		}
	}
	fclose( fp ) ;
	/*k = 0 ;
	if ( k == 0 )
	{
		std::vector<int> tmpList ;
		alleleSnpList.push_back( tmpList ) ;
		alleleLength.push_back( backboneLength ) ;
	}*/


	// scanning the link file
	sprintf( buffer, "%s.link", argv[1] ) ;
	fp = fopen( buffer, "r" ) ;
	k = 0 ; 
	while ( fgets( buffer, sizeof( buffer ), fp ) )
	{
		std::vector<std::string> tmpFields ;
		Split( buffer, '\t', tmpFields ) ;
		// skip the snps from other backbones
		if ( snpNameToId.find( tmpFields[0] ) == snpNameToId.end() )
			continue ;

		int snpId = snpNameToId[ tmpFields[0] ] ;
		Split( tmpFields[1].c_str(), ' ', fields ) ;
		int size = fields.size() ;
	
		for ( i = 0 ; i < size ; ++i )
		{
			if ( alleleNameToId.find( fields[i] ) == alleleNameToId.end() )
			{
				//printf( "%s %d\n", fields[i].c_str(), k ) ;
				alleleNameToId[ fields[i] ] = k ;
				alleleIdToName.push_back( fields[i] ) ;
				std::vector<int> tmpList ;
				alleleSnpList.push_back( tmpList ) ;
				alleleLength.push_back( backboneLength ) ;
				++k ;
			}

			int alleleId = alleleNameToId[ fields[i] ] ;
			//if ( snpId == 118 )
			//	printf( "%s: %s %d\n", tmpFields[0].c_str(), fields[i].c_str(), alleleId ) ;
			snpLink[ snpId ].push_back( alleleId ) ;
			alleleSnpList[ alleleId ].push_back( snpId ) ;
			if ( snpInfo[ snpId ].type == 'd' )
			{
				alleleLength[ alleleId ] -= snpInfo[ snpId ].length ;
			}
			else if ( snpInfo[ snpId ].type == 'i' )
			{
				alleleLength[ alleleId ] += snpInfo[ snpId ].length ;
			}
		}
	} 
	fclose( fp ) ;
	int numOfAllele = alleleIdToName.size() ;
	int numOfSnps = snpLink.size() ;
	snpAllele = new int* [numOfSnps] ;
	for ( i = 0 ; i < numOfSnps ; ++i )
	{
		snpAllele[i] = new int[numOfAllele] ;
		memset( snpAllele[i], 0, sizeof( int ) * numOfAllele ) ;
	}
	for ( i = 0 ; i < numOfSnps ; ++i )
	{
		int size = snpLink[i].size() ;
		for ( j = 0 ; j < size ; ++j )
		{
			snpAllele[i][ snpLink[i][j] ] = 1 ;
		}
	}

	// Compute the compatbility score for each alignment and the allele
 	// Get the number of alignment
	int numOfAlignments = 0 ;
	while ( alignments.Next() )
		++numOfAlignments ;
		
	alignments.Rewind() ;

	compatibility = new struct _compatible*[numOfAlignments] ;	
	for ( i = 0 ; i < numOfAlignments ; ++i )
	{
		compatibility[i] = new struct _compatible[ numOfAllele ] ;
		for ( j = 0 ; j < numOfAllele ; ++j )
		{
			compatibility[i][j].value = 0 ;//-log( (double)alleleLength[j] ) / log( 10.0 );
		}
	}

	i = 0 ; 
	bool *snpHit = new bool[ numOfSnps ] ;
	while ( alignments.Next() )
	{
		struct _pair coord = alignments.segments[0] ;
		alignmentCoords.push_back( coord ) ;
		memset( snpHit, 0, sizeof( bool ) * numOfSnps ) ;
		Split( alignments.GetFieldZ( "Zs" ), ',', fields ) ;
		int size = fields.size() ;
		for ( k = 0 ; k < size ; ++k )
		{
			std::vector<std::string> subfields ;
			Split( fields[k].c_str(), '|', subfields ) ;
			int snpId = snpNameToId[ subfields[2] ] ;	
			snpHit[ snpId ] = true ;
		}
		
		for ( k = coord.a ; k <= coord.b ; ++k )
		{
			int size = positionToSnp[k].size() ;
			for ( int l = 0 ; l < size ; ++l )
			{
				// if this SNP is hit. Then other allele don't have this snp 
				// will deduct its likelihood
				//TODO: the deduction can be based on the quality score of the read
				int tag = 0 ;
				int snpId = positionToSnp[k][l] ;

				if ( snpHit[ snpId ] )
				{
					tag = 0 ;
				}
				else
				{
				// if this SNP is not hit, then every allele containing this snp 
				// will deduct its likelihood
					tag = 1 ;
				}
				for ( j = 0 ; j < numOfAllele ; ++j )
				{
					if ( snpAllele[ snpId  ][j] == tag )
					{
						int v = -2 ;
						//if ( snpInfo[ snpId ].type == 'd' || snpInfo[ snpId ].type == 'i' )
						//	v = -4 * snpInfo[ snpId ].length ;	
						if ( snpInfo[ snpId ].type == 'd' && snpInfo[ snpId ].position < k 
							&& k != coord.a )
						{
							// The penality has already been subtracted.
							v = 0 ;
						}
						compatibility[i][j].value += v ;
						/*if ( i == 8 && j == 78 )
						{
							printf( "Bad snp %d: %d %d\n", tag, k, positionToSnp[k][l] ) ;
						}*/
					}
				}
			}
		}
		++i ;
	}
	//printf( "%d %d\n", numOfAlignments, numOfAllele ) ;
	// Now, let's consider every pair of alleles, and compute its log likelihood 
	double **logLikelihood ;
	logLikelihood = new double *[ numOfAllele] ;
	for ( j = 0 ; j < numOfAllele ; ++j )
	{
		logLikelihood[j] = new double[ numOfAllele ] ;
		//memset( logLikelihood[j], 0, sizeof( double ) * numOfAllele ) ;
		for ( k = 0 ; k < numOfAllele ; ++k )
			logLikelihood[j][k] = 0 ;
	}

	int prevBin = -1 ;
	double assignJBin = 0 ;
	double assignKBin = 0 ;
	for ( j = 0 ; j < numOfAllele ; ++j )
	{
		for ( k = j ; k < numOfAllele ; ++k )
		{
			double binAdjust = 0 ;
			double averageRead = ( (double)numOfAlignments ) / (double)( alleleLength[j] + alleleLength[k] ) * binSize ;
			for ( i = 0 ; i < numOfAlignments ; ++i )
			{
				double vj = compatibility[i][j].value ;
				double vk = compatibility[i][k].value ;
				double weightJ = 0, weightK = 0 ;
				if ( vj == vk )
				{
					weightJ = weightK = 0.5 ;
				}
				else if ( vj == vk + 2 )
				{
					if ( vj == 0 )
					{
						weightJ = 1 ;
					}
					else
					{
						weightJ = 0.99 ;
						weightK = 0.01 ;
					}
				}
				else if ( vk == vj + 2 )
				{
					if ( vk == 0 )
					{
						weightK = 1 ;
					}
					else
					{
						weightJ = 0.01 ;
						weightK = 0.99 ;
					}
				}
				else
				{
					if ( vk > vj )
						weightK = 1 ;
					else
						weightJ = 1 ;
				}
				
				double l = weightJ * compatibility[i][j].value + weightK * compatibility[i][k].value ;
				if ( alignmentCoords[i].a / binSize != prevBin )
				{
					if ( prevBin != -1 && 
						( assignJBin > averageRead + 4 * sqrt( averageRead ) 
							|| assignKBin > averageRead + 4 * sqrt( averageRead ) ) )
					{
						//if ( j == 8 && k == 78 )
						//	printf( "%lf: %lf %lf %d %d\n", averageRead, assignJBin, assignKBin, alleleLength[j], alleleLength[k] ) ;
						binAdjust -= 4 ;
					}
					prevBin = alignmentCoords[i].a / binSize ;
					assignJBin = 0 ;
					assignKBin = 0 ;
				}
				assignJBin += weightJ ;
				assignKBin += weightK ;
				
				/*if ( j == 8 && k == 78 && l < 0 )
				{
					printf( "Bad alignment %d (%s %s). %lf %lf: %lf\n", i,
						alleleIdToName[j].c_str(), alleleIdToName[k].c_str(),
						 compatibility[i][j].value, compatibility[i][k].value, l ) ;
				}*/
				logLikelihood[j][k] += l ;
			}
			logLikelihood[j][k] += ( -log( (double)alleleLength[j] ) / log(10.0 ) -
							log( (double)alleleLength[k] ) / log(10.0) ) ;
			logLikelihood[j][k] += binAdjust ;
		}
	}
	
	// Find the result
	double max ;
	int maxj = -1 ;
	int maxk = -1 ;
	std::vector< struct _result > results ;
	for ( j = 0 ; j < numOfAllele ; ++j )
	{
		for ( k = j ; k < numOfAllele ; ++k )
		{
			if ( maxj == -1 || logLikelihood[j][k] > max )
			{
				maxj = j ;
				maxk = k ;
				max = logLikelihood[j][k] ;
			}
			struct _result r ;
			r.a = j ;
			r.b = k ;
			r.logLikelihood = logLikelihood[j][k] ;
			results.push_back( r ) ;
		}
	}

	//printf( "%s %s %lf\n", alleleIdToName[ maxj ].c_str(), alleleIdToName[ maxk ].c_str(), max) ;
	//printf( "%lf\n", logLikelihood[124][128] ) ;	
	
	if ( results.size() == 0 )
	{
		printf( "-1 -1 -1\n" ) ;
		exit( 1 ) ;
	}
	std::sort( results.begin(), results.end(), CompResult ) ;	
	i = 0 ;
	printf( "%s %s %lf\n", alleleIdToName[ results[i].a ].c_str(), alleleIdToName[ results[i].b ].c_str(), 
			results[i].logLikelihood ) ;
	k = results.size() ;
	for ( i = 1 ; i < k ; ++i )
	{
		if ( results[i].logLikelihood != results[0].logLikelihood )
			break ;
		printf( "%s %s %lf\n", alleleIdToName[ results[i].a ].c_str(), alleleIdToName[ results[i].b ].c_str(), 
			results[i].logLikelihood ) ;
	}
	return 0 ;
}