Example #1
extern "C" void *vmd_mpi_parallel_for_scheduler(void *voidparms) {
  parallel_for_parms *parfor = (parallel_for_parms *) voidparms;

  // Run the for loop management code on node zero.
  // Do the work on all the other nodes...
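  //
  // Protocol: each worker node sends its own rank in a message tagged
  // VMD_MPI_TAG_FOR_REQUEST; the scheduler replies with the next loop
  // index to execute, or with -1 once all work has been handed out.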
#if defined(VMDTHREADS)
  int i;
  wkf_tasktile_t curtile;
  while (wkf_shared_iterator_next_tile(&parfor->iter, 1, &curtile) != WKF_SCHED_DONE) {
    i = curtile.start;
#else
  int i;
  for (i=parfor->loop.start; i<parfor->loop.end; i++) {
#endif
    int reqnode;
    MPI_Status rcvstat;
    MPI_Recv(&reqnode, 1, MPI_INT, MPI_ANY_SOURCE, VMD_MPI_TAG_FOR_REQUEST, 
             MPI_COMM_WORLD, &rcvstat); 
    MPI_Send(&i, 1, MPI_INT, reqnode, VMD_MPI_TAG_FOR_REQUEST, 
             MPI_COMM_WORLD);
  }

  // tell all nodes we're done with all of the work
  int node;
  for (node=1; node<parfor->numnodes; node++) {
    int reqnode;
    MPI_Status rcvstat;
    MPI_Recv(&reqnode, 1, MPI_INT, MPI_ANY_SOURCE, VMD_MPI_TAG_FOR_REQUEST, 
             MPI_COMM_WORLD, &rcvstat); 

    i=-1; // indicate that the for loop is completed
    MPI_Send(&i, 1, MPI_INT, reqnode, VMD_MPI_TAG_FOR_REQUEST, 
             MPI_COMM_WORLD);
  }

  return NULL;
}

#endif



int text_cmd_parallel(ClientData cd, Tcl_Interp *interp, int argc, const char *argv[]) {
  VMDApp *app = (VMDApp *)cd;

  if(argc<2) {
    Tcl_SetResult(interp,
      (char *)
      "Parallel job query commands:\n"
      "  parallel nodename\n"
      "  parallel noderank\n"
      "  parallel nodecount\n"
      "Parallel collective operations (all nodes MUST participate):\n"
      "  parallel allgather <object>\n"
      "  parallel allreduce <tcl reduction proc> <object>\n"
      "  parallel barrier\n"
      "  parallel for <startcount> <endcount> <tcl callback proc> <user data>",
      TCL_STATIC);
    return TCL_ERROR;
  }

  // XXX hack to make Swift/T cooperate with VMD when using VMD's MPI
  // communicator
  if (!strcmp(argv[1], "swift_clone_communicator")) {
    swift_mpi_init(interp);
    return TCL_OK;
  }

  // return the MPI node name
  if (!strcmp(argv[1], "nodename")) {
    Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
    Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj(app->par_name(), strlen(app->par_name())));
    Tcl_SetObjResult(interp, tcl_result);
    return TCL_OK;
  }

  // return the MPI node rank
  if (!strcmp(argv[1], "noderank")) {
    Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
    Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewIntObj(app->par_rank()));
    Tcl_SetObjResult(interp, tcl_result);
    return TCL_OK;
  }

  // return the MPI node count
  if (!strcmp(argv[1], "nodecount")) {
    Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
    Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewIntObj(app->par_size()));
    Tcl_SetObjResult(interp, tcl_result);
    return TCL_OK;
  }

  // execute an MPI barrier
  if(!strupncmp(argv[1], "barrier", CMDLEN) && argc==2) {
    app->par_barrier();
    return TCL_OK;
  }

  // Execute a parallel for loop across all nodes
  //
  //  parallel for <startcount> <endcount> <callback proc> <user data>",
  //
  if(!strupncmp(argv[1], "for", CMDLEN)) {
    int isok = (argc == 6);
    int N = app->par_size();
    int start, end;

    if (argc < 4 ||
        Tcl_GetInt(interp, argv[2], &start) != TCL_OK ||
        Tcl_GetInt(interp, argv[3], &end) != TCL_OK) {
      isok = 0;
    }

    //
    // If there's only one node, short-circuit the parallel for
    //
    if (N == 1) {
      if (!isok) {
        Tcl_SetResult(interp, (char *) "invalid parallel for, missing parameter", TCL_STATIC);
        return TCL_ERROR;
      }

      // run for loop on one node...
      int i;
      for (i=start; i<=end; i++) { 
        char istr[128];
        sprintf(istr, "%d", i);
        if (Tcl_VarEval(interp, argv[4], " ", istr, " {",
                        argv[5], "} ", NULL) != TCL_OK) {
          Tcl_SetResult(interp, (char *) "error occurred during parallel for", TCL_STATIC);
        }
      }

      return TCL_OK;
    }

#if defined(VMDMPI)
    int allok = 0;

    // Check all node result codes before we continue with the reduction
    MPI_Allreduce(&isok, &allok, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);

    // XXX we may want to verify that all nodes are going to call the same
    // reduction proc here before continuing further.

    if (!allok) {
      Tcl_SetResult(interp, (char *) "invalid parallel for, missing parameter on one or more nodes", TCL_STATIC);
      return TCL_ERROR;
    }

    // Run the for loop management code on node zero.
    // Do the work on all the other nodes...
    int i;
    if (app->par_rank() == 0) {
      // use multithreaded code path
      parallel_for_parms parfor;
      memset(&parfor, 0, sizeof(parfor));
      parfor.numnodes = N;
      parfor.loop.start=start;
      parfor.loop.end=end+1;
      wkf_shared_iterator_init(&parfor.iter);
      wkf_shared_iterator_set(&parfor.iter, &parfor.loop);
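
      // The shared iterator hands out tiles of one loop index at a time.
      // With VMDTHREADS, the Tcl loop below and the MPI scheduler thread
      // draw indices from this same iterator, so node zero does its share
      // of the work while servicing requests from the other nodes.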

#if defined(VMDTHREADS)
      // run the MPI scheduler in a new child thread
      wkf_thread_t pft;
      wkf_thread_create(&pft, vmd_mpi_parallel_for_scheduler, &parfor);

      // run the Tcl callbacks in the main thread
      wkf_tasktile_t curtile;
      while (wkf_shared_iterator_next_tile(&parfor.iter, 1, &curtile) != WKF_SCHED_DONE) {
        i = curtile.start;
        char istr[128];
        sprintf(istr, "%d", i);
        if (Tcl_VarEval(interp, argv[4], " ", istr, " {",
                        argv[5], "} ", NULL) != TCL_OK) {
          Tcl_SetResult(interp, (char *) "error occurred during parallel for", TCL_STATIC);
        }
      }

      // join up with the MPI scheduler thread
      wkf_thread_join(pft, NULL);
#else
      // if no threads, node zero only runs the scheduler and doesn't do work
      vmd_mpi_parallel_for_scheduler(&parfor);
#endif

      wkf_shared_iterator_destroy(&parfor.iter);
    } else {
      char istr[128];
      int done=0;
      int mynode=app->par_rank();
      while (!done) {
        MPI_Send(&mynode, 1, MPI_INT, 0, VMD_MPI_TAG_FOR_REQUEST, 
                 MPI_COMM_WORLD);

        MPI_Status rcvstat;
        MPI_Recv(&i, 1, MPI_INT, MPI_ANY_SOURCE, VMD_MPI_TAG_FOR_REQUEST, 
                 MPI_COMM_WORLD, &rcvstat); 
        if (i == -1) {
          done = 1;
        } else {
          sprintf(istr, "%d", i);
          if (Tcl_VarEval(interp, argv[4], " ", istr, " {",
                          argv[5], "} ", NULL) != TCL_OK) {
            Tcl_SetResult(interp, (char *) "error occurred during parallel for", TCL_STATIC);
          }
        }
      }
    }
#endif

    return TCL_OK;
  }

  // Execute an allgather producing a Tcl list of the per-node contributions
  //
  // parallel allgather <object>
  //
  if(!strupncmp(argv[1], "allgather", CMDLEN)) {
    int isok = (argc == 3);

#if defined(VMDMPI)
    int allok = 0;
    int i;

    // Check all node result codes before we continue with the gather
    MPI_Allreduce(&isok, &allok, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);

    if (!allok) {
      Tcl_SetResult(interp, (char *) "invalid parallel gather, missing parameter on one or more nodes", TCL_STATIC);
      return TCL_ERROR;
    }

    // Collect parameter size data so we can allocate result buffers
    // before executing the gather
    int *szlist = new int[app->par_size()];
    szlist[app->par_rank()] = strlen(argv[2])+1;

#if defined(USE_MPI_IN_PLACE)
    // MPI >= 2.x implementations (e.g. NCSA/Cray Blue Waters)
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT,
                  &szlist[0], 1, MPI_INT, MPI_COMM_WORLD);
#else
    // MPI 1.x
    MPI_Allgather(&szlist[app->par_rank()], 1, MPI_INT,
                  &szlist[0], 1, MPI_INT, MPI_COMM_WORLD);
#endif

    int totalsz = 0;
    int *displist = new int[app->par_size()];
    for (i=0; i<app->par_size(); i++) {
      displist[i]=totalsz;
      totalsz+=szlist[i];
    }

    char *recvbuf = new char[totalsz];
    memset(recvbuf, 0, totalsz);

    // Copy this node's data into the correct array position
    strcpy(&recvbuf[displist[app->par_rank()]], argv[2]);

    // Perform the parallel gather 
#if defined(USE_MPI_IN_PLACE)
    // MPI >= 2.x implementations (e.g. NCSA/Cray Blue Waters)
    MPI_Allgatherv(MPI_IN_PLACE, szlist[app->par_rank()], MPI_BYTE,
                   &recvbuf[0], szlist, displist, MPI_BYTE, MPI_COMM_WORLD);
#else
    // MPI 1.x
    MPI_Allgatherv(&recvbuf[displist[app->par_rank()]], szlist[app->par_rank()], MPI_BYTE,
                   &recvbuf[0], szlist, displist, MPI_BYTE, MPI_COMM_WORLD);
#endif

    // Build Tcl result from the array of results
    Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
    for (i=0; i<app->par_size(); i++) {
      Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj(&recvbuf[displist[i]], szlist[i]-1));
    }
    Tcl_SetObjResult(interp, tcl_result);

    delete [] recvbuf;
    delete [] displist;
    delete [] szlist;
    return TCL_OK;
#else
    if (!isok) {
      Tcl_SetResult(interp, (char *) "invalid parallel gather, missing parameter", TCL_STATIC);
      return TCL_ERROR;
    }

    Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
    Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj(argv[2], strlen(argv[2])));
    Tcl_SetObjResult(interp, tcl_result);
    return TCL_OK;
#endif
  }


  //
  // Execute an All-Reduce across all of the nodes.
  // The user must provide a Tcl proc that performs the appropriate reduction
  // operation for a pair of data items, resulting in a single item.
  // Since the user may pass floating point data or perform reductions 
  // that give very slightly different answers depending on the order of
  // operations, the architecture or the host, or whether reductions on 
  // a given host are occurring on the CPU or on a heterogeneous accelerator
  // or GPU of some kind, we must ensure that all nodes get a bit-identical
  // result.  When heterogeneous accelerators are involved, we can really 
  // only guarantee this by implementing the All-Reduce with a 
  // Reduce-then-Broadcast approach, where the reduction collapses the
  // result down to node zero, which then does a broadcast to all peers. 
  //
  // parallel allreduce <tcl reduction proc> <object>
  //
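  // For illustration (hypothetical proc and variable names, not part of
  // this file), a reduction proc takes two data items and returns one:
  //
  //   proc sumreduce { a b } { return [expr {$a + $b}] }
  //   parallel allreduce sumreduce $myvalue
  //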
  if(!strupncmp(argv[1], "allreduce", CMDLEN)) {
    int isok = (argc == 4);
    int N = app->par_size();

    //
    // If there's only one node, short-circuit the full parallel reduction
    //
    if (N == 1) {
      if (!isok) {
        Tcl_SetResult(interp, (char *) "invalid parallel reduction, missing parameter", TCL_STATIC);
        return TCL_ERROR;
      }

      // return our result, no other reduction is necessary
      Tcl_SetObjResult(interp, Tcl_NewStringObj(argv[3], strlen(argv[3])));
      return TCL_OK;
    }

#if 1 && defined(VMDMPI)
    //
    // All-Reduce implementation based on a ring reduction followed by a 
    // broadcast from node zero.  This implementation guarantees strict
    // ordering and will properly handle the case where one or more nodes
    // perform their reduction with slightly differing floating point 
    // rounding than others (e.g. using GPUs, heterogeneous nodes, etc),
    // and it works with any number of nodes.  While NOT latency-optimal,
    // this implementation is close to bandwidth-optimal which is helpful
    // for workstation clusters on non-switched networks or networks with
    // switches that cannot operate in a fully non-blocking manner.
    //
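    // Worked example for N=4: node 0 first sends its item to node 3;
    // node 3 reduces and forwards to node 2, node 2 to node 1, and
    // node 1 back to node 0, which applies the final reduction and
    // then broadcasts the bit-identical result to all peers.
    //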
    int allok = 0;

    // Check all node result codes before we continue with the reduction
    MPI_Allreduce(&isok, &allok, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);

    // XXX we may want to verify that all nodes are going to call the same
    // reduction proc here before continuing further.

    if (!allok) {
      Tcl_SetResult(interp, (char *) "invalid parallel reduction, missing parameter on one or more nodes", TCL_STATIC);
      return TCL_ERROR;
    }

    // copy incoming data into initial "result" object
    Tcl_Obj *resultobj = Tcl_NewStringObj((const char *) argv[3], strlen(argv[3]));

    // A ring-based all-reduce implementation which should be 
    // close to bandwidth-optimal, at the cost of additional latency.
    int src=app->par_rank();       // src node is this node
    int Ldest = (N + src + 1) % N; // left peer, from which we receive
    int Rdest = (N + src - 1) % N; // right peer, to which we send
    MPI_Status status;

    if (src != 0) {
      int recvsz = 0;

      // Post blocking receive for data size
      MPI_Recv(&recvsz, 1, MPI_INT, Ldest,
               VMD_MPI_TAG_ALLREDUCE_ARGLENGTH, MPI_COMM_WORLD, &status);

      // Allocate receive buffer
      char * recvbuf = (char *) malloc(recvsz);

      // Post blocking receive for data payload
      MPI_Recv(recvbuf, recvsz, MPI_BYTE, Ldest,
               VMD_MPI_TAG_ALLREDUCE_PAYLOAD, MPI_COMM_WORLD, &status);

      // Perform the reduction operation on our existing and incoming data.
      // We build a Tcl command string with the user-defined proc, this
      // node's previous result, and the incoming data, and evaluate it.
      if (Tcl_VarEval(interp, argv[2], " ", 
                      Tcl_GetString(resultobj), " ", 
                      recvbuf, NULL) != TCL_OK) {
        printf("Error occurred during reduction!\n");
      }

      // Prep for next reduction step.  Set result object to result of
      // the latest communication/reduction phase.
      resultobj = Tcl_GetObjResult(interp);

      // Free the receive buffer
      free(recvbuf);
    } 
  
    //
    // All nodes
    //  
    char *sendbuf = Tcl_GetString(resultobj);
    int sendsz = strlen(sendbuf)+1;

    // Post blocking send for data size
    MPI_Send(&sendsz, 1, MPI_INT, Rdest,
             VMD_MPI_TAG_ALLREDUCE_ARGLENGTH, MPI_COMM_WORLD);

    // Post blocking send for data
    MPI_Send(sendbuf, sendsz, MPI_BYTE, Rdest,
             VMD_MPI_TAG_ALLREDUCE_PAYLOAD, MPI_COMM_WORLD);

    if (src == 0) {
      int recvsz = 0;

      // Post blocking receive for data size
      MPI_Recv(&recvsz, 1, MPI_INT, Ldest,
               VMD_MPI_TAG_ALLREDUCE_ARGLENGTH, MPI_COMM_WORLD, &status);

      // Allocate receive buffer
      char * recvbuf = (char *) malloc(recvsz);

      // Post blocking receive for data payload
      MPI_Recv(recvbuf, recvsz, MPI_BYTE, Ldest,
               VMD_MPI_TAG_ALLREDUCE_PAYLOAD, MPI_COMM_WORLD, &status);

      // Perform the reduction operation on our existing and incoming data.
      // We build a Tcl command string with the user-defined proc, this
      // node's previous result, and the incoming data, and evaluate it.
      if (Tcl_VarEval(interp, argv[2], " ", 
                      Tcl_GetString(resultobj), " ", 
                      recvbuf, NULL) != TCL_OK) {
        printf("Error occurred during reduction!\n");
      }

      // Set the result object to the outcome of the final
      // communication/reduction phase before the broadcast below.
      resultobj = Tcl_GetObjResult(interp);

      // Free the receive buffer
      free(recvbuf);
    } 

    //
    // Broadcast final result from root to peers
    //
    if (src == 0) {
      // update send buffer for root node before broadcast
      sendbuf = Tcl_GetString(resultobj);
      sendsz = strlen(sendbuf)+1;
      MPI_Bcast(&sendsz, 1, MPI_INT, 0, MPI_COMM_WORLD);
      MPI_Bcast(sendbuf, sendsz, MPI_BYTE, 0, MPI_COMM_WORLD);
    } else {
      int recvsz = 0;
      MPI_Bcast(&recvsz, 1, MPI_INT, 0, MPI_COMM_WORLD);

      // Allocate receive buffer
      char * recvbuf = (char *) malloc(recvsz);

      MPI_Bcast(recvbuf, recvsz, MPI_BYTE, 0, MPI_COMM_WORLD);

      // Set the final Tcl result from the broadcast payload
      Tcl_SetObjResult(interp, Tcl_NewStringObj(recvbuf, recvsz-1));

      // Free the receive buffer
      free(recvbuf);
    }

    return TCL_OK;

#elif defined(VMDMPI)

    //
    // Power-of-two-only hypercube/butterfly/recursive doubling 
    // All-Reduce implementation.  This implementation can't be used
    // in the case that we have either a non-power-of-two node count or
    // in the case where we have heterogeneous processing units that may
    // yield different floating point rounding.  For now we leave this
    // implementation in the code for performance comparisons until we work
    // out the changes necessary to make it closer to bandwidth-optimal,
    // heterogeneous-safe, and non-power-of-two capable.
    //
    int allok = 0;
    int i;

    // Check all node result codes before we continue with the reduction
    MPI_Allreduce(&isok, &allok, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);

    // XXX we may want to verify that all nodes are going to call the same
    // reduction proc here before continuing further.

    if (!allok) {
      Tcl_SetResult(interp, (char *) "invalid parallel reduction, missing parameter on one or more nodes", TCL_STATIC);
      return TCL_ERROR;
    }

    // Calculate number of reduction phases required
    int log2N;
    for (log2N=0; N>1; N>>=1) {
      log2N++;

      // XXX bail out if we don't have a power-of-two node count,
      //     at least until we implement 3-2 reduction phases
      if ((N & 1) && (N > 1)) {
        Tcl_SetResult(interp, (char *) "parallel allreduce requires a power-of-two node count", TCL_STATIC);
        return TCL_ERROR;
      }
    }
    N = app->par_size();

    // copy incoming data into initial "result" object
    Tcl_Obj *resultobj = Tcl_NewStringObj((const char *) argv[3], strlen(argv[3]));

    // An all-reduce tree with hypercube connectivity with 
    // log2(N) communication/reduction phases.  At each phase, we compute
    // the peer/destination node we will communicate with using an XOR of
    // our node ID with the current hypercube dimension.  If we have an
    // incomplete hypercube topology (e.g. non-power-of-two node count), 
    // we have to do special 3-2 communication rounds (not implemented yet).
    // The current implementation requires that all existing nodes 
    // participate, and that they contribute a valid data item.
    // If we wish to support reductions where a node may not contribute,
    // we would need to handle that similarly to a peer node that doesn't 
    // exist, but we would likely determine this during the parameter length
    // exchange step.
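    //
    // Worked example for N=8 (log2N=3): node 5 (binary 101) exchanges
    // with node 4 (100) in phase 0, with node 7 (111) in phase 1, and
    // with node 1 (001) in phase 2.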
    int src=app->par_rank(); // src node is this node
    for (i=0; i<log2N; i++) {
      int mask = 1 << i;     // generate bitmask to use in the XOR
      int dest = src ^ mask; // XOR src node with bitmask to find dest node
      Tcl_Obj *oldresultobj = resultobj; // track old result 

      // Check to make sure dest node exists for non-power-of-two 
      // node counts (an incomplete hypercube).  If not, skip to the next
      // communication/reduction phase.
      if (dest < N) {
        char *sendbuf = Tcl_GetString(oldresultobj);
        int sendsz = strlen(sendbuf)+1;
        int recvsz = 0;
        MPI_Request handle;
        MPI_Status status;

        //
        // Exchange required receive buffer size for data exchange with peer
        //

        // Post non-blocking receive for data size
        MPI_Irecv(&recvsz, 1, MPI_INT, dest,
                  VMD_MPI_TAG_ALLREDUCE_ARGLENGTH, MPI_COMM_WORLD, &handle);

        // Post blocking send for data size
        MPI_Send(&sendsz, 1, MPI_INT, dest,
                 VMD_MPI_TAG_ALLREDUCE_ARGLENGTH, MPI_COMM_WORLD);

        // Wait for non-blocking receive of data size to complete
        MPI_Wait(&handle, &status); 

// printf("src[%d], dest[%d], value '%s', recvsz: %d\n", src, dest, sendbuf, recvsz);

        // Allocate receive buffer
        char * recvbuf = (char *) malloc(recvsz);

        //
        // Exchange the data payload
        // 

        // Post non-blocking receive for data
        MPI_Irecv(recvbuf, recvsz, MPI_BYTE, dest,
                  VMD_MPI_TAG_ALLREDUCE_PAYLOAD, MPI_COMM_WORLD, &handle);
        
        // Post blocking send for data
        MPI_Send(sendbuf, sendsz, MPI_BYTE, dest,
                 VMD_MPI_TAG_ALLREDUCE_PAYLOAD, MPI_COMM_WORLD);

        // Wait for receive of data
        MPI_Wait(&handle, &status); 

        // Perform the reduction operation on our existing and incoming data.
        // We build a Tcl command string with the user-defined proc, this 
        // node's previous result and the incoming data, and evaluate it.
        if (Tcl_VarEval(interp, argv[2], " ", 
                        sendbuf, " ", recvbuf, NULL) != TCL_OK) {
          printf("Error occurred during reduction!\n");
        }

        // Free the receive buffer
        free(recvbuf);

        // Prep for next reduction step.  Set result object to result of
        // the latest communication/reduction phase.
        resultobj = Tcl_GetObjResult(interp);
      }
    }

    // Set the final Tcl result if necessary
    Tcl_SetObjResult(interp, resultobj);

    return TCL_OK;
#endif
  }

  // (additional subcommand handling and the end of text_cmd_parallel
  // are not shown in this excerpt)
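
Usage sketch for Example #1 (hypothetical proc and variable names, derived
from the built-in help text above): on an MPI-enabled build, every node
must issue the collective commands, e.g. from a script run on all nodes.

    proc work { i userdata } {
        puts "node [parallel noderank] of [parallel nodecount] got item $i"
    }
    parallel for 0 99 work {}
    parallel barrier
    set names [parallel allgather [parallel nodename]]
    proc maxreduce { a b } { return [expr {$a > $b ? $a : $b}] }
    set maxrank [parallel allreduce maxreduce [parallel noderank]]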
Example #2
// setup and launch bond search threads
int vmd_bondsearch_thr(const float *pos, const float *radii,
                       GridSearchPairlist * head, 
                       int totb, int **boxatom, 
                       int *numinbox, int **nbrlist, int maxpairs, 
                       float pairdist) {
  int i;
  bondsearchthrparms *parms;
  wkf_thread_t * threads;
  wkf_mutex_t pairlistmutex; ///< guards pairlist
  wkf_mutex_init(&pairlistmutex); // init mutex before use

  int numprocs = wkf_thread_numprocessors();

  /* allocate array of threads */
  threads = (wkf_thread_t *) calloc(numprocs, sizeof(wkf_thread_t));

  /* allocate and initialize array of thread parameters */
  parms = (bondsearchthrparms *) malloc(numprocs * sizeof(bondsearchthrparms));
  for (i=0; i<numprocs; i++) {
    parms[i].threadid = i;
    parms[i].threadcount = numprocs;
    parms[i].pairlistmutex = &pairlistmutex;
    parms[i].head = NULL;
    parms[i].pos = (float *) pos;
    parms[i].radii = (float *) radii;
    parms[i].totb = totb;
    parms[i].boxatom = boxatom;
    parms[i].numinbox = numinbox;
    parms[i].nbrlist = nbrlist;  
    parms[i].maxpairs = maxpairs;
    parms[i].pairdist = pairdist;
  }

#if defined(VMDTHREADS)
  /* spawn child threads to do the work */
  for (i=0; i<numprocs; i++) {
    wkf_thread_create(&threads[i], bondsearchthread, &parms[i]);
  }

  /* join the threads after work is done */
  for (i=0; i<numprocs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  bondsearchthread(&parms[0]); // single-threaded code
#endif

  // assemble final pairlist from sublists
  for (i=0; i<numprocs; i++) {
    if (parms[i].head != NULL) {
      GridSearchPairlist *tmp = head->next;
      head->next = parms[i].head;
      parms[i].head->next = tmp;
    }
  }

  wkf_mutex_destroy(&pairlistmutex); // destroy mutex when finished

  /* free thread parms */
  free(parms);
  free(threads);

  return 0;
}
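
The worker routine bondsearchthread is not included in this excerpt. Below
is a minimal sketch of the decomposition pattern the launcher implies
(hypothetical body; it relies only on the parameter fields shown above):
each thread walks a strided subset of the grid boxes and chains a
thread-private sublist onto parms->head, so pairlistmutex is needed only
for genuinely shared state.

    extern "C" void * bondsearchthread(void *voidparms) {
      bondsearchthrparms *parms = (bondsearchthrparms *) voidparms;

      // static round-robin decomposition: thread k handles boxes
      // k, k+threadcount, k+2*threadcount, ...
      int box;
      for (box = parms->threadid; box < parms->totb;
           box += parms->threadcount) {
        // ... test atom pairs in boxatom[box] against the neighbor boxes
        // in nbrlist[box], appending hits to a thread-private
        // GridSearchPairlist chained onto parms->head ...
      }

      return NULL;
    }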