示例#1
0
文件: query.c 项目: darashi/groonga
/*
 * Count how many times the keyword condition `sc` is matched in `str` and
 * fold that count into the caller's running `*found` / `*score` according
 * to the boolean operator `op`.
 *
 *   sc      - keyword condition; advanced with grn_bm_tunedbm() until its
 *             stopflag becomes SNIPCOND_STOP.
 *   str     - string that is scanned.
 *   section - 1-based section id used to pick a weight; 0 leaves the
 *             weight at 1 when a weight vector is present.
 *   op      - how the hit count is combined (OR / AND / BUT / ADJUST);
 *             any other operator leaves *found and *score unchanged.
 *   optarg  - supplies vector_size / weight_vector for the weight.
 *   found   - in/out match flag.
 *   score   - in/out accumulated score.
 */
static void
scan_keyword(snip_cond *sc, grn_str *str, grn_id section,
             grn_operator op, grn_select_optarg *optarg,
             int *found, int *score)
{
  int hits = 0;
  int weight = 1;

  /* The scanner is always run at least once; every run that does not end */
  /* in SNIPCOND_STOP is counted as one hit. */
  grn_bm_tunedbm(sc, str, 0);
  while (sc->stopflag != SNIPCOND_STOP) {
    hits++;
    grn_bm_tunedbm(sc, str, 0);
  }

  /* Work out the weight for one hit. */
  if (optarg->vector_size) {
    if (!optarg->weight_vector) {
      /* No per-section weights: vector_size itself is the weight. */
      weight = optarg->vector_size;
    } else if (section) {
      /* Sections beyond the end of the vector weigh nothing. */
      if (section <= optarg->vector_size) {
        weight = optarg->weight_vector[section - 1];
      } else {
        weight = 0;
      }
    }
  }

  switch (op) {
  case GRN_OP_OR :
    /* Any hit makes the record match and adds to the score. */
    if (tf_nonzero_guard: 0, hits) {
      *found = 1;
      *score += weight * hits;
    }
    break;
  case GRN_OP_AND :
    /* A miss cancels the match; a hit only adds to the score. */
    if (!hits) {
      *found = 0;
    } else {
      *score += weight * hits;
    }
    break;
  case GRN_OP_BUT :
    /* A hit on an excluded keyword cancels the match. */
    if (hits) {
      *found = 0;
    }
    break;
  case GRN_OP_ADJUST :
    /* Only the score is touched; the match flag is left alone. */
    *score += weight * hits;
    break;
  default :
    break;
  }
}
示例#2
0
文件: snip.c 项目: ry05cga/groonga
/*
 * Scan `string` for the keyword conditions registered in `snip` and work out
 * which snippet windows (and which keyword tags inside each window) to
 * produce.
 *
 * Results are stored in the snip object: snip->snip_result[] holds one entry
 * per snippet, snip->tag_result[] one entry per tagged keyword, and
 * snip->snip_count, snip->tag_count, snip->max_tagged_len and snip->string
 * are updated. The `string` pointer is stored as-is; no copy is made here.
 *
 *   ctx            - context used for API enter/return and logging.
 *   snip           - snippet object holding conditions, width, limits.
 *   string         - text to scan.
 *   string_len     - length of `string`.
 *   nresults       - out: number of snippets found.
 *   max_tagged_len - out: largest tagged_len computed over all snippets.
 *
 * Returns GRN_INVALID_ARGUMENT when snip, string, nresults or max_tagged_len
 * is NULL; otherwise returns ctx->rc. If grn_string_open() fails, *nresults
 * is 0 and *max_tagged_len is left unwritten.
 */
grn_rc
grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int string_len,
              unsigned int *nresults, unsigned int *max_tagged_len)
{
  size_t i;
  int f = GRN_STR_WITH_CHECKS|GRN_STR_REMOVEBLANK;
  if (!snip || !string || !nresults || !max_tagged_len) {
    return GRN_INVALID_ARGUMENT;
  }
  GRN_API_ENTER;
  /* presumably discards state left over from a previous run -- confirm */
  /* against exec_clean() */
  exec_clean(ctx, snip);
  *nresults = 0;
  snip->nstr = grn_string_open(ctx, string, string_len, snip->normalizer, f);
  if (!snip->nstr) {
    exec_clean(ctx, snip);
    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_string_open on grn_snip_exec failed !");
    GRN_API_RETURN(ctx->rc);
  }
  /* Run the scanner once per condition; presumably this positions each */
  /* condition on its first match (start_offset/end_offset/stopflag) -- */
  /* confirm against grn_bm_tunedbm(). */
  for (i = 0; i < snip->cond_len; i++) {
    grn_bm_tunedbm(ctx, snip->cond + i, snip->nstr, snip->flags);
  }

  {
    /* Write cursors into the result arrays owned by snip. */
    _snip_tag_result *tag_result = snip->tag_result;
    _snip_result *snip_result = snip->snip_result;
    /* last_end_offset: end of the last keyword tagged so far. */
    /* last_last_end_offset: end of the previously emitted snippet window. */
    size_t last_end_offset = 0, last_last_end_offset = 0;
    /* Number of conditions that have not been tagged yet (count == 0). */
    unsigned int unfound_cond_count = snip->cond_len;

    *max_tagged_len = 0;
    /* Outer loop: one iteration per snippet. */
    while (1) {
      size_t tagged_len = 0, last_tag_end = 0;
      int_least8_t all_stop = 1, found_cond = 0;
      snip_result->tag_count = 0;

      /* Inner loop: gather the keyword tags that fall into this snippet. */
      while (1) {
        size_t min_start_offset = (size_t) -1;
        size_t max_end_offset = 0;
        snip_cond *cond = NULL;

        /* Pick the non-stopped condition with the smallest start offset; */
        /* on a tie prefer the one with the larger end offset. */
        for (i = 0; i < snip->cond_len; i++) {
          if (snip->cond[i].stopflag == SNIPCOND_NONSTOP &&
              (min_start_offset > snip->cond[i].start_offset ||
               (min_start_offset == snip->cond[i].start_offset &&
                max_end_offset < snip->cond[i].end_offset))) {
            min_start_offset = snip->cond[i].start_offset;
            max_end_offset = snip->cond[i].end_offset;
            cond = &snip->cond[i];
          }
        }
        /* No condition is in the NONSTOP state: this snippet is complete. */
        if (!cond) {
          break;
        }
        /* check whether the condition is the first one in this snippet */
        if (snip_result->tag_count == 0) {
          /* Skip the condition if the number of remaining snippet slots */
          /* is not larger than the number of unfound keywords and this */
          /* keyword has already been tagged before ... */
          if (snip->max_results - *nresults <= unfound_cond_count && cond->count > 0) {
            int_least8_t exclude_other_cond = 1;
            /* ... unless a not-yet-tagged keyword ends within snip->width */
            /* of this keyword's start, in which case the window is kept. */
            for (i = 0; i < snip->cond_len; i++) {
              if ((snip->cond + i) != cond
                  && snip->cond[i].end_offset <= cond->start_offset + snip->width
                  && snip->cond[i].count == 0) {
                exclude_other_cond = 0;
              }
            }
            if (exclude_other_cond) {
              /* Re-run the scanner on this condition and pick again. */
              grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
              continue;
            }
          }
          /* The first keyword fixes the provisional start of the window. */
          snip_result->start_offset = cond->start_offset;
          snip_result->first_tag_result_idx = snip->tag_count;
        } else {
          /* The keyword starts at or beyond the end of the window: close */
          /* this snippet. */
          if (cond->start_offset >= snip_result->start_offset + snip->width) {
            break;
          }
          /* check nesting to make valid HTML */
          /* ToDo: allow <test><te>te</te><st>st</st></test> */
          /* A keyword starting before the end of the previous tag is not */
          /* tagged; the scanner is re-run on it instead. */
          if (cond->start_offset < last_tag_end) {
            grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
            continue;
          }
        }
        if (cond->end_offset > snip_result->start_offset + snip->width) {
          /* If a keyword straddles the end of the snippet window, */
          /* it is skipped and never tagged. */
          /* NOTE(review): the stopflag is set before the scanner is re-run; */
          /* whether grn_bm_tunedbm() keeps or overwrites it is not visible */
          /* here -- confirm. */
          cond->stopflag = SNIPCOND_ACROSS;
          grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
        } else {
          /* The keyword fits: record a tag for it. */
          found_cond = 1;
          if (cond->count == 0) {
            unfound_cond_count--;
          }
          cond->count++;
          last_end_offset = cond->end_offset;

          tag_result->cond = cond;
          tag_result->start_offset = cond->start_offset;
          tag_result->end_offset = last_tag_end = cond->end_offset;

          snip_result->tag_count++;
          tag_result++;
          /* Account for the open/close tag text wrapped around the keyword. */
          tagged_len += cond->opentag_len + cond->closetag_len;
          /* Stop collecting once the global tag limit is reached. */
          if (++snip->tag_count >= MAX_SNIP_TAG_COUNT) {
            break;
          }
          grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
        }
      }
      /* Nothing was tagged in this pass: there are no more snippets. */
      if (!found_cond) {
        break;
      }
      /* Choose the final window start: 0 when the tagged span lies within */
      /* the first snip->width bytes, otherwise centre the span in the */
      /* window, capped at string_len - snip->width and not below the end */
      /* of the previous snippet. */
      /* NOTE(review): string_len - snip->width looks like it could wrap */
      /* when the string is shorter than the width, depending on the type */
      /* of snip->width -- confirm. */
      if (snip_result->start_offset + last_end_offset < snip->width) {
        snip_result->start_offset = 0;
      } else {
        snip_result->start_offset =
          MAX(MIN
              ((snip_result->start_offset + last_end_offset - snip->width) / 2,
               string_len - snip->width), last_last_end_offset);
      }
      /* presumably moves the offset onto a character boundary for */
      /* snip->encoding -- confirm against grn_snip_find_firstbyte() */
      snip_result->start_offset =
        grn_snip_find_firstbyte(string, snip->encoding, snip_result->start_offset, 1);

      /* The window end is start + width, limited to the end of the string. */
      snip_result->end_offset = snip_result->start_offset + snip->width;
      if (snip_result->end_offset < string_len) {
        snip_result->end_offset =
          grn_snip_find_firstbyte(string, snip->encoding, snip_result->end_offset, -1);
      } else {
        snip_result->end_offset = string_len;
      }
      last_last_end_offset = snip_result->end_offset;

      /* Add the length of the snippet text itself plus one; with the */
      /* (grn_snip_mapping *) -1 sentinel the length comes from */
      /* count_mapped_chars() instead of the raw byte span. */
      /* NOTE(review): the + 1 is presumably room for a terminator -- confirm. */
      if (snip->mapping == (grn_snip_mapping *) -1) {
        tagged_len +=
          count_mapped_chars(&string[snip_result->start_offset],
                             &string[snip_result->end_offset]) + 1;
      } else {
        tagged_len += snip_result->end_offset - snip_result->start_offset + 1;
      }

      *max_tagged_len = MAX(*max_tagged_len, tagged_len);

      /* Close this snippet and move on to the next result slot. */
      snip_result->last_tag_result_idx = snip->tag_count - 1;
      (*nresults)++;
      snip_result++;

      /* Stop when the snippet or tag limit has been reached. */
      if (*nresults == snip->max_results || snip->tag_count == MAX_SNIP_TAG_COUNT) {
        break;
      }
      /* Put every condition that is not in the STOP state (including those */
      /* marked ACROSS) back to NONSTOP for the next snippet. */
      for (i = 0; i < snip->cond_len; i++) {
        if (snip->cond[i].stopflag != SNIPCOND_STOP) {
          all_stop = 0;
          snip->cond[i].stopflag = SNIPCOND_NONSTOP;
        }
      }
      /* Every condition is in the STOP state: nothing more to find. */
      if (all_stop) {
        break;
      }
    }
  }
  /* Publish the results on the snip object. */
  snip->snip_count = *nresults;
  snip->string = string;

  snip->max_tagged_len = *max_tagged_len;

  GRN_API_RETURN(ctx->rc);
}