C++ (Cpp) strfield Exemples

Exemple #1

0

Afficher le fichier

Fichier : mail_daemon.c Projet : marioaugustorama/tropix-cmd

/*
 ****************************************************************
 *	Lê e processa o arquivo					*
 ****************************************************************
 */
void
read_and_process_file (void)
{
	int		file_fd;
	STAT		file_stat;
	char		*file_area, *line;

	/*
	 *	Abre o arquivo, copia todas as linhas para a memória, e apaga o arquivo
	 */
	if ((file_fd = open (mail_daemon_file_nm, O_RDONLY)) < 0)
	{
#ifdef	DEBUG
		error ("*Não consegui abrir \"%s\"", mail_daemon_file_nm);
#endif	DEBUG
		return;
	}

	if (fstat (file_fd, &file_stat) < 0)
		{ error ("*Não consegui obter o estado de \"%s\"", mail_daemon_file_nm); goto bad; }

	file_area = alloca (file_stat.st_size + 1);

	if (read (file_fd, file_area, file_stat.st_size) != file_stat.st_size)
		{ error ("*Não consegui ler o arquivo \"%s\"", mail_daemon_file_nm); goto bad; }

	close (file_fd);

	if (unlink (mail_daemon_file_nm) < 0)
		error ("*Não consegui remover \"%s\"", mail_daemon_file_nm);

	file_area[file_stat.st_size] = '\0';

	/*
	 *	Examina agora, cada linha
	 */
	for (line = strfield (file_area, '\n'); line != NOSTR; line = strfield (NOSTR, '\n'))
	{
		if (line[0] != '\0')
			examine_line (line);
	}

	return;

	/*
	 *	Em caso de erro, ...
	 */
    bad:
	close (file_fd);

}	/* end read_and_process_file */

Exemple #2

0

Afficher le fichier

Fichier : htsrobots.c Projet : AnadoluPanteri/httrack

// fil="" : vérifier si règle déja enregistrée
int checkrobots(robots_wizard* robots,char* adr,char* fil) {
  while(robots) {
    if (strfield2(robots->adr,adr)) {
      if (fil[0]) {
        int ptr=0;
        char line[250];
        if (strnotempty(robots->token)) {
          do {
            ptr+=binput(robots->token+ptr,line,200);
            if (line[0]=='/') {    // absolu
              if (strfield(fil,line)) {                 // commence avec ligne
                return -1;        // interdit
              }
            } else {    // relatif
              if (strstrcase(fil,line)) {
                return -1;
              }
            }
          } while( (strnotempty(line)) && (ptr<(int) strlen(robots->token)) );
        }
      } else {
        return -1;
      }
    }
    robots=robots->next;
  }
  return 0;
}

Exemple #3

0

Afficher le fichier

Fichier : htstools.c Projet : eatonmi/Crawler

// Deuxième partie
HTS_INLINE int __rech_tageq(const char* adr,const char* s) { 
  int p;
  p=strfield(adr,s);
  if (p) {
    while(is_space(adr[p])) p++;
    if (adr[p]=='=') {
      return p+1;
    }
  }
  return 0;
}

Exemple #4

0

Afficher le fichier

Fichier : htstools.c Projet : eatonmi/Crawler

// same, but check begining of adr wirh s (for <object src="bar.mov" .. hotspot123="foo.html">)
HTS_INLINE int __rech_tageqbegdigits(const char* adr,const char* s) { 
  int p;
  p=strfield(adr,s);
  if (p) {
    while(isdigit((unsigned char)adr[p]))  p++;      // jump digits
    while(is_space(adr[p])) p++;
    if (adr[p]=='=') {
      return p+1;
    }
  }
  return 0;
}

Exemple #5

0

Afficher le fichier

Fichier : htstools.c Projet : eatonmi/Crawler

// tag sans =
HTS_INLINE int rech_sampletag(const char* adr,const char* s) { 
  int p;
  if ( (*(adr-1)=='<') || (is_space(*(adr-1))) ) {   // <tag < tag etc
    p=strfield(adr,s);
    if (p) {
      if (!isalnum((unsigned char)adr[p])) {  // <srcbis n'est pas <src
        return 1;
      }
      return 0;
    }
  }
  return 0;
}

Exemple #6

0

Afficher le fichier

Fichier : slibu.c Projet : UlricE/SiagOffice

void lencode_pwent(LISP alist,struct passwd *p)
{p->pw_name = strfield("name",alist);
 p->pw_passwd = strfield("passwd",alist);
 p->pw_uid = longfield("uid",alist);
 p->pw_gid = longfield("gid",alist);
 p->pw_dir = strfield("dir",alist);
 p->pw_gecos = strfield("gecos",alist);
/* FIXME: */
#if defined(__osf__) || defined(hpux) || defined(sun)
 p->pw_comment = strfield("comment",alist);
#endif
#if defined(hpux) || defined(sun)
 p->pw_age = strfield("age",alist);
#endif
#if defined(__osf__)
 p->pw_quota = longfield("quota",alist);
#endif
 p->pw_shell = strfield("shell",alist);}

Exemple #7

0

Afficher le fichier

Fichier : htstools.c Projet : eatonmi/Crawler

HTSEXT_API char* hts_getcategory(const char* filename) {
  String categ = STRING_EMPTY;
  if (fexist(filename)) {
    FILE* fp = fopen(filename, "rb");
    if (fp != NULL) {
      int done=0;
      while(!feof(fp) && !done) {
        char BIGSTK line[1024];
        int n = linput(fp, line, sizeof(line) - 2);
        if (n > 0) {
          if (strfield(line, "category=")) {
            unescapehttp(line+9, &categ);
            done=1;
          }
        }
      }
      fclose(fp);
    }
  }
  return StringBuffRW(categ);
}

Exemple #8

0

Afficher le fichier

Fichier : htstools.c Projet : eatonmi/Crawler

HTSEXT_API char* hts_getcategories(char* path, int type) {
  String categ = STRING_EMPTY;
  String profiles = STRING_EMPTY;
  char* rpath = path;
  find_handle h;
  inthash hashCateg = NULL;
  if (rpath[0]) {
    if (rpath[strlen(rpath)-1]=='/') {
      rpath[strlen(rpath)-1]='\0';      /* note: patching stored (inhash) value */
    }
  }
  h = hts_findfirst(rpath);
  if (h) {
    String iname = STRING_EMPTY;
    if (type == 1) {
      hashCateg = inthash_new(127);
      StringCat(categ, "Test category 1");
      StringCat(categ, "\r\nTest category 2");
    }
    do {
      if (hts_findisdir(h)) {
        char BIGSTK line2[1024];
        StringCopy(iname,rpath);
        StringCat(iname,"/");
        StringCat(iname,hts_findgetname(h));
        StringCat(iname,"/hts-cache/winprofile.ini");
        if (fexist(StringBuff(iname))) {
          if (type == 1) {
            FILE* fp = fopen(StringBuff(iname), "rb");
            if (fp != NULL) {
              int done=0;
              while(!feof(fp) && !done) {
                int n = linput(fp, line2, sizeof(line2) - 2);
                if (n > 0) {
                  if (strfield(line2, "category=")) {
                    if (*(line2+9)) {
                      if (!inthash_read(hashCateg, line2+9, NULL)) {
                        inthash_write(hashCateg, line2+9, 0);
                        if (StringLength(categ) > 0) {
                          StringCat(categ, "\r\n");
                        }
                        unescapehttp(line2+9, &categ);
                      }
                    }
                    done=1;
                  }
                }
              }
              line2[0] = '\0';
              fclose(fp);
            }
          } else {
            if (StringLength(profiles) > 0) {
              StringCat(profiles, "\r\n");
            }
            StringCat(profiles, hts_findgetname(h));
          }
        }
        
      }
    } while(hts_findnext(h));
    hts_findclose(h);
    StringFree(iname);
  }
  if (hashCateg) {
    inthash_delete(&hashCateg);
    hashCateg = NULL;
  }
  if (type == 1)
    return StringBuffRW(categ);
  else
    return StringBuffRW(profiles);
}

Exemple #9

0

Afficher le fichier

Fichier : htstools.c Projet : eatonmi/Crawler

// forme à partir d'un lien et du contexte (origin_fil et origin_adr d'où il est tiré) adr et fil
// [adr et fil sont des buffers de 1ko]
// 0 : ok
// -1 : erreur
// -2 : protocole non supporté (ftp)
int ident_url_relatif(const char *lien,const char* origin_adr,const char* origin_fil,char* adr,char* fil) {
  int ok=0;
  int scheme=0;

  adr[0]='\0'; fil[0]='\0';    //effacer buffers

  // lien non vide!
  if (strnotempty(lien)==0) return -1;    // erreur!

  // Scheme?
  {
    const char* a=lien;
    while (isalpha((unsigned char)*a))
      a++;
    if (*a == ':')
      scheme=1;
  }

  // filtrer les parazites (mailto & cie)
  // scheme+authority (//)
  if (
               (strfield(lien,"http://"))        // scheme+//
            || (strfield(lien,"file://"))   // scheme+//
            || (strncmp(lien,"//",2)==0)    // // sans scheme (-> default)
       ) {
    if (ident_url_absolute(lien,adr,fil)==-1) {        
      ok=-1;    // erreur URL
    }
  }
  else if (strfield(lien,"ftp://")) {
    // Note: ftp:foobar.gif is not valid
    if (ftp_available()) {     // ftp supporté
      if (ident_url_absolute(lien,adr,fil)==-1) {        
        ok=-1;    // erreur URL
      }
    } else {
      ok=-2;  // non supporté
    }
#if HTS_USEMMS
	} else if (strfield(lien,"mms://")) {
		if (ident_url_absolute(lien,adr,fil)==-1) {        
			ok=-1;    // erreur URL
		}
#endif
#if HTS_USEOPENSSL
  } else if (strfield(lien,"https://")) {
    if (SSL_is_available) {
      // Note: ftp:foobar.gif is not valid
      if (ident_url_absolute(lien,adr,fil)==-1) {        
        ok=-1;    // erreur URL
      }
    } else {
      ok=-1;
    }
#endif
  } else if ((scheme) && (
    (!strfield(lien,"http:"))
    && (!strfield(lien,"https:"))
    && (!strfield(lien,"ftp:"))
#if HTS_USEMMS
    && (!strfield(lien,"mms:"))
#endif
    )) {
    ok=-1;      // unknown scheme
  } else {    // c'est un lien relatif
    // On forme l'URL complète à partie de l'url actuelle
    // et du chemin actuel si besoin est.
    
    // copier adresse
    if (((int) strlen(origin_adr)<HTS_URLMAXSIZE) && ((int) strlen(origin_fil)<HTS_URLMAXSIZE) && ((int) strlen(lien)<HTS_URLMAXSIZE)) {

      /* patch scheme if necessary */
      if (strfield(lien,"http:")) {
        lien+=5;
        strcpybuff(adr, jump_protocol(origin_adr));    // même adresse ; protocole vide (http)
      } else if (strfield(lien,"https:")) {
        lien+=6;
        strcpybuff(adr, "https://");   // même adresse forcée en https
        strcatbuff(adr, jump_protocol(origin_adr));
      } else if (strfield(lien,"ftp:")) {
        lien+=4;
        strcpybuff(adr, "ftp://");   // même adresse forcée en ftp
        strcatbuff(adr, jump_protocol(origin_adr));
#if HTS_USEMMS
      } else if (strfield(lien,"mms:")) {
        lien+=4;
        strcpybuff(adr, "mms://");   // même adresse forcée en ftp
        strcatbuff(adr, jump_protocol(origin_adr));
#endif
      } else {
        strcpybuff(adr,origin_adr);    // même adresse ; et même éventuel protocole
      }
      
      if (*lien!='/') {  // sinon c'est un lien absolu
        if (*lien == '\0') {
          strcpybuff(fil,origin_fil);
        } else if (*lien == '?') {     // example: a href="?page=2"
          char* a;
          strcpybuff(fil,origin_fil);
          a=strchr(fil,'?');
          if (a) *a='\0';
          strcatbuff(fil,lien);
        } else {
          const char *a=strchr(origin_fil,'?');
          if (a == NULL) a=origin_fil+strlen(origin_fil);
          while((*a!='/') && ( a > origin_fil) ) a--;
          if (*a=='/') {    // ok on a un '/'
            if ( (((int) (a - origin_fil))+1+strlen(lien)) < HTS_URLMAXSIZE) {
              // copier chemin
              strncpy(fil,origin_fil,((int) (a - origin_fil))+1);
              *(fil + ((int) (a - origin_fil))+1)='\0';
              
              // copier chemin relatif
              if (((int) strlen(fil)+(int) strlen(lien)) < HTS_URLMAXSIZE) {
                strcatbuff(fil,lien + ((*lien=='/')?1:0) );      
                // simplifier url pour les ../
                fil_simplifie(fil);
              } else
                ok=-1;    // erreur
            } else {    // erreur
              ok=-1;    // erreur URL
            }
          } else {    // erreur
            ok=-1;    // erreur URL
          }
        }
      } else { // chemin absolu
        // copier chemin directement
        strcatbuff(fil,lien);      
        fil_simplifie(fil);
      }  // *lien!='/'
    } else
      ok=-1;
    
  }  // test news: etc.

  // case insensitive pour adresse
  {
    char *a=jump_identification(adr);
    while(*a) {
      if ((*a>='A') && (*a<='Z'))
        *a+='a'-'A';       
      a++;
    }
  }
  
  return ok;
}

Exemple #10

0

Afficher le fichier

Fichier : htsfilters.c Projet : Digitalbil/httrack

// supercomparateur joker (tm)
// compare a et b (b=avec joker dedans), case insensitive [voir CI]
// renvoi l'adresse de la première lettre de la chaine
// (càd *[..]toto.. renvoi adresse de toto dans la chaine)
// accepte les délires du genre www.*.*/ * / * truc*.*
// cet algo est 'un peu' récursif mais ne consomme pas trop de tm
// * = toute lettre
// --?-- : spécifique à HTTrack et aux ?
HTS_INLINE const char *strjoker(const char *chaine, const char *joker, LLint * size,
                          int *size_flag) {
  //int err=0;
  if (strnotempty(joker) == 0) {        // fin de chaine joker
    if (strnotempty(chaine) == 0)       // fin aussi pour la chaine: ok
      return chaine;
    else if (chaine[0] == '?')
      return chaine;            // --?-- pour les index.html?Choix=2
    else
      return NULL;              // non trouvé
  }
  // on va progresser en suivant les 'mots' contenus dans le joker
  // un mot peut être un * ou bien toute autre séquence de lettres

  if (strcmp(joker, "*") == 0) {        // ok, rien après
    return chaine;
  }
  // 1er cas: jokers * ou jokers multiples *[..]
  if (joker[0] == '*') {        // comparer joker+reste (*toto/..)
    int jmp;                    // nombre de caractères pour le prochain mot dans joker
    int cut = 0;                // interdire tout caractère superflu
    char pass[256];
    char LEFT = '[', RIGHT = ']';
    int unique = 0;

    switch (joker[1]) {
    case '[':
      LEFT = '[';
      RIGHT = ']';
      unique = 0;
      break;
    case '(':
      LEFT = '(';
      RIGHT = ')';
      unique = 1;
      break;
    }

    if ((joker[1] == LEFT) && (joker[2] != LEFT)) {     // multijoker (tm)
      int i;

      for(i = 0; i < 256; i++)
        pass[i] = 0;

      // noms réservés
      if ((strfield(joker + 2, "file")) || (strfield(joker + 2, "name"))) {
        for(i = 0; i < 256; i++)
          pass[i] = 1;
        pass[(int) '?'] = 0;
        //pass[(int) ';'] = 0;
        pass[(int) '/'] = 0;
        i = 2;
        {
          int len = (int) strlen(joker);

          while((joker[i] != RIGHT) && (joker[i]) && (i < len))
            i++;
        }
      } else if (strfield(joker + 2, "path")) {
        for(i = 0; i < 256; i++)
          pass[i] = 1;
        pass[(int) '?'] = 0;
        //pass[(int) ';'] = 0;
        i = 2;
        {
          int len = (int) strlen(joker);

          while((joker[i] != RIGHT) && (joker[i]) && (i < len))
            i++;
        }
      } else if (strfield(joker + 2, "param")) {
        if (chaine[0] == '?') { // il y a un paramètre juste là
          for(i = 0; i < 256; i++)
            pass[i] = 1;
        }                       // sinon synonyme de 'rien'
        i = 2;
        {
          int len = (int) strlen(joker);

          while((joker[i] != RIGHT) && (joker[i]) && (i < len))
            i++;
        }
      } else {
        // décode les directives comme *[A-Z,âêîôû,0-9]
        i = 2;
        if (joker[i] == RIGHT) {        // *[] signifie "plus rien après"
          cut = 1;              // caractère supplémentaire interdit
        } else {
          int len = (int) strlen(joker);

          while((joker[i] != RIGHT) && (joker[i]) && (i < len)) {
            if ((joker[i] == '<') || (joker[i] == '>')) {       // *[<10]
              int lsize = 0;
              int lverdict;

              i++;
              if (sscanf(joker + i, "%d", &lsize) == 1) {
                if (size) {
                  if (*size >= 0) {
                    if (size_flag)
                      *size_flag = 1;   /* a joué */
                    if (joker[i - 1] == '<')
                      lverdict = (*size < lsize);
                    else
                      lverdict = (*size > lsize);
                    if (!lverdict) {
                      return NULL;      // ne correspond pas
                    } else {
                      *size = lsize;
                      return chaine;    // ok
                    }
                  } else
                    return NULL;        // ne correspond pas
                } else
                  return NULL;  // ne correspond pas (test impossible)
                // jump
                while(isdigit((unsigned char) joker[i]))
                  i++;
              }
            } else if (joker[i + 1] == '-') {   // 2 car, ex: *[A-Z]
              if ((int) (unsigned char) joker[i + 2] >
                  (int) (unsigned char) joker[i]) {
                int j;

                for(j = (int) (unsigned char) joker[i];
                    j <= (int) (unsigned char) joker[i + 2]; j++)
                  pass[j] = 1;

              }
              // else err=1;
              i += 3;
            } else {            // 1 car, ex: *[ ]
              if (joker[i + 2] == '\\' && joker[i + 3] != 0) {  // escaped char, such as *[\[] or *[\]]
                i++;
              }
              pass[(int) (unsigned char) joker[i]] = 1;
              i++;
            }
            if ((joker[i] == ',') || (joker[i] == ';'))
              i++;
          }
        }
      }
      // à sauter dans joker
      jmp = i;
      if (joker[i])
        jmp++;

      //
    } else {                    // tout autoriser
      //
      int i;

      for(i = 0; i < 256; i++)
        pass[i] = 1;            // tout autoriser
      jmp = 1;
      ////if (joker[2]==LEFT) jmp=3;        // permet de recher *<crochet ouvrant>
    }

    {
      int i, max;
      const char *adr;

      // la chaine doit se terminer exactement
      if (cut) {
        if (strnotempty(chaine))
          return NULL;          // perdu
        else
          return chaine;        // ok
      }
      // comparaison en boucle, c'est ca qui consomme huhu..
      // le tableau pass[256] indique les caractères ASCII autorisés

      // tester sans le joker (pas ()+ mais ()*)
      if (!unique) {
        if ((adr = strjoker(chaine, joker + jmp, size, size_flag))) {
          return adr;
        }
      }
      // tester
      i = 0;
      if (!unique)
        max = (int) strlen(chaine);
      else                      /* *(a) only match a (not aaaaa) */
        max = 1;
      while(i < (int) max) {
        if (pass[(int) (unsigned char) chaine[i]]) {    // caractère autorisé
          if ((adr = strjoker(chaine + i + 1, joker + jmp, size, size_flag))) {
            return adr;
          }
          i++;
        } else
          i = max + 2;          // sortir
      }

      // tester chaîne vide
      if (i != max + 2)         // avant c'est ok
        if ((adr = strjoker(chaine + max, joker + jmp, size, size_flag)))
          return adr;

      return NULL;              // perdu
    }

  } else {                      // comparer mot+reste (toto*..)
    if (strnotempty(chaine)) {
      int jmp = 0, ok = 1;

      // comparer début de joker et début de chaine
      while((joker[jmp] != '*') && (joker[jmp]) && (ok)) {
        // CI : remplacer streql par une comparaison !=
        if (!streql(chaine[jmp], joker[jmp])) {
          ok = 0;               // quitter
        }
        jmp++;
      }

      // comparaison ok?
      if (ok) {
        // continuer la comparaison.
        if (strjoker(chaine + jmp, joker + jmp, size, size_flag))
          return chaine;        // retourner 1e lettre
      }

    }                           // strlen(a)
    return NULL;
  }                             // * ou mot

  return NULL;
}

Exemple #11

0

Afficher le fichier

Fichier : htsindex.c Projet : eatonmi/Crawler

/* 
   Indexing system
   A little bit dirty, (quick'n dirty, in fact)
   But should be okay on most cases
   Tags and javascript handled (ignored)
*/
int index_keyword(const char* html_data,LLint size,const char* mime,const char* filename,const char* indexpath) {
#if HTS_MAKE_KEYWORD_INDEX
	char catbuff[CATBUFF_SIZE];
  int intag=0,inscript=0,incomment=0;
  char keyword[KEYW_LEN+32];
  int i=0;
  //
  int WordIndexSize=1024;
  inthash WordIndexHash=NULL;
  FILE *tmpfp=NULL;
  //

  // Check parameters
  if (!html_data)
    return 0;
  if (!size)
    return 0;
  if (!mime)
    return 0;
  if (!filename)
    return 0;

  // Init ?
  if (hts_index_init) {
    remove(concat(catbuff,indexpath,"index.txt"));
    remove(concat(catbuff,indexpath,"sindex.html"));
    hts_index_init=0;
  }

  // Check MIME type
  if (is_html_mime_type(mime)) {
    inscript=0;
  } 
  // FIXME - temporary fix for image/svg+xml (svg)
  // "IN XML" (html like, in fact :) )
  else if (
    (strfield2(mime,"image/svg+xml"))
    ||
    (strfield2(mime,"image/svg-xml"))
#if HTS_USEMMS
		||
		strfield2(mime,"video/x-ms-asf")
#endif
    ) {
    inscript=0;
  }
  else if (
    (strfield2(mime,"application/x-javascript"))
    || (strfield2(mime,"text/css"))
    ) {
    inscript=1;
  //} else if (strfield2(mime, "text/vnd.wap.wml")) {   // humm won't work in many cases
  //  inscript=0;
  } else
    return 0;

  // Temporary file
  tmpfp = tmpfile();
  if (!tmpfp)
    return 0;

  // Create hash structure
  // Hash tables rulez da world!
  WordIndexHash=inthash_new(WordIndexSize);
  if (!WordIndexHash)
    return 0;

  // Start indexing this page
  keyword[0]='\0';
  while(i<size) {
    if (strfield(html_data + i , "<script")) {
      inscript=1;
    } 
    else if (strfield(html_data + i , "<!--")) {
      incomment=1;
    }
    else if (strfield(html_data + i , "</script")) {
      if (!incomment)
        inscript=0;
    } 
    else if (strfield(html_data + i , "-->")) {
      incomment=0;
    }
    else if (html_data[i]=='<') {
      if (!inscript)
        intag=1;
    }    
    else if (html_data[i]=='>') {
      intag=0;
    }    
    else {    
      // Okay, parse keywords
      if ( (!inscript) && (!incomment) && (!intag) ) {
        char cchar=html_data[i];
        int pos;
        int len = (int) strlen(keyword);
        
        // Replace (ignore case, and so on..)
        if ((pos=strcpos(KEYW_TRANSCODE_FROM,cchar))>=0)
          cchar=KEYW_TRANSCODE_TO[pos];
        
        if (strchr(KEYW_ACCEPT,cchar)) {
          /* Ignore some characters at begining */
          if ((len>0) || (!strchr(KEYW_IGNORE_BEG,cchar))) {
            keyword[len++]=cchar;
            keyword[len]='\0';
          }
        } else if ( (strchr(KEYW_SPACE,cchar)) || (!cchar) ) {


          /* Avoid these words */
          if (len>0) {
            if (strchr(KEYW_NOT_BEG,keyword[0])) {
              keyword[(len=0)]='\0';
            }
          }

          /* Strip ending . and so */
          {
            int ok=0;
            while((len = (int) strlen(keyword)) && (!ok)) {
              if (strchr(KEYW_STRIP_END,keyword[len-1])) {      /* strip it */
                keyword[len-1]='\0';
              } else
                ok=1;
            }
          }
          
          /* Store it ? */
          if (len >= KEYW_MIN_LEN ) {
            hts_primindex_words++;
            if (inthash_inc(WordIndexHash,keyword)) {   /* added new */
              fprintf(tmpfp,"%s\n",keyword);
            }
          }
          keyword[(len=0)]='\0';
        } else      /* Invalid */
          keyword[(len=0)]='\0';

        if (len>KEYW_LEN) {
          keyword[(len=0)]='\0';
        }
      }
      
    }
    
    i++;
  }

  // Reset temp file
  fseek(tmpfp,0,SEEK_SET);

  // Process indexing for this page
  {
    //FILE* fp=NULL;
    //fp=fopen(concat(indexpath,"index.txt"),"ab");
    if (fp_tmpproject) {
      while(!feof(tmpfp)) {
        char line[KEYW_LEN + 32];
        linput(tmpfp,line,KEYW_LEN + 2);
        if (strnotempty(line)) {
          intptr_t e=0;
          if (inthash_read(WordIndexHash,line,&e)) {
            //if (e) {
            char BIGSTK savelst[HTS_URLMAXSIZE*2];
            e++;          /* 0 means "once" */
            
            if (strncmp((const char*)fslash(catbuff,(char*)indexpath),filename,strlen(indexpath))==0)  // couper
              strcpybuff(savelst,filename+strlen(indexpath));
            else
              strcpybuff(savelst,filename);
            
            // Add entry for this file and word
            fprintf(fp_tmpproject,"%s %d %s\n",line,(int) (KEYW_SORT_MAXCOUNT - e),savelst);
            hts_primindex_size++;
            //}
          }
        }
      }
      //fclose(fp);
    }
  }

  // Delete temp file
  fclose(tmpfp);
  tmpfp=NULL;

  // Clear hash table
  inthash_delete(&WordIndexHash);
#endif
  return 1;
}

Exemple #12

0

Afficher le fichier

Fichier : htswizard.c Projet : eatonmi/Crawler

static int hts_acceptlink_(httrackp* opt,
													int ptr,int lien_tot,lien_url** liens,
													char* adr,char* fil,
													char* tag, char* attribute,
													int* set_prio_to,
													int* just_test_it) 
{
  int forbidden_url=-1;
  int meme_adresse;
	int embedded_triggered = 0;
#define _FILTERS     (*opt->filters.filters)
#define _FILTERS_PTR (opt->filters.filptr)
#define _ROBOTS      ((robots_wizard*)opt->robotsptr)
  int may_set_prio_to=0;

  // -------------------- PHASE 0 --------------------

  /* Infos */
  if ((opt->debug>1) && (opt->log!=NULL)) {
    HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"wizard test begins: %s%s"LF,adr,fil);
    test_flush;
  }
  
  /* Already exists? Then, we know that we knew that this link had to be known */
  if (adr[0] != '\0'
    && fil[0] != '\0'
    && opt->hash != NULL
    && hash_read(opt->hash, adr, fil, 1, opt->urlhack) >= 0
    ) {
    return 0;  /* Yokai */
  }
  
  // -------------------- PRELUDE OF PHASE 3-BIS --------------------

	/* Built-in known tags (<img src=..>, ..) */
	if (forbidden_url != 0 && opt->nearlink && tag != NULL && attribute != NULL) {
		int i;
		for(i = 0 ; hts_detect_embed[i].tag != NULL ; i++) {
			if (cmp_token(tag, hts_detect_embed[i].tag)
				&& cmp_token(attribute, hts_detect_embed[i].attr)
				) 
			{
				embedded_triggered = 1;
				break;
			}
		}
	}


  // -------------------- PHASE 1 --------------------

  /* Doit-on traiter les non html? */
  if ((opt->getmode & 2)==0) {    // non on ne doit pas
    if (!ishtml(opt,fil)) {  // non il ne faut pas
      //adr[0]='\0';    // ne pas traiter ce lien, pas traiter
      forbidden_url=1;    // interdire récupération du lien
      if ((opt->debug>1) && (opt->log!=NULL)) {
        HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"non-html file ignored at %s : %s"LF,adr,fil);
        test_flush;
      }
      
    }
  }
  
  /* Niveau 1: ne pas parser suivant! */
  if (ptr>0) {
    if ( ( liens[ptr]->depth <= 0 ) || ( liens[ptr]->depth <= 1 && !embedded_triggered ) ) {
      forbidden_url=1;    // interdire récupération du lien
      if ((opt->debug>1) && (opt->log!=NULL)) {
        HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"file from too far level ignored at %s : %s"LF,adr,fil);
        test_flush;
      }
    }
  }

  /* en cas d'échec en phase 1, retour immédiat! */
  if (forbidden_url == 1) {
    return forbidden_url;
  }
  
  // -------------------- PHASE 2 --------------------

  // ------------------------------------------------------
  // doit-on traiter ce lien?.. vérifier droits de déplacement
  meme_adresse=strfield2(adr,urladr);
  if ((opt->debug>1) && (opt->log!=NULL)) {
    HTS_LOG(opt,LOG_DEBUG); 
    if (meme_adresse) 
      fprintf(opt->log,"Compare addresses: %s=%s"LF,adr,urladr);
    else
      fprintf(opt->log,"Compare addresses: %s!=%s"LF,adr,urladr);
    test_flush;
  }
  if (meme_adresse) {  // même adresse 
    {  // tester interdiction de descendre
      // MODIFIE : en cas de remontée puis de redescente, il se pouvait qu'on ne puisse pas atteindre certains fichiers
      // problème: si un fichier est virtuellement accessible via une page mais dont le lien est sur une autre *uniquement*..
      char BIGSTK tempo[HTS_URLMAXSIZE*2];
      char BIGSTK tempo2[HTS_URLMAXSIZE*2];
      tempo[0] = tempo2[0] = '\0';
      
      // note (up/down): on calcule à partir du lien primaire, ET du lien précédent.
      // ex: si on descend 2 fois on peut remonter 1 fois
      
      if (lienrelatif(tempo,fil,liens[liens[ptr]->premier]->fil)==0) {
        if (lienrelatif(tempo2,fil,liens[ptr]->fil)==0) {
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"build relative links to test: %s %s (with %s and %s)"LF,tempo,tempo2,liens[liens[ptr]->premier]->fil,liens[ptr]->fil);
            test_flush;
          }
          
          // si vient de primary, ne pas tester lienrelatif avec (car host "différent")
          /*if (liens[liens[ptr]->premier] == 0) {   // vient de primary
          }
          */
          
          // NEW: finalement OK, sauf pour les moved repérés par link_import
          // PROBLEME : annulé a cause d'un lien éventuel isolé accepté..qui entrainerait un miroir
          
          // (test même niveau (NOUVEAU à cause de certains problèmes de filtres non intégrés))
          // NEW
          if ( 
            (tempo[0]  != '\0' && tempo[1]  != '\0' && strchr(tempo+1,'/') == 0)
            ||
            (tempo2[0] != '\0' && tempo2[1] != '\0' && strchr(tempo2+1,'/') == 0) 
            ) {
            if (!liens[ptr]->link_import) {   // ne résulte pas d'un 'moved'
              forbidden_url=0;
              if ((opt->debug>1) && (opt->log!=NULL)) {
                HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"same level link authorized: %s%s"LF,adr,fil);
                test_flush;
             }
            }
          }
          
          // down
          if ( (strncmp(tempo,"../",3)) || (strncmp(tempo2,"../",3)))  {   // pas montée sinon ne nbous concerne pas
            int test1,test2;
            if (!strncmp(tempo,"../",3))
              test1=0;
            else
              test1 = (strchr(tempo +((*tempo =='/')?1:0),'/')!=NULL);
            if (!strncmp(tempo2,"../",3))
              test2=0;
            else
              test2 = (strchr(tempo2+((*tempo2=='/')?1:0),'/')!=NULL);
            if ( (test1) && (test2) ) {   // on ne peut que descendre
              if ((opt->seeker & 1)==0) {  // interdiction de descendre
                forbidden_url=1;
                if ((opt->debug>1) && (opt->log!=NULL)) {
                  HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"lower link canceled: %s%s"LF,adr,fil);
                  test_flush;
                }
              } else {    // autorisé à priori - NEW
                if (!liens[ptr]->link_import) {   // ne résulte pas d'un 'moved'
                  forbidden_url=0;
                  if ((opt->debug>1) && (opt->log!=NULL)) {
                    HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"lower link authorized: %s%s"LF,adr,fil);
                    test_flush;
                  }
                }
              }
            } else if ( (test1) || (test2) ) {   // on peut descendre pour accéder au lien
              if ((opt->seeker & 1)!=0) {  // on peut descendre - NEW
                if (!liens[ptr]->link_import) {   // ne résulte pas d'un 'moved'
                  forbidden_url=0;
                  if ((opt->debug>1) && (opt->log!=NULL)) {
                    HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"lower link authorized: %s%s"LF,adr,fil);
                    test_flush;
                  }
                }
              }
            }
          }
          
          
          // up
          if ( (!strncmp(tempo,"../",3)) && (!strncmp(tempo2,"../",3)) ) {    // impossible sans monter
            if ((opt->seeker & 2)==0) {  // interdiction de monter
              forbidden_url=1;
              if ((opt->debug>1) && (opt->log!=NULL)) {
                HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"upper link canceled: %s%s"LF,adr,fil);
                test_flush;
              }
            } else {       // autorisé à monter - NEW
              if (!liens[ptr]->link_import) {   // ne résulte pas d'un 'moved'
                forbidden_url=0;
                if ((opt->debug>1) && (opt->log!=NULL)) {
                  HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"upper link authorized: %s%s"LF,adr,fil);
                  test_flush;
                }
              }
            }
          } else if ( (!strncmp(tempo,"../",3)) || (!strncmp(tempo2,"../",3)) ) {    // Possible en montant
            if ((opt->seeker & 2)!=0) {  // autorisé à monter - NEW
              if (!liens[ptr]->link_import) {   // ne résulte pas d'un 'moved'
                forbidden_url=0;
                if ((opt->debug>1) && (opt->log!=NULL)) {
                  HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"upper link authorized: %s%s"LF,adr,fil);
                  test_flush;
                }
              }
            }  // sinon autorisé en descente
          }
          
          
        } else {
          if (opt->log) {
            fprintf(opt->log,"Error building relative link %s and %s"LF,fil,liens[ptr]->fil);
            test_flush;
          }
        }
      } else {
        if (opt->log) {
          fprintf(opt->log,"Error building relative link %s and %s"LF,fil,liens[liens[ptr]->premier]->fil);
          test_flush;
        }
      }
      
    }  // tester interdiction de descendre?
    
    {  // tester interdiction de monter
      char BIGSTK tempo[HTS_URLMAXSIZE*2];
      char BIGSTK tempo2[HTS_URLMAXSIZE*2];
      if (lienrelatif(tempo,fil,liens[liens[ptr]->premier]->fil)==0) {
        if (lienrelatif(tempo2,fil,liens[ptr]->fil)==0) {
        } else {
          if (opt->log) { 
            fprintf(opt->log,"Error building relative link %s and %s"LF,fil,liens[ptr]->fil);
            test_flush;
          }
          
        }
      } else {
        if (opt->log) { 
          fprintf(opt->log,"Error building relative link %s and %s"LF,fil,liens[liens[ptr]->premier]->fil);
          test_flush;
        }
        
      }
    }   // fin tester interdiction de monter
    
  } else {    // adresse différente, sortir?
    
    //if (!opt->wizard) {    // mode non wizard
    // doit-on traiter ce lien?.. vérifier droits de sortie
    switch((opt->travel & 255)) {
    case 0: 
      if (!opt->wizard)    // mode non wizard
        forbidden_url=1; break;    // interdicton de sortir au dela de l'adresse
    case 1: {              // sortie sur le même dom.xxx
      size_t i = strlen(adr)-1;
      size_t j = strlen(urladr)-1;
      while( (i>0) && (adr[i]!='.')) i--;
      while( (j>0) && (urladr[j]!='.')) j--;
      i--; j--;
      while( (i>0) && (adr[i]!='.')) i--;
      while( (j>0) && (urladr[j]!='.')) j--;
      if ((i>0) && (j>0)) {
        if (!strfield2(adr+i,urladr+j)) {   // !=
          if (!opt->wizard) {   // mode non wizard
            //printf("refused: %s\n",adr);
            forbidden_url=1;  // pas même domaine  
            if ((opt->debug>1) && (opt->log!=NULL)) {
              HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"foreign domain link canceled: %s%s"LF,adr,fil);
              test_flush;
            }
          }
          
        } else {
          if (opt->wizard) {   // mode wizard
            forbidden_url=0;  // même domaine  
            if ((opt->debug>1) && (opt->log!=NULL)) {
              HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"same domain link authorized: %s%s"LF,adr,fil);
              test_flush;
            }
          }
        }
        
      } else
        forbidden_url=1;
            } 
      break;  
    case 2: {                      // sortie sur le même .xxx
      size_t i = strlen(adr)-1;
      size_t j = strlen(urladr)-1;
      while( (i>0) && (adr[i]!='.')) i--;
      while( (j>0) && (urladr[j]!='.')) j--;
      if ((i>0) && (j>0)) {
        if (!strfield2(adr+i,urladr+j)) {   // !-
          if (!opt->wizard) {   // mode non wizard
            //printf("refused: %s\n",adr);
            forbidden_url=1;  // pas même .xx  
            if ((opt->debug>1) && (opt->log!=NULL)) {
              HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"foreign location link canceled: %s%s"LF,adr,fil);
              test_flush;
            }
          }
        } else {
          if (opt->wizard) {   // mode wizard
            forbidden_url=0;  // même domaine  
            if ((opt->debug>1) && (opt->log!=NULL)) {
              HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"same location link authorized: %s%s"LF,adr,fil);
              test_flush;
            }
          }
        }
      } else forbidden_url=1;     
            } 
      break;
    case 7:                 // everywhere!!
      if (opt->wizard) {   // mode wizard
        forbidden_url=0;
        break;
      }
    }  // switch
    
    // ANCIENNE POS -- récupérer les liens à côtés d'un lien (nearlink)
    
  }  // fin test adresse identique/différente

  // -------------------- PHASE 3 --------------------

  // récupérer les liens à côtés d'un lien (nearlink) (nvelle pos)
  if (forbidden_url != 0 && opt->nearlink) {
    if (!ishtml(opt,fil)) {  // non html
      //printf("ok %s%s\n",ad,fil);
      forbidden_url=0;    // autoriser
      may_set_prio_to=1+1; // set prio to 1 (parse but skip urls) if near is the winner
      if ((opt->debug>1) && (opt->log!=NULL)) {
        HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"near link authorized: %s%s"LF,adr,fil);
        test_flush;
      }
    }
  }

  // -------------------- PHASE 3-BIS --------------------

	/* Built-in known tags (<img src=..>, ..) */
	if (forbidden_url != 0 && embedded_triggered) {
		forbidden_url=0;    // autoriser
		may_set_prio_to=1+1; // set prio to 1 (parse but skip urls) if near is the winner
		if ((opt->debug>1) && (opt->log!=NULL)) {
			HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"near link authorized (friendly tag): %s%s"LF,adr,fil);
			test_flush;
		}
	}


  // -------------------- PHASE 4 --------------------
  
  // ------------------------------------------------------
  // Si wizard, il se peut qu'on autorise ou qu'on interdise 
  // un lien spécial avant même de tester sa position, sa hiérarchie etc.
  // peut court-circuiter le forbidden_url précédent
  if (opt->wizard) { // le wizard entre en action..
    //
    int question=1;         // poser une question                            
    int force_mirror=0;     // pour mirror links
    int filters_answer=0;   // décision prise par les filtres
    char BIGSTK l[HTS_URLMAXSIZE*2];
    char BIGSTK lfull[HTS_URLMAXSIZE*2];
    
    if (forbidden_url!=-1) question=0;  // pas de question, résolu
    
    // former URL complète du lien actuel
    strcpybuff(l,jump_identification(adr));
    if (*fil!='/') strcatbuff(l,"/");
    strcatbuff(l,fil);
    // full version (http://foo:[email protected]/bar.html)
    if (!link_has_authority(adr))
      strcpybuff(lfull,"http://");
    else
      lfull[0]='\0';
    strcatbuff(lfull,adr);
    if (*fil!='/') strcatbuff(lfull,"/");
    strcatbuff(lfull,fil);
    
    // tester filters (URLs autorisées ou interdites explicitement)
    
    // si lien primaire on saute le joker, on est pas lémur
    if (ptr==0) {  // lien primaire, autoriser
      question=1;    // la question sera résolue automatiquement
      forbidden_url=0;
      may_set_prio_to=0;    // clear may-set flag
    } else {
      // eternal depth first
      // vérifier récursivité extérieure
      if (opt->extdepth>0) {
        if ( /*question && */ (ptr>0) && (!force_mirror)) {
          // well, this is kinda a hak
          // we don't want to mirror EVERYTHING, and we have to decide where to stop
          // there is no way yet to tag "external" links, and therefore links that are
          // "weak" (authorized depth < external depth) are just not considered for external
          // hack
          if (liens[ptr]->depth > opt->extdepth) {
            // *set_prio_to = opt->extdepth + 1;
            *set_prio_to = 1 + (opt->extdepth);
            may_set_prio_to=0;  // clear may-set flag
            forbidden_url=0;    // autorisé
            question=0;         // résolution auto
            if ((opt->debug>1) && (opt->log!=NULL)) {
              if (question) {
                HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) ambiguous link accepted (external depth): link %s at %s%s"LF,l,urladr,urlfil);
              } else {
                HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) forced to accept link (external depth): link %s at %s%s"LF,l,urladr,urlfil);
              }
              test_flush;
            }
            
          }
        }
      }  
      
      // filters
      {
        int jok;
        char* mdepth="";
        // filters, 0=sait pas 1=ok -1=interdit
        {
          int jokDepth1=0,jokDepth2=0;
          int jok1=0,jok2=0;
          jok1  = fa_strjoker(/*url*/0, _FILTERS,*_FILTERS_PTR,lfull,NULL,NULL,&jokDepth1);
          jok2 =  fa_strjoker(/*url*/0, _FILTERS,*_FILTERS_PTR,l,    NULL,NULL,&jokDepth2);
          if (jok2 == 0) {      // #2 doesn't know
            jok = jok1;        // then, use #1
            mdepth = _FILTERS[jokDepth1];
          } else if (jok1 == 0) { // #1 doesn't know
            jok = jok2;        // then, use #2
            mdepth = _FILTERS[jokDepth2];
          } else if (jokDepth1 >= jokDepth2) { // #1 matching rule is "after" #2, then it is prioritary
            jok = jok1;
            mdepth = _FILTERS[jokDepth1];
          } else {                             // #2 matching rule is "after" #1, then it is prioritary
            jok = jok2;
            mdepth = _FILTERS[jokDepth2];
          }
        }
        
        if (jok == 1) {   // autorisé
          filters_answer=1;  // décision prise par les filtres
          question=0;    // ne pas poser de question, autorisé
          forbidden_url=0;  // URL autorisée
          may_set_prio_to=0;    // clear may-set flag
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) explicit authorized (%s) link: link %s at %s%s"LF,mdepth,l,urladr,urlfil);
            test_flush;
          }
        } else if (jok == -1) {  // forbidden
          filters_answer=1;  // décision prise par les filtres
          question=0;    // ne pas poser de question:
          forbidden_url=1;   // URL interdite
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) explicit forbidden (%s) link: link %s at %s%s"LF,mdepth,l,urladr,urlfil);
            test_flush;
          }
        }  // sinon on touche à rien
      }
    }
    
    // vérifier mode mirror links
    if (question) {
      if (opt->mirror_first_page) {    // mode mirror links
        if (liens[ptr]->precedent==0) {  // parent=primary!
          forbidden_url=0;    // autorisé
          may_set_prio_to=0;    // clear may-set flag
          question=1;         // résolution auto
          force_mirror=5;     // mirror (5)
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) explicit mirror link: link %s at %s%s"LF,l,urladr,urlfil);
            test_flush;
          }
        }
      }
    }
    
    // on doit poser la question.. peut on la poser?
    // (oui je sais quel preuve de délicatesse, merci merci)      
    if ((question) && (ptr>0) && (!force_mirror)) {
      if (opt->wizard==2) {    // éliminer tous les liens non répertoriés comme autorisés (ou inconnus)
        question=0;
        forbidden_url=1;
        if ((opt->debug>1) && (opt->log!=NULL)) {
          HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) ambiguous forbidden link: link %s at %s%s"LF,l,urladr,urlfil);
          test_flush;
        }
      }
    }
    
    // vérifier robots.txt
    if (opt->robots) {
      int r = checkrobots(_ROBOTS,adr,fil);
      if (r == -1) {    // interdiction
#if DEBUG_ROBOTS
        printf("robots.txt forbidden: %s%s\n",adr,fil);
#endif
        // question résolue, par les filtres, et mode robot non strict
        if ((!question) && (filters_answer) && (opt->robots == 1) && (forbidden_url!=1)) {
          r=0;    // annuler interdiction des robots
          if (!forbidden_url) {
            if ((opt->debug>1) && (opt->log!=NULL)) {
              HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Warning link followed against robots.txt: link %s at %s%s"LF,l,adr,fil);
              test_flush;
            }
          }
        }
        if (r == -1) {    // interdire
          forbidden_url=1;
          question=0;
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(robots.txt) forbidden link: link %s at %s%s"LF,l,adr,fil);
            test_flush;
          }
        }
      }
    }
    
    if (!question) {
      if ((opt->debug>1) && (opt->log!=NULL)) {
        if (!forbidden_url) {
          HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) shared foreign domain link: link %s at %s%s"LF,l,urladr,urlfil);
        } else {
          HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) cancelled foreign domain link: link %s at %s%s"LF,l,urladr,urlfil);
        }
        test_flush;
      }
#if BDEBUG==3
      printf("at %s in %s, wizard says: url %s ",urladr,urlfil,l);
      if (forbidden_url) printf("cancelled"); else printf(">SHARED<");
      printf("\n");
#endif 
    }

    /* en cas de question, ou lien primaire (enregistrer autorisations) */
    if (question || (ptr==0)) {
      const char* s;
      int n=0;
      
      // si primaire (plus bas) alors ...
      if ((ptr!=0) && (force_mirror==0)) {
        char BIGSTK tempo[HTS_URLMAXSIZE*2];
        tempo[0]='\0';
        strcatbuff(tempo,adr);
        strcatbuff(tempo,fil);
        s = RUN_CALLBACK1(opt, query3, tempo);
        if (strnotempty(s)==0)  // entrée
            n=0;
          else if (isdigit((unsigned char)*s))
            sscanf(s,"%d",&n);
          else {
            switch(*s) {
            case '*': n=-1; break;
            case '!': n=-999; {
              /*char *a;
              int i;                                    
              a=copie_de_adr-128;
              if (a<r.adr) a=r.adr;
              for(i=0;i<256;i++) {
                if (a==copie_de_adr) printf("\nHERE:\n");
                printf("%c",*a++);
              }
              printf("\n\n");
              */
                      }
              break;
            default: n=-999; printf("What did you say?\n"); break;
              
            } 
          }
        io_flush;
      } else {   // lien primaire: autoriser répertoire entier       
        if (!force_mirror) {
          if ((opt->seeker & 1)==0) {  // interdiction de descendre
            n=7;
          } else {
            n=5;   // autoriser miroir répertoires descendants (lien primaire)
          }
        } else   // forcer valeur (sub-wizard)
          n=force_mirror;
      }
      
      /* sanity check - reallocate filters HERE */
      if ((*_FILTERS_PTR) + 1 >= opt->maxfilter) {
        opt->maxfilter += HTS_FILTERSINC;
        if (filters_init(&_FILTERS, opt->maxfilter, HTS_FILTERSINC) == 0) {
          printf("PANIC! : Too many filters : >%d [%d]\n", (*_FILTERS_PTR),__LINE__);
          fflush(stdout);
          if (opt->log) {
            fprintf(opt->log,LF"Too many filters, giving up..(>%d)"LF, (*_FILTERS_PTR) );
            fprintf(opt->log,"To avoid that: use #F option for more filters (example: -#F5000)"LF);
            test_flush;
          }
          assertf("too many filters - giving up" == NULL);    // wild..
        }
      }

      // here we have enough room for a new filter if necessary
      switch(n) {
      case -1: // sauter tout le reste
        forbidden_url=1;
        opt->wizard=2;    // sauter tout le reste
        break;
      case 0:    // interdire les mêmes liens: adr/fil
        forbidden_url=1; 
        HT_INSERT_FILTERS0;    // insérer en 0
        strcpybuff(_FILTERS[0],"-");
        strcatbuff(_FILTERS[0],jump_identification(adr));
        if (*fil!='/') strcatbuff(_FILTERS[0],"/");
        strcatbuff(_FILTERS[0],fil);
        break;
        
      case 1: // éliminer répertoire entier et sous rép: adr/path/ *
        forbidden_url=1;
        {
          size_t i = strlen(fil)-1;
          while((fil[i]!='/') && (i>0)) i--;
          if (fil[i]=='/') {
            HT_INSERT_FILTERS0;    // insérer en 0
            strcpybuff(_FILTERS[0],"-");
            strcatbuff(_FILTERS[0],jump_identification(adr));
            if (*fil!='/') strcatbuff(_FILTERS[0],"/");
            strncatbuff(_FILTERS[0] ,fil,i);
            if (_FILTERS[0][strlen(_FILTERS[0])-1]!='/') 
              strcatbuff(_FILTERS[0],"/");
            strcatbuff(_FILTERS[0],"*");
          }
        }            
        
        // ** ...
        break;
        
      case 2:    // adresse adr*
        forbidden_url=1;
        HT_INSERT_FILTERS0;    // insérer en 0                                
        strcpybuff(_FILTERS[0],"-");
        strcatbuff(_FILTERS[0],jump_identification(adr));
        strcatbuff(_FILTERS[0],"*");
        break;
        
      case 3: // ** A FAIRE
        forbidden_url=1;
        /*
        {
        int i=strlen(adr)-1;
        while((adr[i]!='/') && (i>0)) i--;
        if (i>0) {
        
          }
          
      }*/
        
        break;
        //
      case 4:    // same link
        // PAS BESOIN!!
        /*HT_INSERT_FILTERS0;    // insérer en 0                                
        strcpybuff(_FILTERS[0],"+");
        strcatbuff(_FILTERS[0],adr);
        if (*fil!='/') strcatbuff(_FILTERS[0],"/");
        strcatbuff(_FILTERS[0],fil);*/
        
        
        // étant donné le renversement wizard/primary filter (les primary autorisent up/down ET interdisent)
        // il faut éviter d'un lien isolé effectue un miroir total..
        
        *set_prio_to = 0+1;    // niveau de récursion=0 (pas de miroir)
        
        break;
        
      case 5:    // autoriser répertoire entier et fils
        if ((opt->seeker & 2)==0) {  // interdiction de monter
          size_t i = strlen(fil)-1;
          while((fil[i]!='/') && (i>0)) i--;
          if (fil[i]=='/') {
            HT_INSERT_FILTERS0;    // insérer en 0                                
            strcpybuff(_FILTERS[0],"+");
            strcatbuff(_FILTERS[0],jump_identification(adr));
            if (*fil!='/') strcatbuff(_FILTERS[0],"/");
            strncatbuff(_FILTERS[0],fil,i+1);
            strcatbuff(_FILTERS[0],"*");
          }
        } else {    // autoriser domaine alors!!
          HT_INSERT_FILTERS0;    // insérer en 0                                strcpybuff(filters[filptr],"+");
          strcpybuff(_FILTERS[0],"+");
          strcatbuff(_FILTERS[0],jump_identification(adr));
          strcatbuff(_FILTERS[0],"*");
        }
        break;
        
      case 6:    // same domain
        HT_INSERT_FILTERS0;    // insérer en 0                                strcpybuff(filters[filptr],"+");
        strcpybuff(_FILTERS[0],"+");
        strcatbuff(_FILTERS[0],jump_identification(adr));
        strcatbuff(_FILTERS[0],"*");
        break;
        //
      case 7:    // autoriser ce répertoire
        {
          size_t i = strlen(fil)-1;
          while((fil[i]!='/') && (i>0)) i--;
          if (fil[i]=='/') {
            HT_INSERT_FILTERS0;    // insérer en 0                                
            strcpybuff(_FILTERS[0],"+");
            strcatbuff(_FILTERS[0],jump_identification(adr));
            if (*fil!='/') strcatbuff(_FILTERS[0],"/");
            strncatbuff(_FILTERS[0],fil,i+1);
            strcatbuff(_FILTERS[0],"*[file]");
          }
        }
        
        break;
        
      case 50:    // on fait rien
        break;
      }  // switch 
                              
    }  // test du wizard sur l'url
  }  // fin du test wizard..

  // -------------------- PHASE 5 --------------------

  // lien non autorisé, peut-on juste le tester?
  if (just_test_it) {
    if (forbidden_url==1) {
      if (opt->travel&256) {    // tester tout de même
        if (strfield(adr,"ftp://")==0
#if HTS_USEMMS
					&& strfield(adr,"mms://")==0
#endif
					) {    // PAS ftp!
          forbidden_url=1;    // oui oui toujours interdit (note: sert à rien car ==1 mais c pour comprendre)
          *just_test_it=1;     // mais on teste
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Testing link %s%s"LF,adr,fil);
          }
        }
      }
    }
    //adr[0]='\0';  // cancel
  }

  // -------------------- FINAL PHASE --------------------
  // Test if the "Near" test won
  if (may_set_prio_to && forbidden_url == 0) {
    *set_prio_to = may_set_prio_to;
  }

  return forbidden_url;
#undef _FILTERS
#undef _FILTERS_PTR
#undef _ROBOTS
}

Exemple #13

0

Afficher le fichier

Fichier : htstools.c Projet : AmesianX/HackingStuff

// forme à partir d'un lien et du contexte (origin_fil et origin_adr d'où il est tiré) adr et fil
// [adr et fil sont des buffers de 1ko]
// 0 : ok
// -1 : erreur
// -2 : protocole non supporté (ftp)
int ident_url_relatif(const char *lien, const char *origin_adr,
                      const char *origin_fil,
                      lien_adrfil* const adrfil) {
  int ok = 0;
  int scheme = 0;

  assertf(adrfil != NULL);

  adrfil->adr[0] = '\0';
  adrfil->fil[0] = '\0';                //effacer buffers

  // lien non vide!
  if (strnotempty(lien) == 0)
    return -1;                  // erreur!

  // Scheme?
  {
    const char *a = lien;

    while(isalpha((unsigned char) *a))
      a++;
    if (*a == ':')
      scheme = 1;
  }

  // filtrer les parazites (mailto & cie)
  // scheme+authority (//)
  if ((strfield(lien, "http://"))       // scheme+//
      || (strfield(lien, "file://"))    // scheme+//
      || (strncmp(lien, "//", 2) == 0)  // // sans scheme (-> default)
    ) {
    if (ident_url_absolute(lien, adrfil) == -1) {
      ok = -1;                  // erreur URL
    }
  } else if (strfield(lien, "ftp://")) {
    // Note: ftp:foobar.gif is not valid
    if (ftp_available()) {      // ftp supporté
      if (ident_url_absolute(lien, adrfil) == -1) {
        ok = -1;                // erreur URL
      }
    } else {
      ok = -2;                  // non supporté
    }
#if HTS_USEOPENSSL
  } else if (strfield(lien, "https://")) {
    // Note: ftp:foobar.gif is not valid
    if (ident_url_absolute(lien, adrfil) == -1) {
      ok = -1;                // erreur URL
    }
#endif
  } else if ((scheme) && ((!strfield(lien, "http:"))
                          && (!strfield(lien, "https:"))
                          && (!strfield(lien, "ftp:"))
             )) {
    ok = -1;                    // unknown scheme
  } else {                      // c'est un lien relatif
    // On forme l'URL complète à partie de l'url actuelle
    // et du chemin actuel si besoin est.

    // sanity check
    if (origin_adr == NULL || origin_fil == NULL 
      || *origin_adr == '\0' || *origin_fil == '\0') {
      return -1;
    }

    // copier adresse
    if (((int) strlen(origin_adr) < HTS_URLMAXSIZE)
        && ((int) strlen(origin_fil) < HTS_URLMAXSIZE)
        && ((int) strlen(lien) < HTS_URLMAXSIZE)) {

      /* patch scheme if necessary */
      if (strfield(lien, "http:")) {
        lien += 5;
        strcpybuff(adrfil->adr, jump_protocol_const(origin_adr));     // même adresse ; protocole vide (http)
      } else if (strfield(lien, "https:")) {
        lien += 6;
        strcpybuff(adrfil->adr, "https://");    // même adresse forcée en https
        strcatbuff(adrfil->adr, jump_protocol_const(origin_adr));
      } else if (strfield(lien, "ftp:")) {
        lien += 4;
        strcpybuff(adrfil->adr, "ftp://");      // même adresse forcée en ftp
        strcatbuff(adrfil->adr, jump_protocol_const(origin_adr));
      } else {
        strcpybuff(adrfil->adr, origin_adr);    // même adresse ; et même éventuel protocole
      }

      if (*lien != '/') {       // sinon c'est un lien absolu
        if (*lien == '\0') {
          strcpybuff(adrfil->fil, origin_fil);
        } else if (*lien == '?') {      // example: a href="?page=2"
          char *a;

          strcpybuff(adrfil->fil, origin_fil);
          a = strchr(adrfil->fil, '?');
          if (a)
            *a = '\0';
          strcatbuff(adrfil->fil, lien);
        } else {
          const char *a = strchr(origin_fil, '?');

          if (a == NULL)
            a = origin_fil + strlen(origin_fil);
          while((*a != '/') && (a > origin_fil))
            a--;
          if (*a == '/') {      // ok on a un '/'
            if ((((int) (a - origin_fil)) + 1 + strlen(lien)) < HTS_URLMAXSIZE) {
              // copier chemin
              strncpy(adrfil->fil, origin_fil, ((int) (a - origin_fil)) + 1);
              *(adrfil->fil + ((int) (a - origin_fil)) + 1) = '\0';

              // copier chemin relatif
              if (((int) strlen(adrfil->fil) + (int) strlen(lien)) < HTS_URLMAXSIZE) {
                strcatbuff(adrfil->fil, lien + ((*lien == '/') ? 1 : 0));
                // simplifier url pour les ../
                fil_simplifie(adrfil->fil);
              } else
                ok = -1;        // erreur
            } else {            // erreur
              ok = -1;          // erreur URL
            }
          } else {              // erreur
            ok = -1;            // erreur URL
          }
        }
      } else {                  // chemin absolu
        // copier chemin directement
        strcatbuff(adrfil->fil, lien);
        fil_simplifie(adrfil->fil);
      }                         // *lien!='/'
    } else
      ok = -1;

  }                             // test news: etc.

  // case insensitive pour adresse
  {
    char *a = jump_identification(adrfil->adr);

    while(*a) {
      if ((*a >= 'A') && (*a <= 'Z'))
        *a += 'a' - 'A';
      a++;
    }
  }

  // IDNA / RFC 3492 (Punycode) handling for HTTP(s)
  if (!link_has_authority(adrfil->adr) || strfield(adrfil->adr, "https:")) {
    char *const a = jump_identification(adrfil->adr);
    // Non-ASCII characters (theorically forbidden, but browsers are lenient)
    if (!hts_isStringAscii(a, strlen(a))) {
      char *const idna = hts_convertStringUTF8ToIDNA(a, strlen(a));
      if (idna != NULL) {
        if (strlen(idna) < HTS_URLMAXSIZE) {
          strcpybuff(a, idna);
        }
        free(idna);
      }
    }
  }

  return ok;
}