C++ (Cpp) link_has_authorityの例

プログラミング言語: C++ (Cpp)

メソッド/関数: link_has_authority

hotexamples.comのコード掲載数: 3

C++ (Cpp) link_has_authority - 3件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたC++ (Cpp)のlink_has_authorityの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: htswizard.c プロジェクト: eatonmi/Crawler

static int hts_acceptlink_(httrackp* opt,
													int ptr,int lien_tot,lien_url** liens,
													char* adr,char* fil,
													char* tag, char* attribute,
													int* set_prio_to,
													int* just_test_it) 
{
  int forbidden_url=-1;
  int meme_adresse;
	int embedded_triggered = 0;
#define _FILTERS     (*opt->filters.filters)
#define _FILTERS_PTR (opt->filters.filptr)
#define _ROBOTS      ((robots_wizard*)opt->robotsptr)
  int may_set_prio_to=0;

  // -------------------- PHASE 0 --------------------

  /* Infos */
  if ((opt->debug>1) && (opt->log!=NULL)) {
    HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"wizard test begins: %s%s"LF,adr,fil);
    test_flush;
  }
  
  /* Already exists? Then, we know that we knew that this link had to be known */
  if (adr[0] != '\0'
    && fil[0] != '\0'
    && opt->hash != NULL
    && hash_read(opt->hash, adr, fil, 1, opt->urlhack) >= 0
    ) {
    return 0;  /* Yokai */
  }
  
  // -------------------- PRELUDE OF PHASE 3-BIS --------------------

	/* Built-in known tags (<img src=..>, ..) */
	if (forbidden_url != 0 && opt->nearlink && tag != NULL && attribute != NULL) {
		int i;
		for(i = 0 ; hts_detect_embed[i].tag != NULL ; i++) {
			if (cmp_token(tag, hts_detect_embed[i].tag)
				&& cmp_token(attribute, hts_detect_embed[i].attr)
				) 
			{
				embedded_triggered = 1;
				break;
			}
		}
	}


  // -------------------- PHASE 1 --------------------

  /* Doit-on traiter les non html? */
  if ((opt->getmode & 2)==0) {    // non on ne doit pas
    if (!ishtml(opt,fil)) {  // non il ne faut pas
      //adr[0]='\0';    // ne pas traiter ce lien, pas traiter
      forbidden_url=1;    // interdire récupération du lien
      if ((opt->debug>1) && (opt->log!=NULL)) {
        HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"non-html file ignored at %s : %s"LF,adr,fil);
        test_flush;
      }
      
    }
  }
  
  /* Niveau 1: ne pas parser suivant! */
  if (ptr>0) {
    if ( ( liens[ptr]->depth <= 0 ) || ( liens[ptr]->depth <= 1 && !embedded_triggered ) ) {
      forbidden_url=1;    // interdire récupération du lien
      if ((opt->debug>1) && (opt->log!=NULL)) {
        HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"file from too far level ignored at %s : %s"LF,adr,fil);
        test_flush;
      }
    }
  }

  /* en cas d'échec en phase 1, retour immédiat! */
  if (forbidden_url == 1) {
    return forbidden_url;
  }
  
  // -------------------- PHASE 2 --------------------

  // ------------------------------------------------------
  // doit-on traiter ce lien?.. vérifier droits de déplacement
  meme_adresse=strfield2(adr,urladr);
  if ((opt->debug>1) && (opt->log!=NULL)) {
    HTS_LOG(opt,LOG_DEBUG); 
    if (meme_adresse) 
      fprintf(opt->log,"Compare addresses: %s=%s"LF,adr,urladr);
    else
      fprintf(opt->log,"Compare addresses: %s!=%s"LF,adr,urladr);
    test_flush;
  }
  if (meme_adresse) {  // même adresse 
    {  // tester interdiction de descendre
      // MODIFIE : en cas de remontée puis de redescente, il se pouvait qu'on ne puisse pas atteindre certains fichiers
      // problème: si un fichier est virtuellement accessible via une page mais dont le lien est sur une autre *uniquement*..
      char BIGSTK tempo[HTS_URLMAXSIZE*2];
      char BIGSTK tempo2[HTS_URLMAXSIZE*2];
      tempo[0] = tempo2[0] = '\0';
      
      // note (up/down): on calcule à partir du lien primaire, ET du lien précédent.
      // ex: si on descend 2 fois on peut remonter 1 fois
      
      if (lienrelatif(tempo,fil,liens[liens[ptr]->premier]->fil)==0) {
        if (lienrelatif(tempo2,fil,liens[ptr]->fil)==0) {
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"build relative links to test: %s %s (with %s and %s)"LF,tempo,tempo2,liens[liens[ptr]->premier]->fil,liens[ptr]->fil);
            test_flush;
          }
          
          // si vient de primary, ne pas tester lienrelatif avec (car host "différent")
          /*if (liens[liens[ptr]->premier] == 0) {   // vient de primary
          }
          */
          
          // NEW: finalement OK, sauf pour les moved repérés par link_import
          // PROBLEME : annulé a cause d'un lien éventuel isolé accepté..qui entrainerait un miroir
          
          // (test même niveau (NOUVEAU à cause de certains problèmes de filtres non intégrés))
          // NEW
          if ( 
            (tempo[0]  != '\0' && tempo[1]  != '\0' && strchr(tempo+1,'/') == 0)
            ||
            (tempo2[0] != '\0' && tempo2[1] != '\0' && strchr(tempo2+1,'/') == 0) 
            ) {
            if (!liens[ptr]->link_import) {   // ne résulte pas d'un 'moved'
              forbidden_url=0;
              if ((opt->debug>1) && (opt->log!=NULL)) {
                HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"same level link authorized: %s%s"LF,adr,fil);
                test_flush;
             }
            }
          }
          
          // down
          if ( (strncmp(tempo,"../",3)) || (strncmp(tempo2,"../",3)))  {   // pas montée sinon ne nbous concerne pas
            int test1,test2;
            if (!strncmp(tempo,"../",3))
              test1=0;
            else
              test1 = (strchr(tempo +((*tempo =='/')?1:0),'/')!=NULL);
            if (!strncmp(tempo2,"../",3))
              test2=0;
            else
              test2 = (strchr(tempo2+((*tempo2=='/')?1:0),'/')!=NULL);
            if ( (test1) && (test2) ) {   // on ne peut que descendre
              if ((opt->seeker & 1)==0) {  // interdiction de descendre
                forbidden_url=1;
                if ((opt->debug>1) && (opt->log!=NULL)) {
                  HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"lower link canceled: %s%s"LF,adr,fil);
                  test_flush;
                }
              } else {    // autorisé à priori - NEW
                if (!liens[ptr]->link_import) {   // ne résulte pas d'un 'moved'
                  forbidden_url=0;
                  if ((opt->debug>1) && (opt->log!=NULL)) {
                    HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"lower link authorized: %s%s"LF,adr,fil);
                    test_flush;
                  }
                }
              }
            } else if ( (test1) || (test2) ) {   // on peut descendre pour accéder au lien
              if ((opt->seeker & 1)!=0) {  // on peut descendre - NEW
                if (!liens[ptr]->link_import) {   // ne résulte pas d'un 'moved'
                  forbidden_url=0;
                  if ((opt->debug>1) && (opt->log!=NULL)) {
                    HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"lower link authorized: %s%s"LF,adr,fil);
                    test_flush;
                  }
                }
              }
            }
          }
          
          
          // up
          if ( (!strncmp(tempo,"../",3)) && (!strncmp(tempo2,"../",3)) ) {    // impossible sans monter
            if ((opt->seeker & 2)==0) {  // interdiction de monter
              forbidden_url=1;
              if ((opt->debug>1) && (opt->log!=NULL)) {
                HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"upper link canceled: %s%s"LF,adr,fil);
                test_flush;
              }
            } else {       // autorisé à monter - NEW
              if (!liens[ptr]->link_import) {   // ne résulte pas d'un 'moved'
                forbidden_url=0;
                if ((opt->debug>1) && (opt->log!=NULL)) {
                  HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"upper link authorized: %s%s"LF,adr,fil);
                  test_flush;
                }
              }
            }
          } else if ( (!strncmp(tempo,"../",3)) || (!strncmp(tempo2,"../",3)) ) {    // Possible en montant
            if ((opt->seeker & 2)!=0) {  // autorisé à monter - NEW
              if (!liens[ptr]->link_import) {   // ne résulte pas d'un 'moved'
                forbidden_url=0;
                if ((opt->debug>1) && (opt->log!=NULL)) {
                  HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"upper link authorized: %s%s"LF,adr,fil);
                  test_flush;
                }
              }
            }  // sinon autorisé en descente
          }
          
          
        } else {
          if (opt->log) {
            fprintf(opt->log,"Error building relative link %s and %s"LF,fil,liens[ptr]->fil);
            test_flush;
          }
        }
      } else {
        if (opt->log) {
          fprintf(opt->log,"Error building relative link %s and %s"LF,fil,liens[liens[ptr]->premier]->fil);
          test_flush;
        }
      }
      
    }  // tester interdiction de descendre?
    
    {  // tester interdiction de monter
      char BIGSTK tempo[HTS_URLMAXSIZE*2];
      char BIGSTK tempo2[HTS_URLMAXSIZE*2];
      if (lienrelatif(tempo,fil,liens[liens[ptr]->premier]->fil)==0) {
        if (lienrelatif(tempo2,fil,liens[ptr]->fil)==0) {
        } else {
          if (opt->log) { 
            fprintf(opt->log,"Error building relative link %s and %s"LF,fil,liens[ptr]->fil);
            test_flush;
          }
          
        }
      } else {
        if (opt->log) { 
          fprintf(opt->log,"Error building relative link %s and %s"LF,fil,liens[liens[ptr]->premier]->fil);
          test_flush;
        }
        
      }
    }   // fin tester interdiction de monter
    
  } else {    // adresse différente, sortir?
    
    //if (!opt->wizard) {    // mode non wizard
    // doit-on traiter ce lien?.. vérifier droits de sortie
    switch((opt->travel & 255)) {
    case 0: 
      if (!opt->wizard)    // mode non wizard
        forbidden_url=1; break;    // interdicton de sortir au dela de l'adresse
    case 1: {              // sortie sur le même dom.xxx
      size_t i = strlen(adr)-1;
      size_t j = strlen(urladr)-1;
      while( (i>0) && (adr[i]!='.')) i--;
      while( (j>0) && (urladr[j]!='.')) j--;
      i--; j--;
      while( (i>0) && (adr[i]!='.')) i--;
      while( (j>0) && (urladr[j]!='.')) j--;
      if ((i>0) && (j>0)) {
        if (!strfield2(adr+i,urladr+j)) {   // !=
          if (!opt->wizard) {   // mode non wizard
            //printf("refused: %s\n",adr);
            forbidden_url=1;  // pas même domaine  
            if ((opt->debug>1) && (opt->log!=NULL)) {
              HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"foreign domain link canceled: %s%s"LF,adr,fil);
              test_flush;
            }
          }
          
        } else {
          if (opt->wizard) {   // mode wizard
            forbidden_url=0;  // même domaine  
            if ((opt->debug>1) && (opt->log!=NULL)) {
              HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"same domain link authorized: %s%s"LF,adr,fil);
              test_flush;
            }
          }
        }
        
      } else
        forbidden_url=1;
            } 
      break;  
    case 2: {                      // sortie sur le même .xxx
      size_t i = strlen(adr)-1;
      size_t j = strlen(urladr)-1;
      while( (i>0) && (adr[i]!='.')) i--;
      while( (j>0) && (urladr[j]!='.')) j--;
      if ((i>0) && (j>0)) {
        if (!strfield2(adr+i,urladr+j)) {   // !-
          if (!opt->wizard) {   // mode non wizard
            //printf("refused: %s\n",adr);
            forbidden_url=1;  // pas même .xx  
            if ((opt->debug>1) && (opt->log!=NULL)) {
              HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"foreign location link canceled: %s%s"LF,adr,fil);
              test_flush;
            }
          }
        } else {
          if (opt->wizard) {   // mode wizard
            forbidden_url=0;  // même domaine  
            if ((opt->debug>1) && (opt->log!=NULL)) {
              HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"same location link authorized: %s%s"LF,adr,fil);
              test_flush;
            }
          }
        }
      } else forbidden_url=1;     
            } 
      break;
    case 7:                 // everywhere!!
      if (opt->wizard) {   // mode wizard
        forbidden_url=0;
        break;
      }
    }  // switch
    
    // ANCIENNE POS -- récupérer les liens à côtés d'un lien (nearlink)
    
  }  // fin test adresse identique/différente

  // -------------------- PHASE 3 --------------------

  // récupérer les liens à côtés d'un lien (nearlink) (nvelle pos)
  if (forbidden_url != 0 && opt->nearlink) {
    if (!ishtml(opt,fil)) {  // non html
      //printf("ok %s%s\n",ad,fil);
      forbidden_url=0;    // autoriser
      may_set_prio_to=1+1; // set prio to 1 (parse but skip urls) if near is the winner
      if ((opt->debug>1) && (opt->log!=NULL)) {
        HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"near link authorized: %s%s"LF,adr,fil);
        test_flush;
      }
    }
  }

  // -------------------- PHASE 3-BIS --------------------

	/* Built-in known tags (<img src=..>, ..) */
	if (forbidden_url != 0 && embedded_triggered) {
		forbidden_url=0;    // autoriser
		may_set_prio_to=1+1; // set prio to 1 (parse but skip urls) if near is the winner
		if ((opt->debug>1) && (opt->log!=NULL)) {
			HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"near link authorized (friendly tag): %s%s"LF,adr,fil);
			test_flush;
		}
	}


  // -------------------- PHASE 4 --------------------
  
  // ------------------------------------------------------
  // Si wizard, il se peut qu'on autorise ou qu'on interdise 
  // un lien spécial avant même de tester sa position, sa hiérarchie etc.
  // peut court-circuiter le forbidden_url précédent
  if (opt->wizard) { // le wizard entre en action..
    //
    int question=1;         // poser une question                            
    int force_mirror=0;     // pour mirror links
    int filters_answer=0;   // décision prise par les filtres
    char BIGSTK l[HTS_URLMAXSIZE*2];
    char BIGSTK lfull[HTS_URLMAXSIZE*2];
    
    if (forbidden_url!=-1) question=0;  // pas de question, résolu
    
    // former URL complète du lien actuel
    strcpybuff(l,jump_identification(adr));
    if (*fil!='/') strcatbuff(l,"/");
    strcatbuff(l,fil);
    // full version (http://foo:[email protected]/bar.html)
    if (!link_has_authority(adr))
      strcpybuff(lfull,"http://");
    else
      lfull[0]='\0';
    strcatbuff(lfull,adr);
    if (*fil!='/') strcatbuff(lfull,"/");
    strcatbuff(lfull,fil);
    
    // tester filters (URLs autorisées ou interdites explicitement)
    
    // si lien primaire on saute le joker, on est pas lémur
    if (ptr==0) {  // lien primaire, autoriser
      question=1;    // la question sera résolue automatiquement
      forbidden_url=0;
      may_set_prio_to=0;    // clear may-set flag
    } else {
      // eternal depth first
      // vérifier récursivité extérieure
      if (opt->extdepth>0) {
        if ( /*question && */ (ptr>0) && (!force_mirror)) {
          // well, this is kinda a hak
          // we don't want to mirror EVERYTHING, and we have to decide where to stop
          // there is no way yet to tag "external" links, and therefore links that are
          // "weak" (authorized depth < external depth) are just not considered for external
          // hack
          if (liens[ptr]->depth > opt->extdepth) {
            // *set_prio_to = opt->extdepth + 1;
            *set_prio_to = 1 + (opt->extdepth);
            may_set_prio_to=0;  // clear may-set flag
            forbidden_url=0;    // autorisé
            question=0;         // résolution auto
            if ((opt->debug>1) && (opt->log!=NULL)) {
              if (question) {
                HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) ambiguous link accepted (external depth): link %s at %s%s"LF,l,urladr,urlfil);
              } else {
                HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) forced to accept link (external depth): link %s at %s%s"LF,l,urladr,urlfil);
              }
              test_flush;
            }
            
          }
        }
      }  
      
      // filters
      {
        int jok;
        char* mdepth="";
        // filters, 0=sait pas 1=ok -1=interdit
        {
          int jokDepth1=0,jokDepth2=0;
          int jok1=0,jok2=0;
          jok1  = fa_strjoker(/*url*/0, _FILTERS,*_FILTERS_PTR,lfull,NULL,NULL,&jokDepth1);
          jok2 =  fa_strjoker(/*url*/0, _FILTERS,*_FILTERS_PTR,l,    NULL,NULL,&jokDepth2);
          if (jok2 == 0) {      // #2 doesn't know
            jok = jok1;        // then, use #1
            mdepth = _FILTERS[jokDepth1];
          } else if (jok1 == 0) { // #1 doesn't know
            jok = jok2;        // then, use #2
            mdepth = _FILTERS[jokDepth2];
          } else if (jokDepth1 >= jokDepth2) { // #1 matching rule is "after" #2, then it is prioritary
            jok = jok1;
            mdepth = _FILTERS[jokDepth1];
          } else {                             // #2 matching rule is "after" #1, then it is prioritary
            jok = jok2;
            mdepth = _FILTERS[jokDepth2];
          }
        }
        
        if (jok == 1) {   // autorisé
          filters_answer=1;  // décision prise par les filtres
          question=0;    // ne pas poser de question, autorisé
          forbidden_url=0;  // URL autorisée
          may_set_prio_to=0;    // clear may-set flag
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) explicit authorized (%s) link: link %s at %s%s"LF,mdepth,l,urladr,urlfil);
            test_flush;
          }
        } else if (jok == -1) {  // forbidden
          filters_answer=1;  // décision prise par les filtres
          question=0;    // ne pas poser de question:
          forbidden_url=1;   // URL interdite
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) explicit forbidden (%s) link: link %s at %s%s"LF,mdepth,l,urladr,urlfil);
            test_flush;
          }
        }  // sinon on touche à rien
      }
    }
    
    // vérifier mode mirror links
    if (question) {
      if (opt->mirror_first_page) {    // mode mirror links
        if (liens[ptr]->precedent==0) {  // parent=primary!
          forbidden_url=0;    // autorisé
          may_set_prio_to=0;    // clear may-set flag
          question=1;         // résolution auto
          force_mirror=5;     // mirror (5)
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) explicit mirror link: link %s at %s%s"LF,l,urladr,urlfil);
            test_flush;
          }
        }
      }
    }
    
    // on doit poser la question.. peut on la poser?
    // (oui je sais quel preuve de délicatesse, merci merci)      
    if ((question) && (ptr>0) && (!force_mirror)) {
      if (opt->wizard==2) {    // éliminer tous les liens non répertoriés comme autorisés (ou inconnus)
        question=0;
        forbidden_url=1;
        if ((opt->debug>1) && (opt->log!=NULL)) {
          HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) ambiguous forbidden link: link %s at %s%s"LF,l,urladr,urlfil);
          test_flush;
        }
      }
    }
    
    // vérifier robots.txt
    if (opt->robots) {
      int r = checkrobots(_ROBOTS,adr,fil);
      if (r == -1) {    // interdiction
#if DEBUG_ROBOTS
        printf("robots.txt forbidden: %s%s\n",adr,fil);
#endif
        // question résolue, par les filtres, et mode robot non strict
        if ((!question) && (filters_answer) && (opt->robots == 1) && (forbidden_url!=1)) {
          r=0;    // annuler interdiction des robots
          if (!forbidden_url) {
            if ((opt->debug>1) && (opt->log!=NULL)) {
              HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Warning link followed against robots.txt: link %s at %s%s"LF,l,adr,fil);
              test_flush;
            }
          }
        }
        if (r == -1) {    // interdire
          forbidden_url=1;
          question=0;
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(robots.txt) forbidden link: link %s at %s%s"LF,l,adr,fil);
            test_flush;
          }
        }
      }
    }
    
    if (!question) {
      if ((opt->debug>1) && (opt->log!=NULL)) {
        if (!forbidden_url) {
          HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) shared foreign domain link: link %s at %s%s"LF,l,urladr,urlfil);
        } else {
          HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(wizard) cancelled foreign domain link: link %s at %s%s"LF,l,urladr,urlfil);
        }
        test_flush;
      }
#if BDEBUG==3
      printf("at %s in %s, wizard says: url %s ",urladr,urlfil,l);
      if (forbidden_url) printf("cancelled"); else printf(">SHARED<");
      printf("\n");
#endif 
    }

    /* en cas de question, ou lien primaire (enregistrer autorisations) */
    if (question || (ptr==0)) {
      const char* s;
      int n=0;
      
      // si primaire (plus bas) alors ...
      if ((ptr!=0) && (force_mirror==0)) {
        char BIGSTK tempo[HTS_URLMAXSIZE*2];
        tempo[0]='\0';
        strcatbuff(tempo,adr);
        strcatbuff(tempo,fil);
        s = RUN_CALLBACK1(opt, query3, tempo);
        if (strnotempty(s)==0)  // entrée
            n=0;
          else if (isdigit((unsigned char)*s))
            sscanf(s,"%d",&n);
          else {
            switch(*s) {
            case '*': n=-1; break;
            case '!': n=-999; {
              /*char *a;
              int i;                                    
              a=copie_de_adr-128;
              if (a<r.adr) a=r.adr;
              for(i=0;i<256;i++) {
                if (a==copie_de_adr) printf("\nHERE:\n");
                printf("%c",*a++);
              }
              printf("\n\n");
              */
                      }
              break;
            default: n=-999; printf("What did you say?\n"); break;
              
            } 
          }
        io_flush;
      } else {   // lien primaire: autoriser répertoire entier       
        if (!force_mirror) {
          if ((opt->seeker & 1)==0) {  // interdiction de descendre
            n=7;
          } else {
            n=5;   // autoriser miroir répertoires descendants (lien primaire)
          }
        } else   // forcer valeur (sub-wizard)
          n=force_mirror;
      }
      
      /* sanity check - reallocate filters HERE */
      if ((*_FILTERS_PTR) + 1 >= opt->maxfilter) {
        opt->maxfilter += HTS_FILTERSINC;
        if (filters_init(&_FILTERS, opt->maxfilter, HTS_FILTERSINC) == 0) {
          printf("PANIC! : Too many filters : >%d [%d]\n", (*_FILTERS_PTR),__LINE__);
          fflush(stdout);
          if (opt->log) {
            fprintf(opt->log,LF"Too many filters, giving up..(>%d)"LF, (*_FILTERS_PTR) );
            fprintf(opt->log,"To avoid that: use #F option for more filters (example: -#F5000)"LF);
            test_flush;
          }
          assertf("too many filters - giving up" == NULL);    // wild..
        }
      }

      // here we have enough room for a new filter if necessary
      switch(n) {
      case -1: // sauter tout le reste
        forbidden_url=1;
        opt->wizard=2;    // sauter tout le reste
        break;
      case 0:    // interdire les mêmes liens: adr/fil
        forbidden_url=1; 
        HT_INSERT_FILTERS0;    // insérer en 0
        strcpybuff(_FILTERS[0],"-");
        strcatbuff(_FILTERS[0],jump_identification(adr));
        if (*fil!='/') strcatbuff(_FILTERS[0],"/");
        strcatbuff(_FILTERS[0],fil);
        break;
        
      case 1: // éliminer répertoire entier et sous rép: adr/path/ *
        forbidden_url=1;
        {
          size_t i = strlen(fil)-1;
          while((fil[i]!='/') && (i>0)) i--;
          if (fil[i]=='/') {
            HT_INSERT_FILTERS0;    // insérer en 0
            strcpybuff(_FILTERS[0],"-");
            strcatbuff(_FILTERS[0],jump_identification(adr));
            if (*fil!='/') strcatbuff(_FILTERS[0],"/");
            strncatbuff(_FILTERS[0] ,fil,i);
            if (_FILTERS[0][strlen(_FILTERS[0])-1]!='/') 
              strcatbuff(_FILTERS[0],"/");
            strcatbuff(_FILTERS[0],"*");
          }
        }            
        
        // ** ...
        break;
        
      case 2:    // adresse adr*
        forbidden_url=1;
        HT_INSERT_FILTERS0;    // insérer en 0                                
        strcpybuff(_FILTERS[0],"-");
        strcatbuff(_FILTERS[0],jump_identification(adr));
        strcatbuff(_FILTERS[0],"*");
        break;
        
      case 3: // ** A FAIRE
        forbidden_url=1;
        /*
        {
        int i=strlen(adr)-1;
        while((adr[i]!='/') && (i>0)) i--;
        if (i>0) {
        
          }
          
      }*/
        
        break;
        //
      case 4:    // same link
        // PAS BESOIN!!
        /*HT_INSERT_FILTERS0;    // insérer en 0                                
        strcpybuff(_FILTERS[0],"+");
        strcatbuff(_FILTERS[0],adr);
        if (*fil!='/') strcatbuff(_FILTERS[0],"/");
        strcatbuff(_FILTERS[0],fil);*/
        
        
        // étant donné le renversement wizard/primary filter (les primary autorisent up/down ET interdisent)
        // il faut éviter d'un lien isolé effectue un miroir total..
        
        *set_prio_to = 0+1;    // niveau de récursion=0 (pas de miroir)
        
        break;
        
      case 5:    // autoriser répertoire entier et fils
        if ((opt->seeker & 2)==0) {  // interdiction de monter
          size_t i = strlen(fil)-1;
          while((fil[i]!='/') && (i>0)) i--;
          if (fil[i]=='/') {
            HT_INSERT_FILTERS0;    // insérer en 0                                
            strcpybuff(_FILTERS[0],"+");
            strcatbuff(_FILTERS[0],jump_identification(adr));
            if (*fil!='/') strcatbuff(_FILTERS[0],"/");
            strncatbuff(_FILTERS[0],fil,i+1);
            strcatbuff(_FILTERS[0],"*");
          }
        } else {    // autoriser domaine alors!!
          HT_INSERT_FILTERS0;    // insérer en 0                                strcpybuff(filters[filptr],"+");
          strcpybuff(_FILTERS[0],"+");
          strcatbuff(_FILTERS[0],jump_identification(adr));
          strcatbuff(_FILTERS[0],"*");
        }
        break;
        
      case 6:    // same domain
        HT_INSERT_FILTERS0;    // insérer en 0                                strcpybuff(filters[filptr],"+");
        strcpybuff(_FILTERS[0],"+");
        strcatbuff(_FILTERS[0],jump_identification(adr));
        strcatbuff(_FILTERS[0],"*");
        break;
        //
      case 7:    // autoriser ce répertoire
        {
          size_t i = strlen(fil)-1;
          while((fil[i]!='/') && (i>0)) i--;
          if (fil[i]=='/') {
            HT_INSERT_FILTERS0;    // insérer en 0                                
            strcpybuff(_FILTERS[0],"+");
            strcatbuff(_FILTERS[0],jump_identification(adr));
            if (*fil!='/') strcatbuff(_FILTERS[0],"/");
            strncatbuff(_FILTERS[0],fil,i+1);
            strcatbuff(_FILTERS[0],"*[file]");
          }
        }
        
        break;
        
      case 50:    // on fait rien
        break;
      }  // switch 
                              
    }  // test du wizard sur l'url
  }  // fin du test wizard..

  // -------------------- PHASE 5 --------------------

  // lien non autorisé, peut-on juste le tester?
  if (just_test_it) {
    if (forbidden_url==1) {
      if (opt->travel&256) {    // tester tout de même
        if (strfield(adr,"ftp://")==0
#if HTS_USEMMS
					&& strfield(adr,"mms://")==0
#endif
					) {    // PAS ftp!
          forbidden_url=1;    // oui oui toujours interdit (note: sert à rien car ==1 mais c pour comprendre)
          *just_test_it=1;     // mais on teste
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Testing link %s%s"LF,adr,fil);
          }
        }
      }
    }
    //adr[0]='\0';  // cancel
  }

  // -------------------- FINAL PHASE --------------------
  // Test if the "Near" test won
  if (may_set_prio_to && forbidden_url == 0) {
    *set_prio_to = may_set_prio_to;
  }

  return forbidden_url;
#undef _FILTERS
#undef _FILTERS_PTR
#undef _ROBOTS
}

コード例 #2

ファイルを表示

ファイル: htswizard.c プロジェクト: eatonmi/Crawler

// tester taille
int hts_testlinksize(httrackp* opt,
                     char* adr,char* fil,
                     LLint size) {
  int jok=0;
  if (size>=0) {
    char BIGSTK l[HTS_URLMAXSIZE*2];
    char BIGSTK lfull[HTS_URLMAXSIZE*2];
    if (size>=0) {
      LLint sz=size;
      int size_flag=0;
      
      // former URL complète du lien actuel
      strcpybuff(l,jump_identification(adr));
      if (*fil!='/') strcatbuff(l,"/");
      strcatbuff(l,fil);
      //
      if (!link_has_authority(adr))
        strcpybuff(lfull,"http://");
      else
        lfull[0]='\0';
      strcatbuff(lfull,adr);
      if (*fil!='/') strcatbuff(l,"/");
      strcatbuff(lfull,fil);
      
      // filters, 0=sait pas 1=ok -1=interdit
      {
        int jokDepth1=0,jokDepth2=0;
        int jok1=0,jok2=0;
        LLint sz1=size,sz2=size;
        int size_flag1=0,size_flag2=0;
        jok1  = fa_strjoker(/*url*/0, *opt->filters.filters,*opt->filters.filptr,lfull,&sz1,&size_flag1,&jokDepth1);
        jok2 =  fa_strjoker(/*url*/0, *opt->filters.filters,*opt->filters.filptr,l,    &sz2,&size_flag2,&jokDepth2);
        if (jok2 == 0) {      // #2 doesn't know
          jok = jok1;        // then, use #1
          sz = sz1;
          size_flag = size_flag1;
        } else if (jok1 == 0) {  // #1 doesn't know
          jok = jok2;        // then, use #2
          sz = sz2;
          size_flag = size_flag2;
        } else if (jokDepth1 >= jokDepth2) { // #1 matching rule is "after" #2, then it is prioritary
          jok = jok1;
          sz = sz1;
          size_flag = size_flag1;
        } else {                              // #2 matching rule is "after" #1, then it is prioritary
          jok = jok2;
          sz = sz2;
          size_flag = size_flag2;
        } 
      }
      

      // log
      if (jok==1) {
        if ((opt->debug>1) && (opt->log!=NULL)) {
          HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"File confirmed (size test): %s%s ("LLintP")"LF,adr,fil,(LLint)(size));
        }
      } else if (jok==-1) {
        if (size_flag) {        /* interdit à cause de la taille */
          if ((opt->debug>1) && (opt->log!=NULL)) {
            HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"File cancelled due to its size: %s%s ("LLintP", limit: "LLintP")"LF,adr,fil,(LLint)(size),(LLint)(sz));
          }
        } else {
          jok=1;
        }
      }
    }
  }
  return jok;
}

コード例 #3

ファイルを表示

ファイル: htstools.c プロジェクト: AmesianX/HackingStuff

// forme à partir d'un lien et du contexte (origin_fil et origin_adr d'où il est tiré) adr et fil
// [adr et fil sont des buffers de 1ko]
// 0 : ok
// -1 : erreur
// -2 : protocole non supporté (ftp)
int ident_url_relatif(const char *lien, const char *origin_adr,
                      const char *origin_fil,
                      lien_adrfil* const adrfil) {
  int ok = 0;
  int scheme = 0;

  assertf(adrfil != NULL);

  adrfil->adr[0] = '\0';
  adrfil->fil[0] = '\0';                //effacer buffers

  // lien non vide!
  if (strnotempty(lien) == 0)
    return -1;                  // erreur!

  // Scheme?
  {
    const char *a = lien;

    while(isalpha((unsigned char) *a))
      a++;
    if (*a == ':')
      scheme = 1;
  }

  // filtrer les parazites (mailto & cie)
  // scheme+authority (//)
  if ((strfield(lien, "http://"))       // scheme+//
      || (strfield(lien, "file://"))    // scheme+//
      || (strncmp(lien, "//", 2) == 0)  // // sans scheme (-> default)
    ) {
    if (ident_url_absolute(lien, adrfil) == -1) {
      ok = -1;                  // erreur URL
    }
  } else if (strfield(lien, "ftp://")) {
    // Note: ftp:foobar.gif is not valid
    if (ftp_available()) {      // ftp supporté
      if (ident_url_absolute(lien, adrfil) == -1) {
        ok = -1;                // erreur URL
      }
    } else {
      ok = -2;                  // non supporté
    }
#if HTS_USEOPENSSL
  } else if (strfield(lien, "https://")) {
    // Note: ftp:foobar.gif is not valid
    if (ident_url_absolute(lien, adrfil) == -1) {
      ok = -1;                // erreur URL
    }
#endif
  } else if ((scheme) && ((!strfield(lien, "http:"))
                          && (!strfield(lien, "https:"))
                          && (!strfield(lien, "ftp:"))
             )) {
    ok = -1;                    // unknown scheme
  } else {                      // c'est un lien relatif
    // On forme l'URL complète à partie de l'url actuelle
    // et du chemin actuel si besoin est.

    // sanity check
    if (origin_adr == NULL || origin_fil == NULL 
      || *origin_adr == '\0' || *origin_fil == '\0') {
      return -1;
    }

    // copier adresse
    if (((int) strlen(origin_adr) < HTS_URLMAXSIZE)
        && ((int) strlen(origin_fil) < HTS_URLMAXSIZE)
        && ((int) strlen(lien) < HTS_URLMAXSIZE)) {

      /* patch scheme if necessary */
      if (strfield(lien, "http:")) {
        lien += 5;
        strcpybuff(adrfil->adr, jump_protocol_const(origin_adr));     // même adresse ; protocole vide (http)
      } else if (strfield(lien, "https:")) {
        lien += 6;
        strcpybuff(adrfil->adr, "https://");    // même adresse forcée en https
        strcatbuff(adrfil->adr, jump_protocol_const(origin_adr));
      } else if (strfield(lien, "ftp:")) {
        lien += 4;
        strcpybuff(adrfil->adr, "ftp://");      // même adresse forcée en ftp
        strcatbuff(adrfil->adr, jump_protocol_const(origin_adr));
      } else {
        strcpybuff(adrfil->adr, origin_adr);    // même adresse ; et même éventuel protocole
      }

      if (*lien != '/') {       // sinon c'est un lien absolu
        if (*lien == '\0') {
          strcpybuff(adrfil->fil, origin_fil);
        } else if (*lien == '?') {      // example: a href="?page=2"
          char *a;

          strcpybuff(adrfil->fil, origin_fil);
          a = strchr(adrfil->fil, '?');
          if (a)
            *a = '\0';
          strcatbuff(adrfil->fil, lien);
        } else {
          const char *a = strchr(origin_fil, '?');

          if (a == NULL)
            a = origin_fil + strlen(origin_fil);
          while((*a != '/') && (a > origin_fil))
            a--;
          if (*a == '/') {      // ok on a un '/'
            if ((((int) (a - origin_fil)) + 1 + strlen(lien)) < HTS_URLMAXSIZE) {
              // copier chemin
              strncpy(adrfil->fil, origin_fil, ((int) (a - origin_fil)) + 1);
              *(adrfil->fil + ((int) (a - origin_fil)) + 1) = '\0';

              // copier chemin relatif
              if (((int) strlen(adrfil->fil) + (int) strlen(lien)) < HTS_URLMAXSIZE) {
                strcatbuff(adrfil->fil, lien + ((*lien == '/') ? 1 : 0));
                // simplifier url pour les ../
                fil_simplifie(adrfil->fil);
              } else
                ok = -1;        // erreur
            } else {            // erreur
              ok = -1;          // erreur URL
            }
          } else {              // erreur
            ok = -1;            // erreur URL
          }
        }
      } else {                  // chemin absolu
        // copier chemin directement
        strcatbuff(adrfil->fil, lien);
        fil_simplifie(adrfil->fil);
      }                         // *lien!='/'
    } else
      ok = -1;

  }                             // test news: etc.

  // case insensitive pour adresse
  {
    char *a = jump_identification(adrfil->adr);

    while(*a) {
      if ((*a >= 'A') && (*a <= 'Z'))
        *a += 'a' - 'A';
      a++;
    }
  }

  // IDNA / RFC 3492 (Punycode) handling for HTTP(s)
  if (!link_has_authority(adrfil->adr) || strfield(adrfil->adr, "https:")) {
    char *const a = jump_identification(adrfil->adr);
    // Non-ASCII characters (theorically forbidden, but browsers are lenient)
    if (!hts_isStringAscii(a, strlen(a))) {
      char *const idna = hts_convertStringUTF8ToIDNA(a, strlen(a));
      if (idna != NULL) {
        if (strlen(idna) < HTS_URLMAXSIZE) {
          strcpybuff(a, idna);
        }
        free(idna);
      }
    }
  }

  return ok;
}