/*Classifies a single byte for "extended" text.
  Any byte with the high bit set (>=128) is accepted outright; every other
   value is handed to text_is_ascii() and its verdict is returned unchanged.*/
static int text_is_extended(u_char c)
{
  return (c >= 128) ? 1 : text_is_ascii(c);
}
/*Classifies a single byte for Latin-1 text.
  Bytes of 160 and above are accepted outright; every other value is handed to
   text_is_ascii() and its verdict is returned unchanged, so bytes 128...159
   are only accepted if text_is_ascii() accepts them.*/
static int text_is_latin1(u_char c)
{
  return (c >= 160) ? 1 : text_is_ascii(c);
}
/*Converts the decoded QR codes in _qrlist into text and registers the results
   with the image scanner.
  Codes that carry structured-append (S-A) information are grouped by
   (sa_size, sa_parity) and concatenated in sa_index order into one buffer.
  One symbol is allocated per group member; when a group has more than one
   slot, a "virtual" container symbol owning the whole buffer is created and
   the member symbols point into it.
  Missing group members are represented by ZBAR_PARTIAL placeholder symbols.
  _qrlist: The decoded codes; only qrdata and nqrdata are read.
  iscn:    The scanner used to allocate, add and recycle symbols.
  img:     Only width and height are read, to seed the container's bounding
            box.
  Return: The value of ntext.
          NOTE(review): ntext is initialised to 0 and never incremented in
           this function, so the return value is always 0; confirm whether
           callers expect a count.
  On allocation failure the affected group (or the whole call, if the mark
   array cannot be allocated) is skipped silently.*/
int qr_code_data_list_extract_text(const qr_code_data_list *_qrlist,
                                   zbar_image_scanner_t *iscn,
                                   zbar_image_t *img)
{
#if HAVE_ICONV
  iconv_t              sjis_cd;
  iconv_t              utf8_cd;
  iconv_t              latin1_cd;
#endif
  const qr_code_data  *qrdata;
  int                  nqrdata;
  unsigned char       *mark;
  int                  ntext;
  int                  i;
  qrdata=_qrlist->qrdata;
  nqrdata=_qrlist->nqrdata;
  ntext=0;
  mark=(unsigned char *)calloc(nqrdata,sizeof(*mark));
  /*Either there is nothing to do (nqrdata==0 may legitimately yield NULL) or
     we are out of memory; in both cases there is nothing we can decode.*/
  if(mark==NULL)return ntext;
#if HAVE_ICONV
  /*This is the encoding the standard says is the default.*/
  latin1_cd=iconv_open("UTF-8","ISO8859-1");
  /*But this one is often used, as well.*/
  sjis_cd=iconv_open("UTF-8","SJIS");
  /*This is a trivial conversion just to check validity without extra code.*/
  utf8_cd=iconv_open("UTF-8","UTF-8");
#endif
  for(i=0;i<nqrdata;i++)if(!mark[i]){
    const qr_code_data       *qrdataj;
    const qr_code_data_entry *entry;
#if HAVE_ICONV
    iconv_t                   enc_list[3];
    iconv_t                   eci_cd;
#endif
    int                       sa[16];
    int                       sa_size;
    char                     *sa_text;
    size_t                    sa_ntext;
    size_t                    sa_ctext;
    int                       fnc1;
    int                       eci;
    int                       err;
    int                       j;
    int                       k;
    /*Step 0: Collect the other QR codes belonging to this S-A group.
      A group size that does not fit in sa[] cannot be indexed safely, so
       such a code is treated like a stand-alone code instead.*/
    sa_size=qrdata[i].sa_size;
    if(sa_size>0&&sa_size<=(int)(sizeof(sa)/sizeof(*sa))){
      unsigned sa_parity;
      sa_parity=qrdata[i].sa_parity;
      for(j=0;j<sa_size;j++)sa[j]=-1;
      for(j=i;j<nqrdata;j++)if(!mark[j]){
        /*TODO: We could also match version, ECC level, etc. if size and
           parity alone are too ambiguous.*/
        /*The index is range-checked before use: an index at or beyond
           sa_size would read a slot of sa[] that was never initialised.*/
        if(qrdata[j].sa_size==sa_size&&qrdata[j].sa_parity==sa_parity&&
         (unsigned)qrdata[j].sa_index<(unsigned)sa_size&&
         sa[qrdata[j].sa_index]<0){
          sa[qrdata[j].sa_index]=j;
          mark[j]=1;
        }
      }
      /*TODO: If the S-A group is complete, check the parity.*/
    }
    else{
      sa[0]=i;
      sa_size=1;
    }
    sa_ctext=0;
    fnc1=0;
    /*Step 1: Detect FNC1 markers and estimate the required buffer size.*/
    for(j=0;j<sa_size;j++){
      if(sa[j]<0){
        /*Step 2 writes one '\0' break marker per run of missing segments.
          Budget a byte for every missing segment (an over-estimate) so the
           markers can never push sa_ntext past sa_ctext.*/
        sa_ctext++;
        continue;
      }
      qrdataj=qrdata+sa[j];
      for(k=0;k<qrdataj->nentries;k++){
        int shift;
        entry=qrdataj->entries+k;
        shift=0;
        switch(entry->mode){
          /*FNC1 applies to the entire code and ignores subsequent markers.*/
          case QR_MODE_FNC1_1ST:
          case QR_MODE_FNC1_2ND:fnc1=1;break;
          /*2 SJIS bytes will be at most 4 UTF-8 bytes.*/
          case QR_MODE_KANJI:shift++;
          /*fallthrough*/
          /*We assume at most 4 UTF-8 bytes per input byte.
            I believe this is true for all the encodings we actually use.*/
          case QR_MODE_BYTE:shift++;
          /*fallthrough*/
          default:{
            /*The remaining two modes are already valid UTF-8.*/
            if(QR_MODE_HAS_DATA(entry->mode)){
              sa_ctext+=entry->payload.data.len<<shift;
            }
          }break;
        }
      }
    }
    /*Step 2: Convert the entries.*/
    sa_text=(char *)malloc((sa_ctext+1)*sizeof(*sa_text));
    /*Out of memory: give up on this group, but keep trying the others.*/
    if(sa_text==NULL)continue;
    sa_ntext=0;
    eci=-1;
#if HAVE_ICONV
    enc_list[0]=sjis_cd;
    enc_list[1]=latin1_cd;
    enc_list[2]=utf8_cd;
    eci_cd=(iconv_t)-1;
#endif
    err=0;
    zbar_symbol_t *syms = NULL, **sym = &syms;
    for(j = 0; j < sa_size && !err; j++, sym = &(*sym)->next) {
      *sym = _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0);
      /*Until the fixup pass below, datalen holds the symbol's start offset
         inside sa_text.*/
      (*sym)->datalen = sa_ntext;
      if(sa[j]<0){
        /* generic placeholder for unfinished results */
        (*sym)->type = ZBAR_PARTIAL;
        /*Skip all contiguous missing segments.*/
        for(j++;j<sa_size&&sa[j]<0;j++);
        /*If there aren't any more, stop.*/
        if(j>=sa_size)break;
        /*Never let the marker eat the byte reserved for the final
           terminator.*/
        if(sa_ntext>=sa_ctext){
          err=1;
          break;
        }
        /* mark break in data */
        sa_text[sa_ntext++]='\0';
        /*NOTE(review): this updates the placeholder's offset, not the offset
           of the symbol allocated just below, whose datalen is left at
           whatever _zbar_image_scanner_alloc_sym() set.
          The fixup pass asserts that offsets strictly increase; confirm this
           ordering is intended.*/
        (*sym)->datalen = sa_ntext;
        /* advance to next symbol */
        sym = &(*sym)->next;
        *sym = _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0);
      }
      qrdataj=qrdata+sa[j];
      /* expose bounding box */
      sym_add_point(*sym, qrdataj->bbox[0][0], qrdataj->bbox[0][1]);
      sym_add_point(*sym, qrdataj->bbox[2][0], qrdataj->bbox[2][1]);
      sym_add_point(*sym, qrdataj->bbox[3][0], qrdataj->bbox[3][1]);
      sym_add_point(*sym, qrdataj->bbox[1][0], qrdataj->bbox[1][1]);
      /*Debug trace on stdout (kept as-is).*/
      printf("Decoding %i symbols.\n", qrdataj->nentries);
      for(k=0;k<qrdataj->nentries&&!err;k++){
        size_t  inleft;
        char   *in;
        size_t  outleft;
        char   *out;
        entry=qrdataj->entries+k;
        printf("Symbol %i mode: %i\n", k, entry->mode);
        switch(entry->mode){
          case QR_MODE_NUM:{
            if(sa_ctext-sa_ntext>=(size_t)entry->payload.data.len){
              memcpy(sa_text+sa_ntext,entry->payload.data.buf,
               entry->payload.data.len*sizeof(*sa_text));
              sa_ntext+=entry->payload.data.len;
            }
            else err=1;
          }break;
          case QR_MODE_ALNUM:{
            char *p;
            in=(char *)entry->payload.data.buf;
            inleft=entry->payload.data.len;
            /*FNC1 uses '%' as an escape character.*/
            if(fnc1)for(;;){
              size_t plen;
              char   c;
              p=memchr(in,'%',inleft*sizeof(*in));
              if(p==NULL)break;
              plen=p-in;
              /*Leaving the loop with p!=NULL signals "out of space" below.*/
              if(sa_ctext-sa_ntext<plen+1)break;
              memcpy(sa_text+sa_ntext,in,plen*sizeof(*in));
              sa_ntext+=plen;
              /*Two '%'s is a literal '%'*/
              if(plen+1<inleft&&p[1]=='%'){
                c='%';
                plen++;
                p++;
              }
              /*One '%' is the ASCII group separator.*/
              else c=0x1D;
              sa_text[sa_ntext++]=c;
              inleft-=plen+1;
              in=p+1;
            }
            else p=NULL;
            if(p!=NULL||sa_ctext-sa_ntext<inleft)err=1;
            else{
              memcpy(sa_text+sa_ntext,in,inleft*sizeof(*sa_text));
              sa_ntext+=inleft;
            }
          }break;
          /*TODO: This will not handle a multi-byte sequence split between
             multiple data blocks.
            Does such a thing occur?
            Is it allowed?
            It requires copying buffers around to handle correctly.*/
          case QR_MODE_BYTE:{
            in=(char *)entry->payload.data.buf;
            inleft=entry->payload.data.len;
#if !HAVE_ICONV
            out=sa_text+sa_ntext;
            outleft=sa_ctext-sa_ntext;
            /*Copy the raw bytes.
              memcpy() rather than strncpy(): byte-mode payloads may contain
               embedded NULs, and strncpy() would stop at the first one and
               zero-fill the remainder.*/
            if(outleft<inleft)err=1;
            else{
              memcpy(out,in,inleft*sizeof(*in));
              sa_ntext+=inleft;
              printf("Copied %u bytes.\n", (unsigned)inleft);
            }
#else
            out=sa_text+sa_ntext;
            outleft=sa_ctext-sa_ntext;
            /*If we have no specified encoding, attempt to auto-detect it.*/
            if(eci<0){
              int ei;
              /*First check for the UTF-8 BOM.*/
              if(inleft>=3&&
               in[0]==(char)0xEF&&in[1]==(char)0xBB&&in[2]==(char)0xBF){
                in+=3;
                inleft-=3;
                /*Actually try converting (to check validity).*/
                err=utf8_cd==(iconv_t)-1||
                 iconv(utf8_cd,&in,&inleft,&out,&outleft)==(size_t)-1;
                if(!err){
                  sa_ntext=out-sa_text;
                  enc_list_mtf(enc_list,utf8_cd);
                  continue;
                }
                in=(char *)entry->payload.data.buf;
                inleft=entry->payload.data.len;
                out=sa_text+sa_ntext;
                outleft=sa_ctext-sa_ntext;
              }
              /*If the text is 8-bit clean, prefer UTF-8 over SJIS, since
                 SJIS will corrupt the backslashes used for DoCoMo formats.*/
              /*NOTE(review): text_is_ascii() and text_is_latin1() are called
                 here with (buffer, length), but the helpers visible in this
                 file take a single u_char; confirm this branch builds when
                 HAVE_ICONV is enabled.*/
              else if(text_is_ascii((unsigned char *)in,inleft)){
                enc_list_mtf(enc_list,utf8_cd);
              }
              /*Try our list of encodings.*/
              for(ei=0;ei<3;ei++)if(enc_list[ei]!=(iconv_t)-1){
                /*According to the standard, ISO/IEC 8859-1 (one hyphen) is
                   supposed to be used, but reality is not always so.
                  It's got an invalid range that is used often with SJIS
                   and UTF-8, though, which makes detection easier.
                  However, iconv() does not properly reject characters in
                   those ranges, since ISO-8859-1 (two hyphens) defines a
                   number of seldom-used control code characters there.
                  So if we see any of those characters, move this
                   conversion to the end of the list.*/
                if(ei<2&&enc_list[ei]==latin1_cd&&
                 !text_is_latin1((unsigned char *)in,inleft)){
                  int ej;
                  for(ej=ei+1;ej<3;ej++)enc_list[ej-1]=enc_list[ej];
                  enc_list[2]=latin1_cd;
                }
                err=iconv(enc_list[ei],&in,&inleft,&out,&outleft)==(size_t)-1;
                if(!err){
                  sa_ntext=out-sa_text;
                  enc_list_mtf(enc_list,enc_list[ei]);
                  break;
                }
                in=(char *)entry->payload.data.buf;
                inleft=entry->payload.data.len;
                out=sa_text+sa_ntext;
                outleft=sa_ctext-sa_ntext;
              }
            }
            /*We were actually given a character set; use it.*/
            else{
              err=eci_cd==(iconv_t)-1||
               iconv(eci_cd,&in,&inleft,&out,&outleft)==(size_t)-1;
              if(!err)sa_ntext=out-sa_text;
            }
#endif
          }break;
          /*Kanji mode always uses SJIS.*/
          case QR_MODE_KANJI:{
            in=(char *)entry->payload.data.buf;
            inleft=entry->payload.data.len;
#if HAVE_ICONV
            out=sa_text+sa_ntext;
            outleft=sa_ctext-sa_ntext;
            err=sjis_cd==(iconv_t)-1||
             iconv(sjis_cd,&in,&inleft,&out,&outleft)==(size_t)-1;
            if(!err)sa_ntext=out-sa_text;
#endif
          }break;
#if HAVE_ICONV
          /*Check to see if a character set was specified.*/
          case QR_MODE_ECI:{
            const char *enc;
            char        buf[16];
            unsigned    cur_eci;
            cur_eci=entry->payload.eci;
            if(cur_eci<=QR_ECI_ISO8859_16&&cur_eci!=14){
              if(cur_eci!=QR_ECI_GLI0&&cur_eci!=QR_ECI_CP437){
                sprintf(buf,"ISO8859-%i",(int)(QR_MAXI(cur_eci,3)-2));
                enc=buf;
              }
              /*Note that CP437 requires an iconv compiled with
                 --enable-extra-encodings, and thus may not be available.*/
              else enc="CP437";
            }
            else if(cur_eci==QR_ECI_SJIS)enc="SJIS";
            /*Don't know what this ECI code specifies, but not an encoding that
               we recognize.*/
            else continue;
            eci=cur_eci;
            /*Release the converter of any earlier ECI entry before replacing
               it, otherwise its descriptor is leaked.*/
            if(eci_cd!=(iconv_t)-1)iconv_close(eci_cd);
            eci_cd=iconv_open("UTF-8",enc);
          }break;
#endif
          /*Silence stupid compiler warnings.*/
          default:break;
        }
      }
      /*If eci should be reset between codes, do so.*/
      if(eci<=QR_ECI_GLI1){
        eci=-1;
#if HAVE_ICONV
        if(eci_cd!=(iconv_t)-1){
          iconv_close(eci_cd);
          /*Forget the descriptor so it is neither reused nor closed a second
             time after the loop.*/
          eci_cd=(iconv_t)-1;
        }
#endif
      }
    }
#if HAVE_ICONV
    if(eci_cd!=(iconv_t)-1)iconv_close(eci_cd);
#endif
    if(!err){
      sa_text[sa_ntext++]='\0';
      if(sa_ctext+1>sa_ntext){
        /*Trim the over-estimate.
          If shrinking fails the original buffer is still valid, so just keep
           using it.*/
        char *shrunk;
        shrunk=(char *)realloc(sa_text,sa_ntext*sizeof(*sa_text));
        if(shrunk!=NULL)sa_text=shrunk;
      }
      zbar_symbol_t *sa_sym;
      if(sa_size == 1)
        sa_sym = syms;
      else {
        /* create "virtual" container symbol for composite result */
        sa_sym = _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0);
        sa_sym->syms = _zbar_symbol_set_create();
        sa_sym->syms->head = syms;
        /* cheap out w/axis aligned bbox for now */
        int xmin = img->width, xmax = -2;
        int ymin = img->height, ymax = -2;
        /* fixup data references */
        for(; syms; syms = syms->next) {
          _zbar_symbol_refcnt(syms, 1);
          if(syms->type == ZBAR_PARTIAL)
            sa_sym->type = ZBAR_PARTIAL;
          else
            for(j = 0; j < (int)syms->npts; j++) {
              int u = syms->pts[j].x;
              if(xmin >= u) xmin = u - 1;
              if(xmax <= u) xmax = u + 1;
              u = syms->pts[j].y;
              if(ymin >= u) ymin = u - 1;
              if(ymax <= u) ymax = u + 1;
            }
          /*Turn the stored start offset into a pointer into the shared
             buffer and a length.*/
          syms->data = sa_text + syms->datalen;
          int next = (syms->next) ? syms->next->datalen : sa_ntext;
          assert(next > (int)syms->datalen);
          syms->datalen = next - syms->datalen - 1;
        }
        /*xmax stays at -2 when no member contributed any points.*/
        if(xmax >= -1) {
          sym_add_point(sa_sym, xmin, ymin);
          sym_add_point(sa_sym, xmin, ymax);
          sym_add_point(sa_sym, xmax, ymax);
          sym_add_point(sa_sym, xmax, ymin);
        }
      }
      /*The buffer is handed over to the (container) symbol here.*/
      sa_sym->data = sa_text;
      sa_sym->data_alloc = sa_ntext;
      sa_sym->datalen = sa_ntext - 1;
      _zbar_image_scanner_add_sym(iscn, sa_sym);
    }
    else {
      _zbar_image_scanner_recycle_syms(iscn, syms);
      free(sa_text);
    }
  }
#if HAVE_ICONV
  if(utf8_cd!=(iconv_t)-1)iconv_close(utf8_cd);
  if(sjis_cd!=(iconv_t)-1)iconv_close(sjis_cd);
  if(latin1_cd!=(iconv_t)-1)iconv_close(latin1_cd);
#endif
  free(mark);
  return ntext;
}