void QgsDelimitedTextProvider::scanFile( bool buildIndexes ) { QStringList messages; // assume the layer is invalid until proven otherwise mLayerValid = false; mValid = false; mRescanRequired = false; clearInvalidLines(); // Initiallize indexes resetIndexes(); bool buildSpatialIndex = buildIndexes && nullptr != mSpatialIndex; // No point building a subset index if there is no geometry, as all // records will be included. bool buildSubsetIndex = buildIndexes && mBuildSubsetIndex && mGeomRep != GeomNone; if ( ! mFile->isValid() ) { // uri is invalid so the layer must be too... messages.append( tr( "File cannot be opened or delimiter parameters are not valid" ) ); reportErrors( messages ); QgsDebugMsg( "Delimited text source invalid - filename or delimiter parameters" ); return; } // Open the file and get number of rows, etc. We assume that the // file has a header row and process accordingly. Caller should make // sure that the delimited file is properly formed. if ( mGeomRep == GeomAsWkt ) { mWktFieldIndex = mFile->fieldIndex( mWktFieldName ); if ( mWktFieldIndex < 0 ) { messages.append( tr( "%0 field %1 is not defined in delimited text file" ).arg( QStringLiteral( "Wkt" ), mWktFieldName ) ); } } else if ( mGeomRep == GeomAsXy ) { mXFieldIndex = mFile->fieldIndex( mXFieldName ); mYFieldIndex = mFile->fieldIndex( mYFieldName ); if ( mXFieldIndex < 0 ) { messages.append( tr( "%0 field %1 is not defined in delimited text file" ).arg( QStringLiteral( "X" ), mWktFieldName ) ); } if ( mYFieldIndex < 0 ) { messages.append( tr( "%0 field %1 is not defined in delimited text file" ).arg( QStringLiteral( "Y" ), mWktFieldName ) ); } } if ( !messages.isEmpty() ) { reportErrors( messages ); QgsDebugMsg( "Delimited text source invalid - missing geometry fields" ); return; } // Scan the entire file to determine // 1) the number of fields (this is handled by QgsDelimitedTextFile mFile // 2) the number of valid features. Note that the selection of valid features // should match the code in QgsDelimitedTextFeatureIterator // 3) the geometric extents of the layer // 4) the type of each field // // Also build subset and spatial indexes. QStringList parts; long nEmptyRecords = 0; long nBadFormatRecords = 0; long nIncompatibleGeometry = 0; long nInvalidGeometry = 0; long nEmptyGeometry = 0; mNumberFeatures = 0; mExtent = QgsRectangle(); QList<bool> isEmpty; QList<bool> couldBeInt; QList<bool> couldBeLongLong; QList<bool> couldBeDouble; bool foundFirstGeometry = false; while ( true ) { QgsDelimitedTextFile::Status status = mFile->nextRecord( parts ); if ( status == QgsDelimitedTextFile::RecordEOF ) break; if ( status != QgsDelimitedTextFile::RecordOk ) { nBadFormatRecords++; recordInvalidLine( tr( "Invalid record format at line %1" ) ); continue; } // Skip over empty records if ( recordIsEmpty( parts ) ) { nEmptyRecords++; continue; } // Check geometries are valid bool geomValid = true; if ( mGeomRep == GeomAsWkt ) { if ( mWktFieldIndex >= parts.size() || parts[mWktFieldIndex].isEmpty() ) { nEmptyGeometry++; mNumberFeatures++; } else { // Get the wkt - confirm it is valid, get the type, and // if compatible with the rest of file, add to the extents QString sWkt = parts[mWktFieldIndex]; QgsGeometry geom; if ( !mWktHasPrefix && sWkt.indexOf( sWktPrefixRegexp ) >= 0 ) mWktHasPrefix = true; geom = geomFromWkt( sWkt, mWktHasPrefix ); if ( !geom.isNull() ) { QgsWkbTypes::Type type = geom.wkbType(); if ( type != QgsWkbTypes::NoGeometry ) { if ( mGeometryType == QgsWkbTypes::UnknownGeometry || geom.type() == mGeometryType ) { mGeometryType = geom.type(); if ( !foundFirstGeometry ) { mNumberFeatures++; mWkbType = type; mExtent = geom.boundingBox(); foundFirstGeometry = true; } else { mNumberFeatures++; if ( geom.isMultipart() ) mWkbType = type; QgsRectangle bbox( geom.boundingBox() ); mExtent.combineExtentWith( bbox ); } if ( buildSpatialIndex ) { QgsFeature f; f.setId( mFile->recordId() ); f.setGeometry( geom ); mSpatialIndex->insertFeature( f ); } } else { nIncompatibleGeometry++; geomValid = false; } } } else { geomValid = false; nInvalidGeometry++; recordInvalidLine( tr( "Invalid WKT at line %1" ) ); } } } else if ( mGeomRep == GeomAsXy ) { // Get the x and y values, first checking to make sure they // aren't null. QString sX = mXFieldIndex < parts.size() ? parts[mXFieldIndex] : QString(); QString sY = mYFieldIndex < parts.size() ? parts[mYFieldIndex] : QString(); if ( sX.isEmpty() && sY.isEmpty() ) { nEmptyGeometry++; mNumberFeatures++; } else { QgsPointXY pt; bool ok = pointFromXY( sX, sY, pt, mDecimalPoint, mXyDms ); if ( ok ) { if ( foundFirstGeometry ) { mExtent.combineExtentWith( pt.x(), pt.y() ); } else { // Extent for the first point is just the first point mExtent.set( pt.x(), pt.y(), pt.x(), pt.y() ); mWkbType = QgsWkbTypes::Point; mGeometryType = QgsWkbTypes::PointGeometry; foundFirstGeometry = true; } mNumberFeatures++; if ( buildSpatialIndex && std::isfinite( pt.x() ) && std::isfinite( pt.y() ) ) { QgsFeature f; f.setId( mFile->recordId() ); f.setGeometry( QgsGeometry::fromPointXY( pt ) ); mSpatialIndex->insertFeature( f ); } } else { geomValid = false; nInvalidGeometry++; recordInvalidLine( tr( "Invalid X or Y fields at line %1" ) ); } } } else { mWkbType = QgsWkbTypes::NoGeometry; mNumberFeatures++; } if ( !geomValid ) continue; if ( buildSubsetIndex ) mSubsetIndex.append( mFile->recordId() ); // If we are going to use this record, then assess the potential types of each column for ( int i = 0; i < parts.size(); i++ ) { QString &value = parts[i]; // Ignore empty fields - spreadsheet generated CSV files often // have random empty fields at the end of a row if ( value.isEmpty() ) continue; // Expand the columns to include this non empty field if necessary while ( couldBeInt.size() <= i ) { isEmpty.append( true ); couldBeInt.append( false ); couldBeLongLong.append( false ); couldBeDouble.append( false ); } // If this column has been empty so far then initiallize it // for possible types if ( isEmpty[i] ) { isEmpty[i] = false; couldBeInt[i] = true; couldBeLongLong[i] = true; couldBeDouble[i] = true; } if ( ! mDetectTypes ) { continue; } // Now test for still valid possible types for the field // Types are possible until first record which cannot be parsed if ( couldBeInt[i] ) { value.toInt( &couldBeInt[i] ); } if ( couldBeLongLong[i] && ! couldBeInt[i] ) { value.toLongLong( &couldBeLongLong[i] ); } if ( couldBeDouble[i] && ! couldBeLongLong[i] ) { if ( ! mDecimalPoint.isEmpty() ) { value.replace( mDecimalPoint, QLatin1String( "." ) ); } value.toDouble( &couldBeDouble[i] ); } } } // Now create the attribute fields. Field types are integer by preference, // failing that double, failing that text. QStringList fieldNames = mFile->fieldNames(); mFieldCount = fieldNames.size(); attributeColumns.clear(); attributeFields.clear(); QString csvtMessage; QStringList csvtTypes = readCsvtFieldTypes( mFile->fileName(), &csvtMessage ); for ( int i = 0; i < fieldNames.size(); i++ ) { // Skip over WKT field ... don't want to display in attribute table if ( i == mWktFieldIndex ) continue; // Add the field index lookup for the column attributeColumns.append( i ); QVariant::Type fieldType = QVariant::String; QString typeName = QStringLiteral( "text" ); if ( i < csvtTypes.size() ) { typeName = csvtTypes[i]; } else if ( mDetectTypes && i < couldBeInt.size() ) { if ( couldBeInt[i] ) { typeName = QStringLiteral( "integer" ); } else if ( couldBeLongLong[i] ) { typeName = QStringLiteral( "longlong" ); } else if ( couldBeDouble[i] ) { typeName = QStringLiteral( "double" ); } } if ( typeName == QStringLiteral( "integer" ) ) { fieldType = QVariant::Int; } else if ( typeName == QStringLiteral( "longlong" ) ) { fieldType = QVariant::LongLong; } else if ( typeName == QStringLiteral( "real" ) || typeName == QStringLiteral( "double" ) ) { typeName = QStringLiteral( "double" ); fieldType = QVariant::Double; } else { typeName = QStringLiteral( "text" ); } attributeFields.append( QgsField( fieldNames[i], fieldType, typeName ) ); } QgsDebugMsg( "Field count for the delimited text file is " + QString::number( attributeFields.size() ) ); QgsDebugMsg( "geometry type is: " + QString::number( mWkbType ) ); QgsDebugMsg( "feature count is: " + QString::number( mNumberFeatures ) ); QStringList warnings; if ( ! csvtMessage.isEmpty() ) warnings.append( csvtMessage ); if ( nBadFormatRecords > 0 ) warnings.append( tr( "%1 records discarded due to invalid format" ).arg( nBadFormatRecords ) ); if ( nEmptyGeometry > 0 ) warnings.append( tr( "%1 records have missing geometry definitions" ).arg( nEmptyGeometry ) ); if ( nInvalidGeometry > 0 ) warnings.append( tr( "%1 records discarded due to invalid geometry definitions" ).arg( nInvalidGeometry ) ); if ( nIncompatibleGeometry > 0 ) warnings.append( tr( "%1 records discarded due to incompatible geometry types" ).arg( nIncompatibleGeometry ) ); reportErrors( warnings ); // Decide whether to use subset ids to index records rather than simple iteration through all // If more than 10% of records are being skipped, then use index. (Not based on any experimentation, // could do with some analysis?) if ( buildSubsetIndex ) { long recordCount = mFile->recordCount(); recordCount -= recordCount / SUBSET_ID_THRESHOLD_FACTOR; mUseSubsetIndex = mSubsetIndex.size() < recordCount; if ( ! mUseSubsetIndex ) mSubsetIndex = QList<quintptr>(); } mUseSpatialIndex = buildSpatialIndex; mValid = mGeometryType != QgsWkbTypes::UnknownGeometry; mLayerValid = mValid; // If it is valid, then watch for changes to the file connect( mFile.get(), &QgsDelimitedTextFile::fileUpdated, this, &QgsDelimitedTextProvider::onFileUpdated ); }