Skip to content

Commit

Permalink
QgsDelimitedTextFile: fix parsing of files with CR end of line
Browse files Browse the repository at this point in the history
Fixes #36392
Fixes #21976
Fixes #17190

We are obliged to do 'at hand' parsing due to QT not handling CR-only
end of lines.
As we are at it, also limit each line to 1 MB to avoid potential denial
of service (which was what close to what happened here before the CR-only
parsing fix)

Add tests for parsing CR-only end of lines, and exercising the at-hand
buffering logic

(cherry picked from commit 644a564)
  • Loading branch information
rouault authored and nyalldawson committed Jul 21, 2020
1 parent d09a4f2 commit c6b2ee0
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 6 deletions.
85 changes: 79 additions & 6 deletions src/providers/delimitedtext/qgsdelimitedtextfile.cpp
Expand Up @@ -39,6 +39,10 @@ QgsDelimitedTextFile::QgsDelimitedTextFile( const QString &url )
// The default type is CSV
setTypeCSV();
if ( ! url.isNull() ) setFromUrl( url );

// For tests
QString bufferSizeStr( getenv( "QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE" ) );
mMaxBufferSize = bufferSizeStr.isEmpty() ? 1024 * 1024 : bufferSizeStr.toInt();
}


Expand Down Expand Up @@ -530,7 +534,6 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::nextRecord( QStringList &reco
return status;
}


QgsDelimitedTextFile::Status QgsDelimitedTextFile::reset()
{
// Make sure the file is valid open
Expand All @@ -541,12 +544,14 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::reset()
mLineNumber = 0;
mRecordNumber = -1;
mRecordLineNumber = -1;
mBuffer = QString();
mPosInBuffer = 0;

// Skip header lines
for ( int i = mSkipLines; i-- > 0; )
{
if ( mStream->readLine().isNull() ) return RecordEOF;
mLineNumber++;
QString ignoredContent;
if ( nextLine( ignoredContent ) == RecordEOF ) return RecordEOF;
}
// Read the column names
Status result = RecordOk;
Expand All @@ -567,11 +572,79 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::nextLine( QString &buffer, bo
Status status = reset();
if ( status != RecordOk ) return status;
}
if ( mLineNumber == 0 )
{
mPosInBuffer = 0;
mBuffer = mStream->read( mMaxBufferSize );
}

while ( ! mStream->atEnd() )
while ( !mBuffer.isEmpty() )
{
buffer = mStream->readLine();
if ( buffer.isNull() ) break;
// Identify position of \r , \n or \r\n
// We should rather use mStream->readLine(), but it fails to detect \r
// line endings.
int eolPos = mBuffer.indexOf( '\r', mPosInBuffer );
int nextPos = 0;
if ( eolPos >= 0 )
{
nextPos = eolPos + 1;
// Check if there is a \n just afterwards
if ( eolPos + 1 < mBuffer.size() )
{
if ( mBuffer[eolPos + 1] == '\n' )
{
nextPos = eolPos + 2;
}
}
else
{
// If we are just at the end of the buffer, read an extra character
// from the stream
QString newChar = mStream->read( 1 );
mBuffer += newChar;
if ( newChar == '\n' )
{
nextPos = eolPos + 2;
}
}
}
else
{
eolPos = mBuffer.indexOf( '\n', mPosInBuffer );
if ( eolPos >= 0 )
{
nextPos = eolPos + 1;
}
}
if ( eolPos < 0 )
{
if ( mPosInBuffer == 0 )
{
// If our current position was the beginning of the buffer and we
// didn't find any end of line character, then return the whole buffer
// (to avoid unbounded line sizes)
// and set the buffer to null so that we don't iterate any more.
buffer = mBuffer;
mBuffer = QString();
}
else
{
// Read more bytes from file to have up to mMaxBufferSize characters
// in our buffer (after having subset it from mPosInBuffer)
mBuffer = mBuffer.mid( mPosInBuffer );
mBuffer += mStream->read( mMaxBufferSize - mBuffer.size() );
mPosInBuffer = 0;
continue;
}
}
else
{
// Extract the current line from the buffer
buffer = mBuffer.mid( mPosInBuffer, eolPos - mPosInBuffer );
// Update current position in buffer to be the one next to the end of
// line character(s)
mPosInBuffer = nextPos;
}
mLineNumber++;
if ( skipBlank && buffer.isEmpty() ) continue;
return RecordOk;
Expand Down
3 changes: 3 additions & 0 deletions src/providers/delimitedtext/qgsdelimitedtextfile.h
Expand Up @@ -428,6 +428,9 @@ class QgsDelimitedTextFile : public QObject
long mLineNumber = -1;
long mRecordLineNumber = -1;
long mRecordNumber = -1;
QString mBuffer;
int mPosInBuffer = 0;
int mMaxBufferSize = 0;
QStringList mCurrentRecord;
bool mHoldCurrentRecord = false;
// Maximum number of record (ie maximum record number visited)
Expand Down
56 changes: 56 additions & 0 deletions tests/src/python/test_qgsdelimitedtextprovider.py
Expand Up @@ -907,6 +907,62 @@ def testSpatialIndex(self):
vl.dataProvider().createSpatialIndex()
self.assertEqual(vl.hasSpatialIndex(), QgsFeatureSource.SpatialIndexPresent)

def testCREndOfLineAndWorkingBuffer(self):
# Test CSV file with \r (CR) endings
# Test also that the logic to refill the buffer works properly
os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE'] = '17'
try:
basetestfile = os.path.join(unitTestDataPath("delimitedtext"), 'test_cr_end_of_line.csv')

url = MyUrl.fromLocalFile(basetestfile)
url.addQueryItem("type", "csv")
url.addQueryItem("geomType", "none")

vl = QgsVectorLayer(url.toString(), 'test', 'delimitedtext')
assert vl.isValid(), "{} is invalid".format(basetestfile)

fields = vl.fields()
self.assertEqual(len(fields), 2)
self.assertEqual(fields[0].name(), 'col0')
self.assertEqual(fields[1].name(), 'col1')

features = [f for f in vl.getFeatures()]
self.assertEqual(len(features), 2)
self.assertEqual(features[0]['col0'], 'value00')
self.assertEqual(features[0]['col1'], 'value01')
self.assertEqual(features[1]['col0'], 'value10')
self.assertEqual(features[1]['col1'], 'value11')

finally:
del os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE']

def testSaturationOfWorkingBuffer(self):
# 10 bytes is sufficient to detect the header line, but not enough for the
# first record
os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE'] = '10'
try:
basetestfile = os.path.join(unitTestDataPath("delimitedtext"), 'test_cr_end_of_line.csv')

url = MyUrl.fromLocalFile(basetestfile)
url.addQueryItem("type", "csv")
url.addQueryItem("geomType", "none")

vl = QgsVectorLayer(url.toString(), 'test', 'delimitedtext')
assert vl.isValid(), "{} is invalid".format(basetestfile)

fields = vl.fields()
self.assertEqual(len(fields), 2)
self.assertEqual(fields[0].name(), 'col0')
self.assertEqual(fields[1].name(), 'col1')

features = [f for f in vl.getFeatures()]
self.assertEqual(len(features), 1)
self.assertEqual(features[0]['col0'], 'value00')
self.assertEqual(features[0]['col1'], 'va') # truncated

finally:
del os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE']


if __name__ == '__main__':
unittest.main()
1 change: 1 addition & 0 deletions tests/testdata/delimitedtext/test_cr_end_of_line.csv
@@ -0,0 +1 @@
col0,col1value00,value01value10,value11
Expand Down

0 comments on commit c6b2ee0

Please sign in to comment.