Skip to content

Commit c6b2ee0

Browse files
rouaultnyalldawson
authored andcommittedJul 21, 2020
QgsDelimitedTextFile: fix parsing of files with CR end of line
Fixes #36392 Fixes #21976 Fixes #17190 We are obliged to do 'at hand' parsing due to QT not handling CR-only end of lines. As we are at it, also limit each line to 1 MB to avoid potential denial of service (which was what close to what happened here before the CR-only parsing fix) Add tests for parsing CR-only end of lines, and exercising the at-hand buffering logic (cherry picked from commit 644a564)
1 parent d09a4f2 commit c6b2ee0

File tree

4 files changed

+139
-6
lines changed

4 files changed

+139
-6
lines changed
 

‎src/providers/delimitedtext/qgsdelimitedtextfile.cpp

Lines changed: 79 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ QgsDelimitedTextFile::QgsDelimitedTextFile( const QString &url )
3939
// The default type is CSV
4040
setTypeCSV();
4141
if ( ! url.isNull() ) setFromUrl( url );
42+
43+
// For tests
44+
QString bufferSizeStr( getenv( "QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE" ) );
45+
mMaxBufferSize = bufferSizeStr.isEmpty() ? 1024 * 1024 : bufferSizeStr.toInt();
4246
}
4347

4448

@@ -530,7 +534,6 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::nextRecord( QStringList &reco
530534
return status;
531535
}
532536

533-
534537
QgsDelimitedTextFile::Status QgsDelimitedTextFile::reset()
535538
{
536539
// Make sure the file is valid open
@@ -541,12 +544,14 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::reset()
541544
mLineNumber = 0;
542545
mRecordNumber = -1;
543546
mRecordLineNumber = -1;
547+
mBuffer = QString();
548+
mPosInBuffer = 0;
544549

545550
// Skip header lines
546551
for ( int i = mSkipLines; i-- > 0; )
547552
{
548-
if ( mStream->readLine().isNull() ) return RecordEOF;
549-
mLineNumber++;
553+
QString ignoredContent;
554+
if ( nextLine( ignoredContent ) == RecordEOF ) return RecordEOF;
550555
}
551556
// Read the column names
552557
Status result = RecordOk;
@@ -567,11 +572,79 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::nextLine( QString &buffer, bo
567572
Status status = reset();
568573
if ( status != RecordOk ) return status;
569574
}
575+
if ( mLineNumber == 0 )
576+
{
577+
mPosInBuffer = 0;
578+
mBuffer = mStream->read( mMaxBufferSize );
579+
}
570580

571-
while ( ! mStream->atEnd() )
581+
while ( !mBuffer.isEmpty() )
572582
{
573-
buffer = mStream->readLine();
574-
if ( buffer.isNull() ) break;
583+
// Identify position of \r , \n or \r\n
584+
// We should rather use mStream->readLine(), but it fails to detect \r
585+
// line endings.
586+
int eolPos = mBuffer.indexOf( '\r', mPosInBuffer );
587+
int nextPos = 0;
588+
if ( eolPos >= 0 )
589+
{
590+
nextPos = eolPos + 1;
591+
// Check if there is a \n just afterwards
592+
if ( eolPos + 1 < mBuffer.size() )
593+
{
594+
if ( mBuffer[eolPos + 1] == '\n' )
595+
{
596+
nextPos = eolPos + 2;
597+
}
598+
}
599+
else
600+
{
601+
// If we are just at the end of the buffer, read an extra character
602+
// from the stream
603+
QString newChar = mStream->read( 1 );
604+
mBuffer += newChar;
605+
if ( newChar == '\n' )
606+
{
607+
nextPos = eolPos + 2;
608+
}
609+
}
610+
}
611+
else
612+
{
613+
eolPos = mBuffer.indexOf( '\n', mPosInBuffer );
614+
if ( eolPos >= 0 )
615+
{
616+
nextPos = eolPos + 1;
617+
}
618+
}
619+
if ( eolPos < 0 )
620+
{
621+
if ( mPosInBuffer == 0 )
622+
{
623+
// If our current position was the beginning of the buffer and we
624+
// didn't find any end of line character, then return the whole buffer
625+
// (to avoid unbounded line sizes)
626+
// and set the buffer to null so that we don't iterate any more.
627+
buffer = mBuffer;
628+
mBuffer = QString();
629+
}
630+
else
631+
{
632+
// Read more bytes from file to have up to mMaxBufferSize characters
633+
// in our buffer (after having subset it from mPosInBuffer)
634+
mBuffer = mBuffer.mid( mPosInBuffer );
635+
mBuffer += mStream->read( mMaxBufferSize - mBuffer.size() );
636+
mPosInBuffer = 0;
637+
continue;
638+
}
639+
}
640+
else
641+
{
642+
// Extract the current line from the buffer
643+
buffer = mBuffer.mid( mPosInBuffer, eolPos - mPosInBuffer );
644+
// Update current position in buffer to be the one next to the end of
645+
// line character(s)
646+
mPosInBuffer = nextPos;
647+
}
575648
mLineNumber++;
576649
if ( skipBlank && buffer.isEmpty() ) continue;
577650
return RecordOk;

‎src/providers/delimitedtext/qgsdelimitedtextfile.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,9 @@ class QgsDelimitedTextFile : public QObject
428428
long mLineNumber = -1;
429429
long mRecordLineNumber = -1;
430430
long mRecordNumber = -1;
431+
QString mBuffer;
432+
int mPosInBuffer = 0;
433+
int mMaxBufferSize = 0;
431434
QStringList mCurrentRecord;
432435
bool mHoldCurrentRecord = false;
433436
// Maximum number of record (ie maximum record number visited)

‎tests/src/python/test_qgsdelimitedtextprovider.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,62 @@ def testSpatialIndex(self):
907907
vl.dataProvider().createSpatialIndex()
908908
self.assertEqual(vl.hasSpatialIndex(), QgsFeatureSource.SpatialIndexPresent)
909909

910+
def testCREndOfLineAndWorkingBuffer(self):
911+
# Test CSV file with \r (CR) endings
912+
# Test also that the logic to refill the buffer works properly
913+
os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE'] = '17'
914+
try:
915+
basetestfile = os.path.join(unitTestDataPath("delimitedtext"), 'test_cr_end_of_line.csv')
916+
917+
url = MyUrl.fromLocalFile(basetestfile)
918+
url.addQueryItem("type", "csv")
919+
url.addQueryItem("geomType", "none")
920+
921+
vl = QgsVectorLayer(url.toString(), 'test', 'delimitedtext')
922+
assert vl.isValid(), "{} is invalid".format(basetestfile)
923+
924+
fields = vl.fields()
925+
self.assertEqual(len(fields), 2)
926+
self.assertEqual(fields[0].name(), 'col0')
927+
self.assertEqual(fields[1].name(), 'col1')
928+
929+
features = [f for f in vl.getFeatures()]
930+
self.assertEqual(len(features), 2)
931+
self.assertEqual(features[0]['col0'], 'value00')
932+
self.assertEqual(features[0]['col1'], 'value01')
933+
self.assertEqual(features[1]['col0'], 'value10')
934+
self.assertEqual(features[1]['col1'], 'value11')
935+
936+
finally:
937+
del os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE']
938+
939+
def testSaturationOfWorkingBuffer(self):
940+
# 10 bytes is sufficient to detect the header line, but not enough for the
941+
# first record
942+
os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE'] = '10'
943+
try:
944+
basetestfile = os.path.join(unitTestDataPath("delimitedtext"), 'test_cr_end_of_line.csv')
945+
946+
url = MyUrl.fromLocalFile(basetestfile)
947+
url.addQueryItem("type", "csv")
948+
url.addQueryItem("geomType", "none")
949+
950+
vl = QgsVectorLayer(url.toString(), 'test', 'delimitedtext')
951+
assert vl.isValid(), "{} is invalid".format(basetestfile)
952+
953+
fields = vl.fields()
954+
self.assertEqual(len(fields), 2)
955+
self.assertEqual(fields[0].name(), 'col0')
956+
self.assertEqual(fields[1].name(), 'col1')
957+
958+
features = [f for f in vl.getFeatures()]
959+
self.assertEqual(len(features), 1)
960+
self.assertEqual(features[0]['col0'], 'value00')
961+
self.assertEqual(features[0]['col1'], 'va') # truncated
962+
963+
finally:
964+
del os.environ['QGIS_DELIMITED_TEXT_FILE_BUFFER_SIZE']
965+
910966

911967
if __name__ == '__main__':
912968
unittest.main()

‎tests/testdata/delimitedtext/test_cr_end_of_line.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
col0,col1value00,value01value10,value11

0 commit comments

Comments
 (0)
Please sign in to comment.