Skip to content

Commit

Permalink
[DelimitedText provider] Fix performance issue with files with Unix e…
Browse files Browse the repository at this point in the history
…nd-of-line characters

Fixes #38068

The performance regression was introduced per commit 644a564
It resulted in quadratic performance in the size of the buffer for CSV files
with Unix '\n' end of line character, due to scanning repeatedly for `\r`

(cherry picked from commit a667844)
  • Loading branch information
rouault authored and nyalldawson committed Oct 23, 2020
1 parent d217117 commit 41185f7
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 29 deletions.
73 changes: 44 additions & 29 deletions src/providers/delimitedtext/qgsdelimitedtextfile.cpp
Expand Up @@ -583,40 +583,63 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::nextLine( QString &buffer, bo
// Identify position of \r , \n or \r\n
// We should rather use mStream->readLine(), but it fails to detect \r
// line endings.
int eolPos = mBuffer.indexOf( '\r', mPosInBuffer );
int nextPos = 0;
if ( eolPos >= 0 )
int eolPos = -1;
{
nextPos = eolPos + 1;
// Check if there is a \n just afterwards
if ( eolPos + 1 < mBuffer.size() )
if ( mLineNumber == 0 )
{
if ( mBuffer[eolPos + 1] == '\n' )
// For the first line we don't know yet the end of line character, so
// manually scan for the first we find
const QChar *charBuffer = mBuffer.constData();
const int bufferSize = mBuffer.size();
for ( int pos = mPosInBuffer; pos < bufferSize; ++pos )
{
nextPos = eolPos + 2;
if ( charBuffer[pos] == '\r' || charBuffer[pos] == '\n' )
{
mFirstEOLChar = charBuffer[pos];
eolPos = pos;
break;
}
}
}
else
{
// If we are just at the end of the buffer, read an extra character
// from the stream
QString newChar = mStream->read( 1 );
mBuffer += newChar;
if ( newChar == '\n' )
{
nextPos = eolPos + 2;
}
// Once we know the end of line character, use optimized indexOf()
eolPos = mBuffer.indexOf( mFirstEOLChar, mPosInBuffer );
}
}
else
if ( eolPos >= 0 )
{
eolPos = mBuffer.indexOf( '\n', mPosInBuffer );
if ( eolPos >= 0 )
int nextPos = eolPos + 1;
if ( mBuffer[eolPos] == '\r' )
{
nextPos = eolPos + 1;
// Check if there is a \n just afterwards
if ( eolPos + 1 < mBuffer.size() )
{
if ( mBuffer[eolPos + 1] == '\n' )
{
nextPos = eolPos + 2;
}
}
else
{
// If we are just at the end of the buffer, read an extra character
// from the stream
QString newChar = mStream->read( 1 );
mBuffer += newChar;
if ( newChar == '\n' )
{
nextPos = eolPos + 2;
}
}
}

// Extract the current line from the buffer
buffer = mBuffer.mid( mPosInBuffer, eolPos - mPosInBuffer );
// Update current position in buffer to be the one next to the end of
// line character(s)
mPosInBuffer = nextPos;
}
if ( eolPos < 0 )
else
{
if ( mPosInBuffer == 0 )
{
Expand All @@ -637,14 +660,6 @@ QgsDelimitedTextFile::Status QgsDelimitedTextFile::nextLine( QString &buffer, bo
continue;
}
}
else
{
// Extract the current line from the buffer
buffer = mBuffer.mid( mPosInBuffer, eolPos - mPosInBuffer );
// Update current position in buffer to be the one next to the end of
// line character(s)
mPosInBuffer = nextPos;
}
mLineNumber++;
if ( skipBlank && buffer.isEmpty() ) continue;
return RecordOk;
Expand Down
1 change: 1 addition & 0 deletions src/providers/delimitedtext/qgsdelimitedtextfile.h
Expand Up @@ -431,6 +431,7 @@ class QgsDelimitedTextFile : public QObject
QString mBuffer;
int mPosInBuffer = 0;
int mMaxBufferSize = 0;
QChar mFirstEOLChar = 0; // '\r' if EOL is "\r" or "\r\n", or `\n' if EOL is "\n"
QStringList mCurrentRecord;
bool mHoldCurrentRecord = false;
// Maximum number of record (ie maximum record number visited)
Expand Down

0 comments on commit 41185f7

Please sign in to comment.