Skip to content

Commit ade5b65

Browse files
committed Oct 3, 2014
Workaround issue in Qt detection of encoding for html QByteArrays
1 parent 2cd1770 commit ade5b65

File tree

4 files changed

+77
-2
lines changed

4 files changed

+77
-2
lines changed
 

‎src/core/qgsnetworkcontentfetcher.cpp

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,47 @@ QString QgsNetworkContentFetcher::contentAsString() const
6767
QByteArray array = mReply->readAll();
6868

6969
//correctly encode reply as unicode
70-
QString content = QTextCodec::codecForHtml( array )->toUnicode( array );
71-
return content;
70+
QTextCodec* codec = codecForHtml( array );
71+
return codec->toUnicode( array );
72+
}
73+
74+
QTextCodec* QgsNetworkContentFetcher::codecForHtml( QByteArray& array ) const
{
  //QTextCodec::codecForHtml fails to detect "<meta charset="utf-8"/>" type tags
  //see https://bugreports.qt-project.org/browse/QTBUG-41011
  //so test for that ourselves
  //
  //Detection order:
  // 1. QTextCodec::codecForUtfText
  // 2. an explicit <meta charset=...> tag within the first 1024 bytes
  // 3. QTextCodec::codecForHtml
  // 4. UTF-8 as a last resort
  //The input array is only read, never modified.

  //basic check (a null default codec is passed, so a null result means "not detected")
  QTextCodec* codec = QTextCodec::codecForUtfText( array, 0 );
  if ( codec )
  {
    return codec;
  }

  //check for meta charset tag, case-insensitively, in the start of the document only
  const QByteArray header = array.left( 1024 ).toLower();
  const QByteArray charsetTag( "meta charset=" );
  const int tagPos = header.indexOf( charsetTag );
  if ( tagPos != -1 )
  {
    int valueStart = tagPos + charsetTag.size();
    int valueEnd = -1;

    const char firstChar = valueStart < header.size() ? header.at( valueStart ) : '\0';
    if ( firstChar == '"' || firstChar == '\'' )
    {
      //quoted value: runs up to the matching closing quote
      ++valueStart;
      valueEnd = header.indexOf( firstChar, valueStart );
    }
    else
    {
      //unquoted value: runs up to the next whitespace, '/' or '>'
      int i = valueStart;
      while ( i < header.size() )
      {
        const char c = header.at( i );
        if ( c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '/' || c == '>' )
        {
          valueEnd = i;
          break;
        }
        ++i;
      }
    }

    //only look up the codec when a complete, non-empty value was found. Without this
    //check a missing terminator would result in a negative length being used for the slice
    if ( valueEnd > valueStart )
    {
      codec = QTextCodec::codecForName( header.mid( valueStart, valueEnd - valueStart ) );
      if ( codec )
      {
        return codec;
      }
    }
  }

  //fallback to QTextCodec::codecForHtml (no codec has been found at this point, so
  //the default codec handed to Qt is null)
  codec = QTextCodec::codecForHtml( array, 0 );
  if ( codec )
  {
    return codec;
  }

  //no luck, default to utf-8
  return QTextCodec::codecForName( "UTF-8" );
}
73112

74113
void QgsNetworkContentFetcher::contentLoaded( bool ok )

‎src/core/qgsnetworkcontentfetcher.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,12 @@ class CORE_EXPORT QgsNetworkContentFetcher : public QObject
6969

7070
bool mContentLoaded;
7171

72+
/**Tries to create a text codec for decoding html content. Works around bugs in Qt's built in method.
73+
* @param array input html byte array
74+
* @returns QTextCodec detected for the html content, or the UTF-8 codec if no encoding could be detected
75+
*/
76+
QTextCodec *codecForHtml( QByteArray &array ) const;
77+
7278
private slots:
7379

7480
/**Called when fetchUrlContent has finished loading a url. If

‎tests/src/core/testqgsnetworkcontentfetcher.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class TestQgsNetworkContentFetcher: public QObject
3131
void fetchEmptyUrl(); //test fetching blank url
3232
void fetchBadUrl(); //test fetching bad url
3333
void fetchUrlContent(); //test fetching url content
34+
void fetchEncodedContent(); //test fetching url content encoded as utf-8
3435

3536
void contentLoaded();
3637

@@ -107,6 +108,24 @@ void TestQgsNetworkContentFetcher::fetchUrlContent()
107108
QVERIFY( mFetchedHtml.contains( QString( "QGIS" ) ) );
108109
}
109110

111+
void TestQgsNetworkContentFetcher::fetchEncodedContent()
112+
{
113+
QgsNetworkContentFetcher fetcher;
114+
//test fetching content from the QGIS homepage
115+
mLoaded = false;
116+
fetcher.fetchContent( QUrl::fromLocalFile( QString( TEST_DATA_DIR ) + QDir::separator() + "encoded_html.html" ) );
117+
connect( &fetcher, SIGNAL( finished() ), this, SLOT( contentLoaded() ) );
118+
while ( !mLoaded )
119+
{
120+
qApp->processEvents();
121+
}
122+
QVERIFY( fetcher.reply()->error() == QNetworkReply::NoError );
123+
124+
//test retrieved content and check for correct detection of encoding
125+
QString mFetchedHtml = fetcher.contentAsString();
126+
QVERIFY( mFetchedHtml.contains( QChar( 6040 ) ) );
127+
}
128+
110129
void TestQgsNetworkContentFetcher::contentLoaded()
111130
{
112131
mLoaded = true;

‎tests/testdata/encoded_html.html

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<!doctype html>
2+
<html>
3+
<head>
4+
<meta http-equiv="Content-Type" content="text/html;">
5+
<meta charset="UTF-8">
6+
<title>test</title>
7+
</head>
8+
<body>
9+
<p>សាលា ម៉ាត</p>
10+
</body></html>
11+

0 commit comments

Comments
 (0)
Please sign in to comment.