Fix utf8->utf16 BOM/ZWNBSP decoding.

When the byte sequence for a BOM occurs in the middle of a UTF-8 stream, it is a ZWNBSP. When a ZWNBSP occurred in the middle of a UTF-8 character sequence and the SIMD conversion did some work (meaning: the input is at least 16 characters long), the decoder would not recognize the fact that some characters had already been decoded. So the conversion would then strip the ZWNBSP out, thinking it was a BOM. The non-SIMD conversion did not have this problem: the very first character conversion would already set the headerdone flag.

Change-Id: I39aacf607e2e068107106254021a8042d164f628
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Erik Verbruggen <erik.verbruggen@theqtcompany.com> 2015-12-16 14:04:27 +0100
committer: Erik Verbruggen <erik.verbruggen@theqtcompany.com> 2015-12-21 09:56:49 +0000
commit: 1823c8f2ddd0a5c1b4301e7af7109796090a3c9a (patch)
tree: 7179725508e8ad7a71b5e73ad2e74744b15450d9
parent: Fix extract style on Android 6.0 (diff)
download: qtbase-1823c8f2ddd0a5c1b4301e7af7109796090a3c9a.tar.xz
qtbase-1823c8f2ddd0a5c1b4301e7af7109796090a3c9a.zip
2 files changed, 16 insertions, 6 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 98d4b2e4e3..a33c1bc9ce 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -364,6 +364,7 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
     // main body, stateless decoding
     res = 0;
     const uchar *nextAscii = src;
+    const uchar *start = src;
     while (res >= 0 && src < end) {
         if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
             break;
@@ -372,9 +373,11 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
         res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
         if (!headerdone && res >= 0) {
             headerdone = true;
-            // eat the UTF-8 BOM
-            if (dst[-1] == 0xfeff)
-                --dst;
+            if (src == start + 3) { // 3 == sizeof(utf8-bom)
+                // eat the UTF-8 BOM (it can only appear at the beginning of the string).
+                if (dst[-1] == 0xfeff)
+                    --dst;
+            }
         }
         if (res == QUtf8BaseTraits::Error) {
             res = 0;
diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
index 3aa06d237d..8a9ae0cd72 100644
--- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
+++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
@@ -1588,10 +1588,17 @@ void tst_QTextCodec::utf8bom_data()
             << QString("a");
     }
 
-    {
+    { // test the non-SIMD code-path
         static const ushort data[] = { 0x61, 0xfeff, 0x62 };
-        QTest::newRow("middle-bom")
-            << QByteArray("a\357\273\277b", 5)
+        QTest::newRow("middle-bom (non SIMD)")
+            << QByteArray("a\357\273\277b")
+            << QString::fromUtf16(data, sizeof(data)/sizeof(short));
+    }
+
+    { // test the SIMD code-path
+        static const ushort data[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0xfeff, 0x6d };
+        QTest::newRow("middle-bom (SIMD)")
+            << QByteArray("abcdefghijkl\357\273\277m")
             << QString::fromUtf16(data, sizeof(data)/sizeof(short));
     }
 }
author	Erik Verbruggen <erik.verbruggen@theqtcompany.com>	2015-12-16 14:04:27 +0100
committer	Erik Verbruggen <erik.verbruggen@theqtcompany.com>	2015-12-21 09:56:49 +0000
commit	1823c8f2ddd0a5c1b4301e7af7109796090a3c9a (patch)
tree	7179725508e8ad7a71b5e73ad2e74744b15450d9
parent	Fix extract style on Android 6.0 (diff)
download	qtbase-1823c8f2ddd0a5c1b4301e7af7109796090a3c9a.tar.xz qtbase-1823c8f2ddd0a5c1b4301e7af7109796090a3c9a.zip