Improve the check for legal utf8 in the bson module.

Now python and bson have the same understanding of legal utf8.
This commit is contained in:
stephan-hof 2018-03-16 13:30:59 +01:00 committed by Bernie Hackett
parent cb34e31cb0
commit e113a338e9
2 changed files with 84 additions and 1 deletions

View File

@ -78,7 +78,7 @@ static unsigned char isLegalUTF8(const unsigned char* source, int length) {
/* no fall-through in this inner switch */
case 0xE0: if (a < 0xA0) return 0; break;
case 0xF0: if (a < 0x90) return 0; break;
case 0xF4: if (a > 0x8F) return 0; break;
case 0xF4: if ((a > 0x8F) || (a < 0x80)) return 0; break;
default: if (a < 0x80) return 0;
}
case 1: if (*source >= 0x80 && *source < 0xC2) return 0;

View File

@ -650,6 +650,89 @@ class TestBSON(unittest.TestCase):
z = {iso8859_bytes: "hello"}
self.assertRaises(InvalidStringData, BSON.encode, z)
# Verify that python and bson have the same understanding of
# legal utf-8 if the first byte is 0xf4 (244)
@staticmethod
def _py_is_legal_utf8(x):
try:
x.decode('utf-8')
return True
except UnicodeDecodeError:
return False
@staticmethod
def _bson_is_legal_utf8(x):
    """Report whether the bson encoder accepts ``x`` as a string value.

    Encodes the one-field document ``{'x': x}``; returns True when that
    succeeds and False when the encoder raises InvalidStringData.
    """
    try:
        BSON.encode({'x': x})
    except InvalidStringData:
        return False
    else:
        return True
@unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
def test_legal_utf8_full_coverage(self):
    """Compare Python and bson on every sequence that starts with 0xf4.

    For each 2-, 3- and 4-byte string whose first byte is 0xf4, the
    verdict of ``_py_is_legal_utf8`` must equal the verdict of
    ``_bson_is_legal_utf8``.
    """
    # This test takes about 400 seconds, which is too long to run each
    # time. However it is the only one which covers all possible bit
    # combinations in the 244 space.

    # range(256), not range(255): the trailing bytes must include 0xff,
    # otherwise the "all combinations" claim above does not hold.
    # Built once because the same 256 values are reused at every level.
    all_bytes = [chr(value) for value in range(256)]

    def assert_same_verdict(candidate):
        # repr() keeps the failure message readable for non-printable
        # bytes and identifies the exact sequence that disagreed.
        self.assertEqual(
            self._py_is_legal_utf8(candidate),
            self._bson_is_legal_utf8(candidate),
            repr(candidate)
        )

    lead = chr(0xf4)
    for second in all_bytes:
        two_bytes = lead + second
        assert_same_verdict(two_bytes)
        for third in all_bytes:
            three_bytes = two_bytes + third
            assert_same_verdict(three_bytes)
            for fourth in all_bytes:
                assert_same_verdict(three_bytes + fourth)
# In python3:
# - 'bytes' values are not checked with isLegalUTF8.
# - 'unicode' objects cannot be created from invalid utf-8, since that
# would result in invalid code points.
@unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
def test_legal_utf8_few_samples(self):
    """Spot-check a few 0xf4-led sequences against both validators.

    This is a fast companion to the exhaustive coverage test: each
    sample must get the expected verdict from Python's codec and from
    the bson encoder, so the two cannot pass by being wrong together.
    """
    # Second byte inside 0x80..0x8f: legal after a 0xf4 lead byte.
    good_samples = [
        '\xf4\x80\x80\x80',
        '\xf4\x8a\x80\x80',
        '\xf4\x8e\x80\x80',
        '\xf4\x81\x80\x80',
    ]
    for data in good_samples:
        self.assertTrue(self._py_is_legal_utf8(data), repr(data))
        self.assertTrue(self._bson_is_legal_utf8(data), repr(data))

    # Second byte below 0x80 or above 0x8f: illegal after 0xf4.
    bad_samples = [
        '\xf4\x00\x80\x80',
        '\xf4\x3a\x80\x80',
        '\xf4\x7f\x80\x80',
        '\xf4\x90\x80\x80',
        '\xf4\xff\x80\x80',
    ]
    for data in bad_samples:
        self.assertFalse(self._py_is_legal_utf8(data), repr(data))
        self.assertFalse(self._bson_is_legal_utf8(data), repr(data))
def test_null_character(self):
doc = {"a": "\x00"}
self.assertEqual(doc, BSON.encode(doc).decode())