From 8573099c4be03e134e22d61eb3b5e2c65dfc78ae Mon Sep 17 00:00:00 2001
From: Bernie Hackett
Date: Sun, 24 Jun 2018 20:11:35 -0700
Subject: [PATCH] Move time consuming utf8 tests out of the main suite

---
 test/test_bson.py         | 63 --------------------------------
 test/unicode/test_utf8.py | 76 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 63 deletions(-)
 create mode 100644 test/unicode/test_utf8.py

diff --git a/test/test_bson.py b/test/test_bson.py
index e4b6a7491..b5291c5ad 100644
--- a/test/test_bson.py
+++ b/test/test_bson.py
@@ -650,69 +650,6 @@ class TestBSON(unittest.TestCase):
         z = {iso8859_bytes: "hello"}
         self.assertRaises(InvalidStringData, BSON.encode, z)
 
-    # Verify that python and bson have the same understanding of
-    # legal utf-8 if the first byte is 0xf4 (244)
-    def _assert_same_utf8_validation(self, data):
-        try:
-            data.decode('utf-8')
-            py_is_legal = True
-        except UnicodeDecodeError:
-            py_is_legal = False
-
-        try:
-            BSON.encode({'x': data})
-            bson_is_legal = True
-        except InvalidStringData:
-            bson_is_legal = False
-
-        self.assertEqual(py_is_legal, bson_is_legal, data)
-
-    @unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
-    def test_legal_utf8_full_coverage(self):
-        # this tests takes 400 seconds. Which is too long to run each time.
-        # However it is the only one which covers all possible bit combinations
-        # in the 244 space.
-        b1 = chr(0xf4)
-
-        for b2 in map(chr, range(255)):
-            m2 = b1 + b2
-            self._assert_same_utf8_validation(m2)
-
-            for b3 in map(chr, range(255)):
-                m3 = m2 + b3
-                self._assert_same_utf8_validation(m3)
-
-                for b4 in map(chr, range(255)):
-                    m4 = m3 + b4
-                    self._assert_same_utf8_validation(m4)
-
-    # In python3:
-    # - 'bytes' are not checked with isLegalutf
-    # - 'unicode' I cannot create unicode objects with invalid utf8, since it
-    # would result in non valid code-points.
-    @unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
-    def test_legal_utf8_few_samples(self):
-        good_samples = [
-            '\xf4\x80\x80\x80',
-            '\xf4\x8a\x80\x80',
-            '\xf4\x8e\x80\x80',
-            '\xf4\x81\x80\x80',
-        ]
-
-        for data in good_samples:
-            self._assert_same_utf8_validation(data)
-
-        bad_samples = [
-            '\xf4\x00\x80\x80',
-            '\xf4\x3a\x80\x80',
-            '\xf4\x7f\x80\x80',
-            '\xf4\x90\x80\x80',
-            '\xf4\xff\x80\x80',
-        ]
-
-        for data in bad_samples:
-            self._assert_same_utf8_validation(data)
-
     def test_null_character(self):
         doc = {"a": "\x00"}
         self.assertEqual(doc, BSON.encode(doc).decode())
diff --git a/test/unicode/test_utf8.py b/test/unicode/test_utf8.py
new file mode 100644
index 000000000..d66515d0a
--- /dev/null
+++ b/test/unicode/test_utf8.py
@@ -0,0 +1,76 @@
+import sys
+
+sys.path[0:0] = [""]
+
+from bson import BSON
+from bson.errors import InvalidStringData
+from bson.py3compat import PY3
+from test import unittest
+
+class TestUTF8(unittest.TestCase):
+
+    # Verify that python and bson have the same understanding of
+    # legal utf-8 if the first byte is 0xf4 (244)
+    def _assert_same_utf8_validation(self, data):
+        try:
+            data.decode('utf-8')
+            py_is_legal = True
+        except UnicodeDecodeError:
+            py_is_legal = False
+
+        try:
+            BSON.encode({'x': data})
+            bson_is_legal = True
+        except InvalidStringData:
+            bson_is_legal = False
+
+        self.assertEqual(py_is_legal, bson_is_legal, data)
+
+    @unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
+    def test_legal_utf8_full_coverage(self):
+        # This test takes 400 seconds. Which is too long to run each time.
+        # However it is the only one which covers all possible bit combinations
+        # in the 244 space, so each byte must range over 0x00-0xff inclusive.
+        b1 = chr(0xf4)
+
+        for b2 in map(chr, range(256)):
+            m2 = b1 + b2
+            self._assert_same_utf8_validation(m2)
+
+            for b3 in map(chr, range(256)):
+                m3 = m2 + b3
+                self._assert_same_utf8_validation(m3)
+
+                for b4 in map(chr, range(256)):
+                    m4 = m3 + b4
+                    self._assert_same_utf8_validation(m4)
+
+    # In python3:
+    # - 'bytes' are not checked with isLegalutf
+    # - 'unicode' We cannot create unicode objects with invalid utf8, since it
+    # would result in non valid code-points.
+    @unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
+    def test_legal_utf8_few_samples(self):
+        good_samples = [
+            '\xf4\x80\x80\x80',
+            '\xf4\x8a\x80\x80',
+            '\xf4\x8e\x80\x80',
+            '\xf4\x81\x80\x80',
+        ]
+
+        for data in good_samples:
+            self._assert_same_utf8_validation(data)
+
+        bad_samples = [
+            '\xf4\x00\x80\x80',
+            '\xf4\x3a\x80\x80',
+            '\xf4\x7f\x80\x80',
+            '\xf4\x90\x80\x80',
+            '\xf4\xff\x80\x80',
+        ]
+
+        for data in bad_samples:
+            self._assert_same_utf8_validation(data)
+
+if __name__ == "__main__":
+    unittest.main()