From 5c4b2499eb1e49e35e3c3acca69fdddf7ef8a98d Mon Sep 17 00:00:00 2001
From: Bernie Hackett <bernie@10gen.com>
Date: Wed, 3 Sep 2014 15:05:14 -0700
Subject: [PATCH] PYTHON-346 - Use codecs for all string decoding/encoding.

---
 bson/__init__.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/bson/__init__.py b/bson/__init__.py
index 779ccf52b..bf4352108 100644
--- a/bson/__init__.py
+++ b/bson/__init__.py
@@ -24,7 +24,8 @@ import struct
 import sys
 import uuid
 
-from codecs import utf_8_decode as _utf_8_decode
+from codecs import (utf_8_decode as _utf_8_decode,
+                    utf_8_encode as _utf_8_encode)
 
 from bson.binary import (Binary, OLD_UUID_SUBTYPE,
                          JAVA_LEGACY, CSHARP_LEGACY,
@@ -99,7 +100,7 @@ def _get_int(data, position, dummy0, dummy1):
 def _get_c_string(data, position):
     """Decode a BSON 'C' string to python unicode string."""
     end = data.index(b"\x00", position)
-    return _utf_8_decode(data[position:end])[0], end + 1
+    return _utf_8_decode(data[position:end], None, True)[0], end + 1
 
 
 def _get_float(data, position, dummy0, dummy1):
@@ -117,7 +118,7 @@ def _get_string(data, position, obj_end, dummy):
     end = position + length - 1
     if data[end:end + 1] != b"\x00":
         raise InvalidBSON("invalid end of string")
-    return _utf_8_decode(data[position:end])[0], end + 1
+    return _utf_8_decode(data[position:end], None, True)[0], end + 1
 
 
 def _get_object(data, position, obj_end, opts):
@@ -357,7 +358,7 @@ def _make_c_string_check(string):
             raise InvalidDocument("BSON keys / regex patterns must not "
                                   "contain a NUL character")
         try:
-            string.decode("utf-8")
+            _utf_8_decode(string, None, True)
             return string + b"\x00"
         except UnicodeError:
             raise InvalidStringData("strings in documents must be valid "
@@ -366,20 +367,20 @@ def _make_c_string_check(string):
         if "\x00" in string:
             raise InvalidDocument("BSON keys / regex patterns must not "
                                   "contain a NUL character")
-        return string.encode("utf-8") + b"\x00"
+        return _utf_8_encode(string)[0] + b"\x00"
 
 
 def _make_c_string(string):
     """Make a 'C' string."""
     if isinstance(string, bytes):
         try:
-            string.decode("utf-8")
+            _utf_8_decode(string, None, True)
             return string + b"\x00"
         except UnicodeError:
             raise InvalidStringData("strings in documents must be valid "
                                     "UTF-8: %r" % string)
     else:
-        return string.encode("utf-8") + b"\x00"
+        return _utf_8_encode(string)[0] + b"\x00"
 
 
 if PY3:
@@ -389,7 +390,7 @@ if PY3:
         if "\x00" in string:
             raise InvalidDocument("BSON keys / regex patterns must not "
                                   "contain a NUL character")
-        return string.encode("utf-8") + b"\x00"
+        return _utf_8_encode(string)[0] + b"\x00"
 else:
     # Keys can be unicode or bytes in python 2.
     _make_name = _make_c_string_check
@@ -409,7 +410,7 @@ else:
     def _encode_bytes(name, value, dummy0, dummy1):
         """Encode a python str (python 2.x)."""
         try:
-            value.decode("utf-8")
+            _utf_8_decode(value, None, True)
         except UnicodeError:
             raise InvalidStringData("strings in documents must be valid "
                                     "UTF-8: %r" % (value,))
@@ -454,7 +455,7 @@ def _encode_list(name, value, check_keys, uuid_subtype):
 
 def _encode_text(name, value, dummy0, dummy1):
     """Encode a python unicode (python 2.x) / str (python 3.x)."""
-    value = value.encode("utf-8")
+    value = _utf_8_encode(value)[0]
     return b"\x02" + name + _PACK_INT(len(value) + 1) + value + b"\x00"