PYTHON-3823 Audit benchmark data_size and calculate it dynamically where possible (#1439)

2023-11-17 12:07:33 -08:00 · 2023-11-17 12:07:33 -08:00 · ec35f7f76e
commit ec35f7f76e
parent fc220532df
1 changed file with 51 additions and 93 deletions
--- a/test/performance/perf_test.py
+++ b/test/performance/perf_test.py
@ -21,7 +21,7 @@ import sys
 import tempfile
 import time
 import warnings
-from typing import Any, List
+from typing import Any, List, Optional

 try:
    import simplejson as json
@ -70,9 +70,8 @@ class Timer:


 class PerformanceTest:
-    dataset: Any
-    data_size: Any
-    do_task: Any
+    dataset: str
+    data_size: int
    fail: Any

    @classmethod
@ -87,7 +86,9 @@ class PerformanceTest:
        name = self.__class__.__name__[4:]
        median = self.percentile(50)
        megabytes_per_sec = self.data_size / median / 1000000
-        print(f"Running {self.__class__.__name__}. MEDIAN={self.percentile(50)}")
+        print(
+            f"Running {self.__class__.__name__}. MB/s={megabytes_per_sec}, MEDIAN={self.percentile(50)}"
+        )
        result_data.append(
            {
                "info": {
@ -105,6 +106,9 @@ class PerformanceTest:
    def before(self):
        pass

+    def do_task(self):
+        raise NotImplementedError
+
    def after(self):
        pass

@ -120,12 +124,13 @@ class PerformanceTest:
    def runTest(self):
        results = []
        start = time.monotonic()
-        self.max_iterations = NUM_ITERATIONS
        for i in range(NUM_ITERATIONS):
            if time.monotonic() - start > MAX_ITERATION_TIME:
                with warnings.catch_warnings():
                    warnings.simplefilter("default")
-                    warnings.warn("Test timed out, completed %s iterations." % i)
+                    warnings.warn(
+                        f"Test timed out after {MAX_ITERATION_TIME}s, completed {i}/{NUM_ITERATIONS} iterations."
+                    )
                break
            self.before()
            with Timer() as timer:
@ -142,6 +147,7 @@ class BsonEncodingTest(PerformanceTest):
        # Location of test data.
        with open(os.path.join(TEST_PATH, os.path.join("extended_bson", self.dataset))) as data:
            self.document = loads(data.read())
+        self.data_size = len(encode(self.document)) * NUM_DOCS

    def do_task(self):
        for _ in range(NUM_DOCS):
@ -154,6 +160,8 @@ class BsonDecodingTest(PerformanceTest):
        with open(os.path.join(TEST_PATH, os.path.join("extended_bson", self.dataset))) as data:
            self.document = encode(json.loads(data.read()))

+        self.data_size = len(self.document) * NUM_DOCS
+
    def do_task(self):
        for _ in range(NUM_DOCS):
            decode(self.document)
@ -161,37 +169,31 @@ class BsonDecodingTest(PerformanceTest):

 class TestFlatEncoding(BsonEncodingTest, unittest.TestCase):
    dataset = "flat_bson.json"
-    data_size = 75310000


 class TestFlatDecoding(BsonDecodingTest, unittest.TestCase):
    dataset = "flat_bson.json"
-    data_size = 75310000


 class TestDeepEncoding(BsonEncodingTest, unittest.TestCase):
    dataset = "deep_bson.json"
-    data_size = 19640000


 class TestDeepDecoding(BsonDecodingTest, unittest.TestCase):
    dataset = "deep_bson.json"
-    data_size = 19640000


 class TestFullEncoding(BsonEncodingTest, unittest.TestCase):
    dataset = "full_bson.json"
-    data_size = 57340000


 class TestFullDecoding(BsonDecodingTest, unittest.TestCase):
    dataset = "full_bson.json"
-    data_size = 57340000


 # SINGLE-DOC BENCHMARKS
 class TestRunCommand(PerformanceTest, unittest.TestCase):
-    data_size = 160000
+    data_size = len(encode({"hello": True})) * NUM_DOCS

    def setUp(self):
        self.client = client_context.client
@ -200,7 +202,7 @@ class TestRunCommand(PerformanceTest, unittest.TestCase):
    def do_task(self):
        command = self.client.perftest.command
        for _ in range(NUM_DOCS):
-            command("ping")
+            command("hello", True)


 class TestDocument(PerformanceTest):
@ -225,23 +227,17 @@ class TestDocument(PerformanceTest):
        self.client.perftest.drop_collection("corpus")


-class TestFindOneByID(TestDocument, unittest.TestCase):
-    data_size = 16220000
+class FindTest(TestDocument):
+    dataset = "tweet.json"

    def setUp(self):
-        self.dataset = "tweet.json"
        super().setUp()
-
+        self.data_size = len(encode(self.document)) * NUM_DOCS
        documents = [self.document.copy() for _ in range(NUM_DOCS)]
        self.corpus = self.client.perftest.corpus
        result = self.corpus.insert_many(documents)
        self.inserted_ids = result.inserted_ids

-    def do_task(self):
-        find_one = self.corpus.find_one
-        for _id in self.inserted_ids:
-            find_one({"_id": _id})
-
    def before(self):
        pass

@ -249,30 +245,40 @@ class TestFindOneByID(TestDocument, unittest.TestCase):
        pass


-class TestSmallDocInsertOne(TestDocument, unittest.TestCase):
-    data_size = 2750000
+class TestFindOneByID(FindTest, unittest.TestCase):
+    def do_task(self):
+        find_one = self.corpus.find_one
+        for _id in self.inserted_ids:
+            find_one({"_id": _id})
+
+
+class SmallDocInsertTest(TestDocument):
+    dataset = "small_doc.json"

    def setUp(self):
-        self.dataset = "small_doc.json"
        super().setUp()
-
+        self.data_size = len(encode(self.document)) * NUM_DOCS
        self.documents = [self.document.copy() for _ in range(NUM_DOCS)]

+
+class TestSmallDocInsertOne(SmallDocInsertTest, unittest.TestCase):
    def do_task(self):
        insert_one = self.corpus.insert_one
        for doc in self.documents:
            insert_one(doc)


-class TestLargeDocInsertOne(TestDocument, unittest.TestCase):
-    data_size = 27310890
+class LargeDocInsertTest(TestDocument):
+    dataset = "large_doc.json"

    def setUp(self):
-        self.dataset = "large_doc.json"
        super().setUp()
+        n_docs = 10
+        self.data_size = len(encode(self.document)) * n_docs
+        self.documents = [self.document.copy() for _ in range(n_docs)]

-        self.documents = [self.document.copy() for _ in range(10)]

+class TestLargeDocInsertOne(LargeDocInsertTest, unittest.TestCase):
    def do_task(self):
        insert_one = self.corpus.insert_one
        for doc in self.documents:
@ -280,61 +286,24 @@ class TestLargeDocInsertOne(TestDocument, unittest.TestCase):


 # MULTI-DOC BENCHMARKS
-class TestFindManyAndEmptyCursor(TestDocument, unittest.TestCase):
-    data_size = 16220000
-
-    def setUp(self):
-        self.dataset = "tweet.json"
-        super().setUp()
-
-        for _ in range(10):
-            self.client.perftest.command("insert", "corpus", documents=[self.document] * 1000)
-        self.corpus = self.client.perftest.corpus
-
+class TestFindManyAndEmptyCursor(FindTest, unittest.TestCase):
    def do_task(self):
        list(self.corpus.find())

-    def before(self):
-        pass
-
-    def after(self):
-        pass
-
-
-class TestSmallDocBulkInsert(TestDocument, unittest.TestCase):
-    data_size = 2750000
-
-    def setUp(self):
-        self.dataset = "small_doc.json"
-        super().setUp()
-        self.documents = [self.document.copy() for _ in range(NUM_DOCS)]
-
-    def before(self):
-        self.corpus = self.client.perftest.create_collection("corpus")

+class TestSmallDocBulkInsert(SmallDocInsertTest, unittest.TestCase):
    def do_task(self):
        self.corpus.insert_many(self.documents, ordered=True)


-class TestLargeDocBulkInsert(TestDocument, unittest.TestCase):
-    data_size = 27310890
-
-    def setUp(self):
-        self.dataset = "large_doc.json"
-        super().setUp()
-        self.documents = [self.document.copy() for _ in range(10)]
-
-    def before(self):
-        self.corpus = self.client.perftest.create_collection("corpus")
-
+class TestLargeDocBulkInsert(LargeDocInsertTest, unittest.TestCase):
    def do_task(self):
        self.corpus.insert_many(self.documents, ordered=True)


-class TestGridFsUpload(PerformanceTest, unittest.TestCase):
-    data_size = 52428800
-
+class GridFsTest(PerformanceTest):
    def setUp(self):
+        super().setUp()
        self.client = client_context.client
        self.client.drop_database("perftest")

@ -343,44 +312,33 @@ class TestGridFsUpload(PerformanceTest, unittest.TestCase):
        )
        with open(gridfs_path, "rb") as data:
            self.document = data.read()
-
+        self.data_size = len(self.document)
        self.bucket = GridFSBucket(self.client.perftest)

    def tearDown(self):
        super().tearDown()
        self.client.drop_database("perftest")

+
+class TestGridFsUpload(GridFsTest, unittest.TestCase):
    def before(self):
+        # Create the bucket.
        self.bucket.upload_from_stream("init", b"x")

    def do_task(self):
        self.bucket.upload_from_stream("gridfstest", self.document)


-class TestGridFsDownload(PerformanceTest, unittest.TestCase):
-    data_size = 52428800
-
+class TestGridFsDownload(GridFsTest, unittest.TestCase):
    def setUp(self):
-        self.client = client_context.client
-        self.client.drop_database("perftest")
-
-        gridfs_path = os.path.join(
-            TEST_PATH, os.path.join("single_and_multi_document", "gridfs_large.bin")
-        )
-
-        self.bucket = GridFSBucket(self.client.perftest)
-        with open(gridfs_path, "rb") as gfile:
-            self.uploaded_id = self.bucket.upload_from_stream("gridfstest", gfile)
-
-    def tearDown(self):
-        super().tearDown()
-        self.client.drop_database("perftest")
+        super().setUp()
+        self.uploaded_id = self.bucket.upload_from_stream("gridfstest", self.document)

    def do_task(self):
        self.bucket.open_download_stream(self.uploaded_id).read()


-proc_client = None
+proc_client: Optional[MongoClient] = None


 def proc_init(*dummy):