From ec35f7f76e5b7a7f4039de37e0f51bdbcc8c2fdc Mon Sep 17 00:00:00 2001
From: Shane Harvey <shnhrv@gmail.com>
Date: Fri, 17 Nov 2023 12:07:33 -0800
Subject: [PATCH] PYTHON-3823 Audit benchmark data_size and calculate
 it dynamically where possible (#1439)

---
 test/performance/perf_test.py | 144 ++++++++++++----------------------
 1 file changed, 51 insertions(+), 93 deletions(-)

diff --git a/test/performance/perf_test.py b/test/performance/perf_test.py
index 6aabb595e..ec3fb0bd4 100644
--- a/test/performance/perf_test.py
+++ b/test/performance/perf_test.py
@@ -21,7 +21,7 @@ import sys
 import tempfile
 import time
 import warnings
-from typing import Any, List
+from typing import Any, List, Optional
 
 try:
     import simplejson as json
@@ -70,9 +70,8 @@ class Timer:
 
 
 class PerformanceTest:
-    dataset: Any
-    data_size: Any
-    do_task: Any
+    dataset: str
+    data_size: int
     fail: Any
 
     @classmethod
@@ -87,7 +86,9 @@ class PerformanceTest:
         name = self.__class__.__name__[4:]
         median = self.percentile(50)
         megabytes_per_sec = self.data_size / median / 1000000
-        print(f"Running {self.__class__.__name__}. MEDIAN={self.percentile(50)}")
+        print(
+            f"Running {self.__class__.__name__}. MB/s={megabytes_per_sec}, MEDIAN={self.percentile(50)}"
+        )
         result_data.append(
             {
                 "info": {
@@ -105,6 +106,9 @@ class PerformanceTest:
     def before(self):
         pass
 
+    def do_task(self):
+        raise NotImplementedError
+
     def after(self):
         pass
 
@@ -120,12 +124,13 @@ class PerformanceTest:
     def runTest(self):
         results = []
         start = time.monotonic()
-        self.max_iterations = NUM_ITERATIONS
         for i in range(NUM_ITERATIONS):
             if time.monotonic() - start > MAX_ITERATION_TIME:
                 with warnings.catch_warnings():
                     warnings.simplefilter("default")
-                    warnings.warn("Test timed out, completed %s iterations." % i)
+                    warnings.warn(
+                        f"Test timed out after {MAX_ITERATION_TIME}s, completed {i}/{NUM_ITERATIONS} iterations."
+                    )
                 break
             self.before()
             with Timer() as timer:
@@ -142,6 +147,7 @@ class BsonEncodingTest(PerformanceTest):
         # Location of test data.
         with open(os.path.join(TEST_PATH, os.path.join("extended_bson", self.dataset))) as data:
             self.document = loads(data.read())
+        self.data_size = len(encode(self.document)) * NUM_DOCS
 
     def do_task(self):
         for _ in range(NUM_DOCS):
@@ -154,6 +160,8 @@ class BsonDecodingTest(PerformanceTest):
         with open(os.path.join(TEST_PATH, os.path.join("extended_bson", self.dataset))) as data:
             self.document = encode(json.loads(data.read()))
 
+        self.data_size = len(self.document) * NUM_DOCS
+
     def do_task(self):
         for _ in range(NUM_DOCS):
             decode(self.document)
@@ -161,37 +169,31 @@ class BsonDecodingTest(PerformanceTest):
 
 class TestFlatEncoding(BsonEncodingTest, unittest.TestCase):
     dataset = "flat_bson.json"
-    data_size = 75310000
 
 
 class TestFlatDecoding(BsonDecodingTest, unittest.TestCase):
     dataset = "flat_bson.json"
-    data_size = 75310000
 
 
 class TestDeepEncoding(BsonEncodingTest, unittest.TestCase):
     dataset = "deep_bson.json"
-    data_size = 19640000
 
 
 class TestDeepDecoding(BsonDecodingTest, unittest.TestCase):
     dataset = "deep_bson.json"
-    data_size = 19640000
 
 
 class TestFullEncoding(BsonEncodingTest, unittest.TestCase):
     dataset = "full_bson.json"
-    data_size = 57340000
 
 
 class TestFullDecoding(BsonDecodingTest, unittest.TestCase):
     dataset = "full_bson.json"
-    data_size = 57340000
 
 
 # SINGLE-DOC BENCHMARKS
 class TestRunCommand(PerformanceTest, unittest.TestCase):
-    data_size = 160000
+    data_size = len(encode({"hello": True})) * NUM_DOCS
 
     def setUp(self):
         self.client = client_context.client
@@ -200,7 +202,7 @@ class TestRunCommand(PerformanceTest, unittest.TestCase):
     def do_task(self):
         command = self.client.perftest.command
         for _ in range(NUM_DOCS):
-            command("ping")
+            command("hello", True)
 
 
 class TestDocument(PerformanceTest):
@@ -225,23 +227,17 @@ class TestDocument(PerformanceTest):
         self.client.perftest.drop_collection("corpus")
 
 
-class TestFindOneByID(TestDocument, unittest.TestCase):
-    data_size = 16220000
+class FindTest(TestDocument):
+    dataset = "tweet.json"
 
     def setUp(self):
-        self.dataset = "tweet.json"
         super().setUp()
-
+        self.data_size = len(encode(self.document)) * NUM_DOCS
         documents = [self.document.copy() for _ in range(NUM_DOCS)]
         self.corpus = self.client.perftest.corpus
         result = self.corpus.insert_many(documents)
         self.inserted_ids = result.inserted_ids
 
-    def do_task(self):
-        find_one = self.corpus.find_one
-        for _id in self.inserted_ids:
-            find_one({"_id": _id})
-
     def before(self):
         pass
 
@@ -249,30 +245,40 @@ class TestFindOneByID(TestDocument, unittest.TestCase):
         pass
 
 
-class TestSmallDocInsertOne(TestDocument, unittest.TestCase):
-    data_size = 2750000
+class TestFindOneByID(FindTest, unittest.TestCase):
+    def do_task(self):
+        find_one = self.corpus.find_one
+        for _id in self.inserted_ids:
+            find_one({"_id": _id})
+
+
+class SmallDocInsertTest(TestDocument):
+    dataset = "small_doc.json"
 
     def setUp(self):
-        self.dataset = "small_doc.json"
         super().setUp()
-
+        self.data_size = len(encode(self.document)) * NUM_DOCS
         self.documents = [self.document.copy() for _ in range(NUM_DOCS)]
 
+
+class TestSmallDocInsertOne(SmallDocInsertTest, unittest.TestCase):
     def do_task(self):
         insert_one = self.corpus.insert_one
         for doc in self.documents:
             insert_one(doc)
 
 
-class TestLargeDocInsertOne(TestDocument, unittest.TestCase):
-    data_size = 27310890
+class LargeDocInsertTest(TestDocument):
+    dataset = "large_doc.json"
 
     def setUp(self):
-        self.dataset = "large_doc.json"
         super().setUp()
+        n_docs = 10
+        self.data_size = len(encode(self.document)) * n_docs
+        self.documents = [self.document.copy() for _ in range(n_docs)]
 
-        self.documents = [self.document.copy() for _ in range(10)]
 
+class TestLargeDocInsertOne(LargeDocInsertTest, unittest.TestCase):
     def do_task(self):
         insert_one = self.corpus.insert_one
         for doc in self.documents:
@@ -280,61 +286,24 @@ class TestLargeDocInsertOne(TestDocument, unittest.TestCase):
 
 
 # MULTI-DOC BENCHMARKS
-class TestFindManyAndEmptyCursor(TestDocument, unittest.TestCase):
-    data_size = 16220000
-
-    def setUp(self):
-        self.dataset = "tweet.json"
-        super().setUp()
-
-        for _ in range(10):
-            self.client.perftest.command("insert", "corpus", documents=[self.document] * 1000)
-        self.corpus = self.client.perftest.corpus
-
+class TestFindManyAndEmptyCursor(FindTest, unittest.TestCase):
     def do_task(self):
         list(self.corpus.find())
 
-    def before(self):
-        pass
-
-    def after(self):
-        pass
-
-
-class TestSmallDocBulkInsert(TestDocument, unittest.TestCase):
-    data_size = 2750000
-
-    def setUp(self):
-        self.dataset = "small_doc.json"
-        super().setUp()
-        self.documents = [self.document.copy() for _ in range(NUM_DOCS)]
-
-    def before(self):
-        self.corpus = self.client.perftest.create_collection("corpus")
 
+class TestSmallDocBulkInsert(SmallDocInsertTest, unittest.TestCase):
     def do_task(self):
         self.corpus.insert_many(self.documents, ordered=True)
 
 
-class TestLargeDocBulkInsert(TestDocument, unittest.TestCase):
-    data_size = 27310890
-
-    def setUp(self):
-        self.dataset = "large_doc.json"
-        super().setUp()
-        self.documents = [self.document.copy() for _ in range(10)]
-
-    def before(self):
-        self.corpus = self.client.perftest.create_collection("corpus")
-
+class TestLargeDocBulkInsert(LargeDocInsertTest, unittest.TestCase):
     def do_task(self):
         self.corpus.insert_many(self.documents, ordered=True)
 
 
-class TestGridFsUpload(PerformanceTest, unittest.TestCase):
-    data_size = 52428800
-
+class GridFsTest(PerformanceTest):
     def setUp(self):
+        super().setUp()
         self.client = client_context.client
         self.client.drop_database("perftest")
 
@@ -343,44 +312,33 @@ class TestGridFsUpload(PerformanceTest, unittest.TestCase):
         )
         with open(gridfs_path, "rb") as data:
             self.document = data.read()
-
+        self.data_size = len(self.document)
         self.bucket = GridFSBucket(self.client.perftest)
 
     def tearDown(self):
         super().tearDown()
         self.client.drop_database("perftest")
 
+
+class TestGridFsUpload(GridFsTest, unittest.TestCase):
     def before(self):
+        # Create the bucket.
         self.bucket.upload_from_stream("init", b"x")
 
     def do_task(self):
         self.bucket.upload_from_stream("gridfstest", self.document)
 
 
-class TestGridFsDownload(PerformanceTest, unittest.TestCase):
-    data_size = 52428800
-
+class TestGridFsDownload(GridFsTest, unittest.TestCase):
     def setUp(self):
-        self.client = client_context.client
-        self.client.drop_database("perftest")
-
-        gridfs_path = os.path.join(
-            TEST_PATH, os.path.join("single_and_multi_document", "gridfs_large.bin")
-        )
-
-        self.bucket = GridFSBucket(self.client.perftest)
-        with open(gridfs_path, "rb") as gfile:
-            self.uploaded_id = self.bucket.upload_from_stream("gridfstest", gfile)
-
-    def tearDown(self):
-        super().tearDown()
-        self.client.drop_database("perftest")
+        super().setUp()
+        self.uploaded_id = self.bucket.upload_from_stream("gridfstest", self.document)
 
     def do_task(self):
         self.bucket.open_download_stream(self.uploaded_id).read()
 
 
-proc_client = None
+proc_client: Optional[MongoClient] = None
 
 
 def proc_init(*dummy):