PYTHON-3823 Audit benchmark data_size and calculate it dynamically where possible (#1439)

This commit is contained in:
Shane Harvey 2023-11-17 12:07:33 -08:00 committed by GitHub
parent fc220532df
commit ec35f7f76e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -21,7 +21,7 @@ import sys
import tempfile
import time
import warnings
from typing import Any, List
from typing import Any, List, Optional
try:
import simplejson as json
@ -70,9 +70,8 @@ class Timer:
class PerformanceTest:
dataset: Any
data_size: Any
do_task: Any
dataset: str
data_size: int
fail: Any
@classmethod
@ -87,7 +86,9 @@ class PerformanceTest:
name = self.__class__.__name__[4:]
median = self.percentile(50)
megabytes_per_sec = self.data_size / median / 1000000
print(f"Running {self.__class__.__name__}. MEDIAN={self.percentile(50)}")
print(
f"Running {self.__class__.__name__}. MB/s={megabytes_per_sec}, MEDIAN={self.percentile(50)}"
)
result_data.append(
{
"info": {
@ -105,6 +106,9 @@ class PerformanceTest:
def before(self):
pass
def do_task(self):
raise NotImplementedError
def after(self):
pass
@ -120,12 +124,13 @@ class PerformanceTest:
def runTest(self):
results = []
start = time.monotonic()
self.max_iterations = NUM_ITERATIONS
for i in range(NUM_ITERATIONS):
if time.monotonic() - start > MAX_ITERATION_TIME:
with warnings.catch_warnings():
warnings.simplefilter("default")
warnings.warn("Test timed out, completed %s iterations." % i)
warnings.warn(
f"Test timed out after {MAX_ITERATION_TIME}s, completed {i}/{NUM_ITERATIONS} iterations."
)
break
self.before()
with Timer() as timer:
@ -142,6 +147,7 @@ class BsonEncodingTest(PerformanceTest):
# Location of test data.
with open(os.path.join(TEST_PATH, os.path.join("extended_bson", self.dataset))) as data:
self.document = loads(data.read())
self.data_size = len(encode(self.document)) * NUM_DOCS
def do_task(self):
for _ in range(NUM_DOCS):
@ -154,6 +160,8 @@ class BsonDecodingTest(PerformanceTest):
with open(os.path.join(TEST_PATH, os.path.join("extended_bson", self.dataset))) as data:
self.document = encode(json.loads(data.read()))
self.data_size = len(self.document) * NUM_DOCS
def do_task(self):
for _ in range(NUM_DOCS):
decode(self.document)
@ -161,37 +169,31 @@ class BsonDecodingTest(PerformanceTest):
class TestFlatEncoding(BsonEncodingTest, unittest.TestCase):
dataset = "flat_bson.json"
data_size = 75310000
class TestFlatDecoding(BsonDecodingTest, unittest.TestCase):
dataset = "flat_bson.json"
data_size = 75310000
class TestDeepEncoding(BsonEncodingTest, unittest.TestCase):
dataset = "deep_bson.json"
data_size = 19640000
class TestDeepDecoding(BsonDecodingTest, unittest.TestCase):
dataset = "deep_bson.json"
data_size = 19640000
class TestFullEncoding(BsonEncodingTest, unittest.TestCase):
dataset = "full_bson.json"
data_size = 57340000
class TestFullDecoding(BsonDecodingTest, unittest.TestCase):
dataset = "full_bson.json"
data_size = 57340000
# SINGLE-DOC BENCHMARKS
class TestRunCommand(PerformanceTest, unittest.TestCase):
data_size = 160000
data_size = len(encode({"hello": True})) * NUM_DOCS
def setUp(self):
self.client = client_context.client
@ -200,7 +202,7 @@ class TestRunCommand(PerformanceTest, unittest.TestCase):
def do_task(self):
command = self.client.perftest.command
for _ in range(NUM_DOCS):
command("ping")
command("hello", True)
class TestDocument(PerformanceTest):
@ -225,23 +227,17 @@ class TestDocument(PerformanceTest):
self.client.perftest.drop_collection("corpus")
class TestFindOneByID(TestDocument, unittest.TestCase):
data_size = 16220000
class FindTest(TestDocument):
dataset = "tweet.json"
def setUp(self):
self.dataset = "tweet.json"
super().setUp()
self.data_size = len(encode(self.document)) * NUM_DOCS
documents = [self.document.copy() for _ in range(NUM_DOCS)]
self.corpus = self.client.perftest.corpus
result = self.corpus.insert_many(documents)
self.inserted_ids = result.inserted_ids
def do_task(self):
find_one = self.corpus.find_one
for _id in self.inserted_ids:
find_one({"_id": _id})
def before(self):
pass
@ -249,30 +245,40 @@ class TestFindOneByID(TestDocument, unittest.TestCase):
pass
class TestSmallDocInsertOne(TestDocument, unittest.TestCase):
data_size = 2750000
class TestFindOneByID(FindTest, unittest.TestCase):
def do_task(self):
find_one = self.corpus.find_one
for _id in self.inserted_ids:
find_one({"_id": _id})
class SmallDocInsertTest(TestDocument):
dataset = "small_doc.json"
def setUp(self):
self.dataset = "small_doc.json"
super().setUp()
self.data_size = len(encode(self.document)) * NUM_DOCS
self.documents = [self.document.copy() for _ in range(NUM_DOCS)]
class TestSmallDocInsertOne(SmallDocInsertTest, unittest.TestCase):
def do_task(self):
insert_one = self.corpus.insert_one
for doc in self.documents:
insert_one(doc)
class TestLargeDocInsertOne(TestDocument, unittest.TestCase):
data_size = 27310890
class LargeDocInsertTest(TestDocument):
dataset = "large_doc.json"
def setUp(self):
self.dataset = "large_doc.json"
super().setUp()
n_docs = 10
self.data_size = len(encode(self.document)) * n_docs
self.documents = [self.document.copy() for _ in range(n_docs)]
self.documents = [self.document.copy() for _ in range(10)]
class TestLargeDocInsertOne(LargeDocInsertTest, unittest.TestCase):
def do_task(self):
insert_one = self.corpus.insert_one
for doc in self.documents:
@ -280,61 +286,24 @@ class TestLargeDocInsertOne(TestDocument, unittest.TestCase):
# MULTI-DOC BENCHMARKS
class TestFindManyAndEmptyCursor(TestDocument, unittest.TestCase):
data_size = 16220000
def setUp(self):
self.dataset = "tweet.json"
super().setUp()
for _ in range(10):
self.client.perftest.command("insert", "corpus", documents=[self.document] * 1000)
self.corpus = self.client.perftest.corpus
class TestFindManyAndEmptyCursor(FindTest, unittest.TestCase):
def do_task(self):
list(self.corpus.find())
def before(self):
pass
def after(self):
pass
class TestSmallDocBulkInsert(TestDocument, unittest.TestCase):
data_size = 2750000
def setUp(self):
self.dataset = "small_doc.json"
super().setUp()
self.documents = [self.document.copy() for _ in range(NUM_DOCS)]
def before(self):
self.corpus = self.client.perftest.create_collection("corpus")
class TestSmallDocBulkInsert(SmallDocInsertTest, unittest.TestCase):
def do_task(self):
self.corpus.insert_many(self.documents, ordered=True)
class TestLargeDocBulkInsert(TestDocument, unittest.TestCase):
data_size = 27310890
def setUp(self):
self.dataset = "large_doc.json"
super().setUp()
self.documents = [self.document.copy() for _ in range(10)]
def before(self):
self.corpus = self.client.perftest.create_collection("corpus")
class TestLargeDocBulkInsert(LargeDocInsertTest, unittest.TestCase):
def do_task(self):
self.corpus.insert_many(self.documents, ordered=True)
class TestGridFsUpload(PerformanceTest, unittest.TestCase):
data_size = 52428800
class GridFsTest(PerformanceTest):
def setUp(self):
super().setUp()
self.client = client_context.client
self.client.drop_database("perftest")
@ -343,44 +312,33 @@ class TestGridFsUpload(PerformanceTest, unittest.TestCase):
)
with open(gridfs_path, "rb") as data:
self.document = data.read()
self.data_size = len(self.document)
self.bucket = GridFSBucket(self.client.perftest)
def tearDown(self):
super().tearDown()
self.client.drop_database("perftest")
class TestGridFsUpload(GridFsTest, unittest.TestCase):
def before(self):
# Create the bucket.
self.bucket.upload_from_stream("init", b"x")
def do_task(self):
self.bucket.upload_from_stream("gridfstest", self.document)
class TestGridFsDownload(PerformanceTest, unittest.TestCase):
data_size = 52428800
class TestGridFsDownload(GridFsTest, unittest.TestCase):
def setUp(self):
self.client = client_context.client
self.client.drop_database("perftest")
gridfs_path = os.path.join(
TEST_PATH, os.path.join("single_and_multi_document", "gridfs_large.bin")
)
self.bucket = GridFSBucket(self.client.perftest)
with open(gridfs_path, "rb") as gfile:
self.uploaded_id = self.bucket.upload_from_stream("gridfstest", gfile)
def tearDown(self):
super().tearDown()
self.client.drop_database("perftest")
super().setUp()
self.uploaded_id = self.bucket.upload_from_stream("gridfstest", self.document)
def do_task(self):
self.bucket.open_download_stream(self.uploaded_id).read()
proc_client = None
proc_client: Optional[MongoClient] = None
def proc_init(*dummy):