mongo-python-driver/test/performance/perf_test.py

# Copyright 2015 MongoDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for the MongoDB Driver Performance Benchmarking Spec."""

import multiprocessing as mp
import os
import sys
import tempfile
import time
import warnings
from typing import Any, List

try:
    import simplejson as json
except ImportError:
    import json  # type: ignore[no-redef]

sys.path[0:0] = [""]

from test import client_context, host, port, unittest

from bson import decode, encode
from bson.json_util import loads
from gridfs import GridFSBucket
from pymongo import MongoClient

NUM_ITERATIONS = 100
MAX_ITERATION_TIME = 300
NUM_DOCS = 10000

TEST_PATH = os.environ.get(
    "TEST_PATH", os.path.join(os.path.dirname(os.path.realpath(__file__)), os.path.join("data"))
)

OUTPUT_FILE = os.environ.get("OUTPUT_FILE")

result_data: List = []


def tearDownModule():
    output = json.dumps(result_data, indent=4)
    if OUTPUT_FILE:
        with open(OUTPUT_FILE, "w") as opf:
            opf.write(output)
    else:
        print(output)


class Timer:
    def __enter__(self):
        self.start = time.monotonic()
        return self

    def __exit__(self, *args):
        self.end = time.monotonic()
        self.interval = self.end - self.start


class PerformanceTest:
    dataset: Any
    data_size: Any
    do_task: Any
    fail: Any

    @classmethod
    def setUpClass(cls):
        client_context.init()

    def setUp(self):
        pass

    def tearDown(self):
        name = self.__class__.__name__
        median = self.percentile(50)
        bytes_per_sec = self.data_size / median
        print(f"Running {self.__class__.__name__}. MEDIAN={self.percentile(50)}")
        result_data.append(
            {
                "info": {
                    "test_name": name,
                    "args": {
                        "threads": 1,
                    },
                },
                "metrics": [
                    {"name": "bytes_per_sec", "value": bytes_per_sec},
                ],
            }
        )

    def before(self):
        pass

    def after(self):
        pass

    def percentile(self, percentile):
        if hasattr(self, "results"):
            sorted_results = sorted(self.results)
            percentile_index = int(len(sorted_results) * percentile / 100) - 1
            return sorted_results[percentile_index]
        else:
            self.fail("Test execution failed")
            return None

    def runTest(self):
        results = []
        start = time.monotonic()
        self.max_iterations = NUM_ITERATIONS
        for i in range(NUM_ITERATIONS):
            if time.monotonic() - start > MAX_ITERATION_TIME:
                warnings.warn("Test timed out, completed %s iterations." % i)
                break
            self.before()
            with Timer() as timer:
                self.do_task()
            self.after()
            results.append(timer.interval)

        self.results = results


# BSON MICRO-BENCHMARKS
class BsonEncodingTest(PerformanceTest):
    def setUp(self):
        # Location of test data.
        with open(os.path.join(TEST_PATH, os.path.join("extended_bson", self.dataset))) as data:
            self.document = loads(data.read())

    def do_task(self):
        for _ in range(NUM_DOCS):
            encode(self.document)


class BsonDecodingTest(PerformanceTest):
    def setUp(self):
        # Location of test data.
        with open(os.path.join(TEST_PATH, os.path.join("extended_bson", self.dataset))) as data:
            self.document = encode(json.loads(data.read()))

    def do_task(self):
        for _ in range(NUM_DOCS):
            decode(self.document)


class TestFlatEncoding(BsonEncodingTest, unittest.TestCase):
    dataset = "flat_bson.json"
    data_size = 75310000


class TestFlatDecoding(BsonDecodingTest, unittest.TestCase):
    dataset = "flat_bson.json"
    data_size = 75310000


class TestDeepEncoding(BsonEncodingTest, unittest.TestCase):
    dataset = "deep_bson.json"
    data_size = 19640000


class TestDeepDecoding(BsonDecodingTest, unittest.TestCase):
    dataset = "deep_bson.json"
    data_size = 19640000


class TestFullEncoding(BsonEncodingTest, unittest.TestCase):
    dataset = "full_bson.json"
    data_size = 57340000


class TestFullDecoding(BsonDecodingTest, unittest.TestCase):
    dataset = "full_bson.json"
    data_size = 57340000


# SINGLE-DOC BENCHMARKS
class TestRunCommand(PerformanceTest, unittest.TestCase):
    data_size = 160000

    def setUp(self):
        self.client = client_context.client
        self.client.drop_database("perftest")

    def do_task(self):
        command = self.client.perftest.command
        for _ in range(NUM_DOCS):
            command("ping")


class TestDocument(PerformanceTest):
    def setUp(self):
        # Location of test data.
        with open(
            os.path.join(TEST_PATH, os.path.join("single_and_multi_document", self.dataset))
        ) as data:
            self.document = json.loads(data.read())

        self.client = client_context.client
        self.client.drop_database("perftest")

    def tearDown(self):
        super().tearDown()
        self.client.drop_database("perftest")

    def before(self):
        self.corpus = self.client.perftest.create_collection("corpus")

    def after(self):
        self.client.perftest.drop_collection("corpus")


class TestFindOneByID(TestDocument, unittest.TestCase):
    data_size = 16220000

    def setUp(self):
        self.dataset = "tweet.json"
        super().setUp()

        documents = [self.document.copy() for _ in range(NUM_DOCS)]
        self.corpus = self.client.perftest.corpus
        result = self.corpus.insert_many(documents)
        self.inserted_ids = result.inserted_ids

    def do_task(self):
        find_one = self.corpus.find_one
        for _id in self.inserted_ids:
            find_one({"_id": _id})

    def before(self):
        pass

    def after(self):
        pass


class TestSmallDocInsertOne(TestDocument, unittest.TestCase):
    data_size = 2750000

    def setUp(self):
        self.dataset = "small_doc.json"
        super().setUp()

        self.documents = [self.document.copy() for _ in range(NUM_DOCS)]

    def do_task(self):
        insert_one = self.corpus.insert_one
        for doc in self.documents:
            insert_one(doc)


class TestLargeDocInsertOne(TestDocument, unittest.TestCase):
    data_size = 27310890

    def setUp(self):
        self.dataset = "large_doc.json"
        super().setUp()

        self.documents = [self.document.copy() for _ in range(10)]

    def do_task(self):
        insert_one = self.corpus.insert_one
        for doc in self.documents:
            insert_one(doc)


# MULTI-DOC BENCHMARKS
class TestFindManyAndEmptyCursor(TestDocument, unittest.TestCase):
    data_size = 16220000

    def setUp(self):
        self.dataset = "tweet.json"
        super().setUp()

        for _ in range(10):
            self.client.perftest.command("insert", "corpus", documents=[self.document] * 1000)
        self.corpus = self.client.perftest.corpus

    def do_task(self):
        list(self.corpus.find())

    def before(self):
        pass

    def after(self):
        pass


class TestSmallDocBulkInsert(TestDocument, unittest.TestCase):
    data_size = 2750000

    def setUp(self):
        self.dataset = "small_doc.json"
        super().setUp()
        self.documents = [self.document.copy() for _ in range(NUM_DOCS)]

    def before(self):
        self.corpus = self.client.perftest.create_collection("corpus")

    def do_task(self):
        self.corpus.insert_many(self.documents, ordered=True)


class TestLargeDocBulkInsert(TestDocument, unittest.TestCase):
    data_size = 27310890

    def setUp(self):
        self.dataset = "large_doc.json"
        super().setUp()
        self.documents = [self.document.copy() for _ in range(10)]

    def before(self):
        self.corpus = self.client.perftest.create_collection("corpus")

    def do_task(self):
        self.corpus.insert_many(self.documents, ordered=True)


class TestGridFsUpload(PerformanceTest, unittest.TestCase):
    data_size = 52428800

    def setUp(self):
        self.client = client_context.client
        self.client.drop_database("perftest")

        gridfs_path = os.path.join(
            TEST_PATH, os.path.join("single_and_multi_document", "gridfs_large.bin")
        )
        with open(gridfs_path, "rb") as data:
            self.document = data.read()

        self.bucket = GridFSBucket(self.client.perftest)

    def tearDown(self):
        super().tearDown()
        self.client.drop_database("perftest")

    def before(self):
        self.bucket.upload_from_stream("init", b"x")

    def do_task(self):
        self.bucket.upload_from_stream("gridfstest", self.document)


class TestGridFsDownload(PerformanceTest, unittest.TestCase):
    data_size = 52428800

    def setUp(self):
        self.client = client_context.client
        self.client.drop_database("perftest")

        gridfs_path = os.path.join(
            TEST_PATH, os.path.join("single_and_multi_document", "gridfs_large.bin")
        )

        self.bucket = GridFSBucket(self.client.perftest)
        with open(gridfs_path, "rb") as gfile:
            self.uploaded_id = self.bucket.upload_from_stream("gridfstest", gfile)

    def tearDown(self):
        super().tearDown()
        self.client.drop_database("perftest")

    def do_task(self):
        self.bucket.open_download_stream(self.uploaded_id).read()


proc_client = None


def proc_init(*dummy):
    global proc_client
    proc_client = MongoClient(host, port)


# PARALLEL BENCHMARKS
def mp_map(map_func, files):
    pool = mp.Pool(initializer=proc_init)
    pool.map(map_func, files)
    pool.close()


def insert_json_file(filename):
    assert proc_client is not None
    with open(filename) as data:
        coll = proc_client.perftest.corpus
        coll.insert_many([json.loads(line) for line in data])


def insert_json_file_with_file_id(filename):
    documents = []
    with open(filename) as data:
        for line in data:
            doc = json.loads(line)
            doc["file"] = filename
            documents.append(doc)
    assert proc_client is not None
    coll = proc_client.perftest.corpus
    coll.insert_many(documents)


def read_json_file(filename):
    assert proc_client is not None
    coll = proc_client.perftest.corpus
    temp = tempfile.TemporaryFile(mode="w")
    try:
        temp.writelines(
            [json.dumps(doc) + "\n" for doc in coll.find({"file": filename}, {"_id": False})]
        )
    finally:
        temp.close()


def insert_gridfs_file(filename):
    assert proc_client is not None
    bucket = GridFSBucket(proc_client.perftest)

    with open(filename, "rb") as gfile:
        bucket.upload_from_stream(filename, gfile)


def read_gridfs_file(filename):
    assert proc_client is not None
    bucket = GridFSBucket(proc_client.perftest)

    temp = tempfile.TemporaryFile()
    try:
        bucket.download_to_stream_by_name(filename, temp)
    finally:
        temp.close()


class TestJsonMultiImport(PerformanceTest, unittest.TestCase):
    data_size = 565000000

    def setUp(self):
        self.client = client_context.client
        self.client.drop_database("perftest")

    def before(self):
        self.client.perftest.command({"create": "corpus"})
        self.corpus = self.client.perftest.corpus

        ldjson_path = os.path.join(TEST_PATH, os.path.join("parallel", "ldjson_multi"))
        self.files = [os.path.join(ldjson_path, s) for s in os.listdir(ldjson_path)]

    def do_task(self):
        mp_map(insert_json_file, self.files)

    def after(self):
        self.client.perftest.drop_collection("corpus")

    def tearDown(self):
        super().tearDown()
        self.client.drop_database("perftest")


class TestJsonMultiExport(PerformanceTest, unittest.TestCase):
    data_size = 565000000

    def setUp(self):
        self.client = client_context.client
        self.client.drop_database("perftest")
        self.client.perfest.corpus.create_index("file")

        ldjson_path = os.path.join(TEST_PATH, os.path.join("parallel", "ldjson_multi"))
        self.files = [os.path.join(ldjson_path, s) for s in os.listdir(ldjson_path)]

        mp_map(insert_json_file_with_file_id, self.files)

    def do_task(self):
        mp_map(read_json_file, self.files)

    def tearDown(self):
        super().tearDown()
        self.client.drop_database("perftest")


class TestGridFsMultiFileUpload(PerformanceTest, unittest.TestCase):
    data_size = 262144000

    def setUp(self):
        self.client = client_context.client
        self.client.drop_database("perftest")

    def before(self):
        self.client.perftest.drop_collection("fs.files")
        self.client.perftest.drop_collection("fs.chunks")

        self.bucket = GridFSBucket(self.client.perftest)
        gridfs_path = os.path.join(TEST_PATH, os.path.join("parallel", "gridfs_multi"))
        self.files = [os.path.join(gridfs_path, s) for s in os.listdir(gridfs_path)]

    def do_task(self):
        mp_map(insert_gridfs_file, self.files)

    def tearDown(self):
        super().tearDown()
        self.client.drop_database("perftest")


class TestGridFsMultiFileDownload(PerformanceTest, unittest.TestCase):
    data_size = 262144000

    def setUp(self):
        self.client = client_context.client
        self.client.drop_database("perftest")

        bucket = GridFSBucket(self.client.perftest)

        gridfs_path = os.path.join(TEST_PATH, os.path.join("parallel", "gridfs_multi"))
        self.files = [os.path.join(gridfs_path, s) for s in os.listdir(gridfs_path)]

        for fname in self.files:
            with open(fname, "rb") as gfile:
                bucket.upload_from_stream(fname, gfile)

    def do_task(self):
        mp_map(read_gridfs_file, self.files)

    def tearDown(self):
        super().tearDown()
        self.client.drop_database("perftest")


if __name__ == "__main__":
    unittest.main()