mongo/buildscripts/cost_model/join_calibration_settings.py
Max Verbinnen 0285d3c416 SERVER-125617 Enable path arrayness for calibration suite (#52748)
GitOrigin-RevId: fe0e214a00a15bb6992bda16a21a022a65fb2401
2026-04-28 19:12:15 +00:00

154 lines
5.9 KiB
Python

# Copyright (C) 2026-present MongoDB, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the Server Side Public License, version 1,
# as published by MongoDB, Inc.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Server Side Public License for more details.
#
# You should have received a copy of the Server Side Public License
# along with this program. If not, see
# <http://www.mongodb.com/licensing/server-side-public-license>.
#
# As a special exception, the copyright holders give permission to link the
# code of portions of this program with the OpenSSL library under certain
# conditions as described in each individual source file and distribute
# linked combinations including the program with the OpenSSL library. You
# must comply with the Server Side Public License in all respects for
# all of the code used other than as permitted herein. If you modify file(s)
# with this exception, you may extend this exception to your version of the
# file(s), but you are not obligated to do so. If you do not wish to do so,
# delete this exception statement from your version. If you delete this
# exception statement from all source files in the program, then also delete
# it in the license file.
#
"""Configuration for Join Cost Model Calibration.

Four collections (join_coll_1, join_coll_2, and their *_large variants, which
use a longer string_filler) share an identical schema:
- unique: Sequential unique integers in [1, COLLECTION_CARDINALITY]
- random: Random unique integers in [1, COLLECTION_CARDINALITY]
- uniform_16: Uniform random integers in [1, 16]
- uniform_256: Uniform random integers in [1, 256]
- uniform_4k: Uniform random integers in [1, 4096]
- uniform_64k: Uniform random integers in [1, 65536]
- string_filler: Random string to increase document size

Single-field indexes (on join_coll_2 and join_coll_2_large only, enabling INLJ plans):
{unique: 1}, {uniform_16: 1}, {uniform_256: 1}, {uniform_4k: 1}, {uniform_64k: 1}

Every collection template additionally declares one compound index that starts
with a "dummy" field (not one of the generated fields above); it is needed for
multikeyness info.
"""
from __future__ import annotations
import os
import config
from random_generator import DataType, RandomDistribution, RangeGenerator, StringRandomDistribution
COLLECTION_CARDINALITY = 100_000
DEFAULT_STRING_FILLER_LENGTH = 900
LARGE_STRING_FILLER_LENGTH = 3_000
def create_join_collection_template(
    name: str,
    string_filler_length: int = DEFAULT_STRING_FILLER_LENGTH,
    create_indexes: bool = False,
) -> config.CollectionTemplate:
    """Build the collection template shared by the join calibration collections.

    Args:
        name: Name given to the resulting collection template.
        string_filler_length: Length handed to the ``string_filler`` field's
            string distribution.
        create_indexes: Value of the ``indexed`` flag on ``unique`` and on each
            ``uniform_*`` field. ``random`` and ``string_filler`` are always
            created with ``indexed=False``.

    Returns:
        A ``config.CollectionTemplate`` whose only cardinality is
        ``COLLECTION_CARDINALITY``.
    """

    def int_range(last_value: int) -> RangeGenerator:
        # The generator is given last_value + 1 as its end argument, matching the
        # [1, last_value] ranges described in the module docstring.
        return RangeGenerator(DataType.INTEGER, 1, last_value + 1)

    # (field name, largest value) for the uniformly distributed integer fields.
    uniform_specs = (
        ("uniform_16", 16),
        ("uniform_256", 256),
        ("uniform_4k", 4096),
        ("uniform_64k", 65536),
    )

    field_templates = [
        config.FieldTemplate(
            name="unique",
            data_type=DataType.INTEGER,
            distribution=RandomDistribution.sequential(int_range(COLLECTION_CARDINALITY)),
            indexed=create_indexes,
        ),
        config.FieldTemplate(
            name="random",
            data_type=DataType.INTEGER,
            distribution=RandomDistribution.permutation(int_range(COLLECTION_CARDINALITY)),
            indexed=False,
        ),
    ]

    for field_name, largest in uniform_specs:
        field_templates.append(
            config.FieldTemplate(
                name=field_name,
                data_type=DataType.INTEGER,
                distribution=RandomDistribution.uniform(int_range(largest)),
                indexed=create_indexes,
            )
        )

    field_templates.append(
        config.FieldTemplate(
            name="string_filler",
            data_type=DataType.STRING,
            distribution=StringRandomDistribution(string_filler_length, pool_size=1000),
            indexed=False,
        )
    )

    # Need an index for multikeyness info
    multikeyness_index = [
        "dummy",
        "unique",
        "random",
        "uniform_16",
        "uniform_256",
        "uniform_4k",
        "uniform_64k",
    ]

    return config.CollectionTemplate(
        name=name,
        fields=field_templates,
        compound_indexes=[multikeyness_index],
        cardinalities=[COLLECTION_CARDINALITY],
    )
# Collections whose individual fields are not flagged as indexed.
join_coll_1 = create_join_collection_template("join_coll_1", create_indexes=False)
join_coll_1_large = create_join_collection_template(
    "join_coll_1_large",
    string_filler_length=LARGE_STRING_FILLER_LENGTH,
    create_indexes=False,
)

# Collections whose `unique` and `uniform_*` fields are flagged as indexed.
join_coll_2 = create_join_collection_template("join_coll_2", create_indexes=True)
join_coll_2_large = create_join_collection_template(
    "join_coll_2_large",
    string_filler_length=LARGE_STRING_FILLER_LENGTH,
    create_indexes=True,
)
# Database settings for join calibration. The connection string is read from the
# MONGODB_URI environment variable and falls back to "mongodb://localhost".
join_database = config.DatabaseConfig(
    database_name="join_calibration",
    connection_string=os.environ.get("MONGODB_URI", "mongodb://localhost"),
    restore_from_dump=config.RestoreMode.NEVER,
    dump_on_exit=False,
    dump_path="~/mongo/buildscripts/cost_model",
)
# Data generation settings covering all four join calibration collections.
join_data_generator = config.DataGeneratorConfig(
    collection_templates=[join_coll_1, join_coll_1_large, join_coll_2, join_coll_2_large],
    collection_name_with_card=False,
    write_mode=config.WriteMode.REPLACE,
    # The batch size is set to the full collection cardinality.
    batch_size=COLLECTION_CARDINALITY,
    create_indexes=True,
    enabled=True,
)