def preorder(node):
    """Return a preorder traversal (node, child 1 ... child n) of a QSN/execution tree.

    For example, a query with an OR over 3 indices could turn into a tree rooted
    at a FETCH, which has a single OR child, which in turn has 3 index scan
    children. For that tree the result is [FETCH, OR, IXSCAN1, IXSCAN2, IXSCAN3].

    Args:
        node: Root of the tree. Every node must expose a ``children`` attribute
            that iterates over its child nodes in order.

    Returns:
        A new list with the root first, followed by each subtree in child order.
    """
    res = []
    # An explicit stack keeps the traversal linear in the number of nodes and
    # independent of the interpreter recursion limit on deep trees.
    pending = [node]
    while pending:
        current = pending.pop()
        res.append(current)
        # Push children right-to-left so the leftmost child is visited first.
        pending.extend(reversed(list(current.children)))
    return res
@dataclass
class Node:
    """Represent Classic Execution Node.

    One instance is built per stage of the ``executionStages`` tree of a query
    explain; ``children`` holds the nodes built from the stage's
    ``inputStage`` / ``inputStages``.
    """

    # Stage name as reported by explain, e.g. "FETCH" or "IXSCAN".
    stage: str
    # Value of the stage's "executionTimeNanos" field. get_execution_time()
    # treats it as including the time spent in the children.
    execution_time_nanoseconds: int
    # Value of the stage's "nReturned" field.
    n_returned: int
    # Stage-specific work counter; which explain field it comes from depends on
    # the processor that built the node (docsExamined, keysExamined, nReturned...).
    n_processed: int
    # Value of the stage's "seeks" field, or None when the stage does not report it.
    seeks: Optional[int]
    # Child nodes, in the order they appear in the explain output.
    children: list[Node]

    def get_execution_time(self):
        """Execution time of this node without execution time of its children"""
        return self.execution_time_nanoseconds - sum(
            n.execution_time_nanoseconds for n in self.children
        )

    def print(self, level=0):
        """Pretty print the execution tree, indenting each level with '| '."""
        print(
            f'{"| " * level}{self.stage}, totalExecutionTime: {self.execution_time_nanoseconds:,}ns, seeks: {self.seeks}, nReturned: {self.n_returned}, nProcessed: {self.n_processed}'
        )
        for child in self.children:
            child.print(level + 1)


def build_execution_tree(execution_stats: dict[str, Any]) -> Node:
    """Build Classic execution tree from 'executionStats' field of query explain.

    Raises:
        AssertionError: if the explain reports an unsuccessful execution.
        ValueError: if the tree contains a stage without a registered processor.
    """
    assert execution_stats["executionSuccess"], "explain reports an unsuccessful execution"
    return process_stage(execution_stats["executionStages"])


def process_stage(stage: dict[str, Any]) -> Node:
    """Parse the given execution stage (and, recursively, its input stages).

    Raises:
        ValueError: if the stage name is missing or has no registered processor.
    """
    # .get() so that a stage document without a "stage" key is reported through
    # the same "Unknown stage" path instead of escaping as a bare KeyError.
    processor = _STAGE_PROCESSORS.get(stage.get("stage"))
    if processor is None:
        print(json.dumps(stage, indent=4))
        raise ValueError(f"Unknown stage: {stage}")

    return processor(stage)


def process_subplan(stage: dict[str, Any]) -> Node:
    """Process SUBPLAN stage: one input stage, n_processed is 'nReturned'."""
    input_stage = process_stage(stage["inputStage"])
    return Node(**get_common_fields(stage), n_processed=stage["nReturned"], children=[input_stage])


def process_collscan(stage: dict[str, Any]) -> Node:
    """Process COLLSCAN leaf stage: n_processed is 'docsExamined'."""
    return Node(**get_common_fields(stage), n_processed=stage["docsExamined"], children=[])


def process_ixscan(stage: dict[str, Any]) -> Node:
    """Process IXSCAN leaf stage: n_processed is 'keysExamined'."""
    return Node(**get_common_fields(stage), n_processed=stage["keysExamined"], children=[])


def process_fetch(stage: dict[str, Any]) -> Node:
    """Process FETCH stage: one input stage, n_processed is 'docsExamined'."""
    input_stage = process_stage(stage["inputStage"])
    return Node(
        **get_common_fields(stage), n_processed=stage["docsExamined"], children=[input_stage]
    )


def process_or(stage: dict[str, Any]) -> Node:
    """Process OR stage: many input stages, n_processed is 'nReturned'."""
    children = [process_stage(child) for child in stage["inputStages"]]
    return Node(**get_common_fields(stage), n_processed=stage["nReturned"], children=children)


def process_intersection(stage: dict[str, Any]) -> Node:
    """Process AND_HASH / AND_SORTED stage: n_processed is the sum over its children."""
    children = [process_stage(child) for child in stage["inputStages"]]
    n_processed = sum(child.n_processed for child in children)
    return Node(**get_common_fields(stage), n_processed=n_processed, children=children)


def process_mergesort(stage: dict[str, Any]) -> Node:
    """Process MERGE_SORT / SORT_MERGE stage: many input stages, n_processed is 'nReturned'."""
    children = [process_stage(child) for child in stage["inputStages"]]
    return Node(**get_common_fields(stage), n_processed=stage["nReturned"], children=children)


def process_sort(stage: dict[str, Any]) -> Node:
    """Process SORT stage: one input stage, n_processed is 'nReturned'."""
    input_stage = process_stage(stage["inputStage"])
    return Node(**get_common_fields(stage), n_processed=stage["nReturned"], children=[input_stage])


def process_limitskip(stage: dict[str, Any]) -> Node:
    """Process LIMIT / SKIP stage: n_processed is taken from its single input stage."""
    input_stage = process_stage(stage["inputStage"])
    return Node(
        **get_common_fields(stage), n_processed=input_stage.n_processed, children=[input_stage]
    )


def get_common_fields(json_stage: dict[str, Any]) -> dict[str, Any]:
    """Extract common fields from classic nodes.

    'seeks' is optional in the explain output and maps to None when absent; the
    other three fields are required and raise KeyError when missing.
    """
    return {
        "stage": json_stage["stage"],
        "execution_time_nanoseconds": json_stage["executionTimeNanos"],
        "n_returned": json_stage["nReturned"],
        "seeks": json_stage.get("seeks"),
    }


# Dispatch table from explain stage name to its processor. Built once at import
# time (after the processors are defined) instead of on every recursive
# process_stage() call.
_STAGE_PROCESSORS = {
    "SUBPLAN": process_subplan,
    "COLLSCAN": process_collscan,
    "IXSCAN": process_ixscan,
    "FETCH": process_fetch,
    "AND_HASH": process_intersection,
    "AND_SORTED": process_intersection,
    "OR": process_or,
    "MERGE_SORT": process_mergesort,
    "SORT_MERGE": process_mergesort,
    "SORT": process_sort,
    "LIMIT": process_limitskip,
    "SKIP": process_limitskip,
}
dict[str, any]) -> Node: +def process_filter(stage: dict[str, Any]) -> Node: """Process filter stage.""" input_stage = process_stage(stage["inputStage"]) return Node(**get_common_fields(stage), n_processed=stage["numTested"], children=[input_stage]) -def process_traverse(stage: dict[str, any]) -> Node: +def process_traverse(stage: dict[str, Any]) -> Node: """Process traverse""" outer_stage = process_stage(stage["outerStage"]) inner_stage = process_stage(stage["innerStage"]) @@ -117,7 +117,7 @@ def process_traverse(stage: dict[str, any]) -> Node: ) -def process_hash_join_node(stage: dict[str, any]) -> Node: +def process_hash_join_node(stage: dict[str, Any]) -> Node: """Process hj node.""" outer_stage = process_stage(stage["outerStage"]) inner_stage = process_stage(stage["innerStage"]) @@ -127,7 +127,7 @@ def process_hash_join_node(stage: dict[str, any]) -> Node: ) -def process_nlj(stage: dict[str, any]) -> Node: +def process_nlj(stage: dict[str, Any]) -> Node: """Process nlj stage.""" outer_stage = process_stage(stage["outerStage"]) inner_stage = process_stage(stage["innerStage"]) @@ -137,7 +137,7 @@ def process_nlj(stage: dict[str, any]) -> Node: ) -def process_inner_node(stage: dict[str, any]) -> Node: +def process_inner_node(stage: dict[str, Any]) -> Node: """Process SBE stage with one input stage.""" input_stage = process_stage(stage["inputStage"]) return Node( @@ -145,23 +145,23 @@ def process_inner_node(stage: dict[str, any]) -> Node: ) -def process_leaf_node(stage: dict[str, any]) -> Node: +def process_leaf_node(stage: dict[str, Any]) -> Node: """Process SBE stage without input stages.""" return Node(**get_common_fields(stage), n_processed=stage["nReturned"], children=[]) -def process_seek(stage: dict[str, any]) -> Node: +def process_seek(stage: dict[str, Any]) -> Node: """Process seek stage.""" return Node(**get_common_fields(stage), n_processed=stage["numReads"], children=[]) -def process_union_node(stage: dict[str, any]) -> Node: +def 
process_union_node(stage: dict[str, Any]) -> Node: """Process union stage.""" children = [process_stage(child) for child in stage["inputStages"]] return Node(**get_common_fields(stage), n_processed=stage["nReturned"], children=children) -def process_unwind_node(stage: dict[str, any]) -> Node: +def process_unwind_node(stage: dict[str, Any]) -> Node: """Process unwind stage.""" input_stage = process_stage(stage["inputStage"]) return Node( @@ -169,14 +169,14 @@ def process_unwind_node(stage: dict[str, any]) -> Node: ) -def process_unique_node(stage: dict[str, any]) -> Node: +def process_unique_node(stage: dict[str, Any]) -> Node: """Process unique stage.""" input_stage = process_stage(stage["inputStage"]) n_processed = stage["dupsTested"] return Node(**get_common_fields(stage), n_processed=n_processed, children=[input_stage]) -def process_branch_node(stage: dict[str, any]) -> Node: +def process_branch_node(stage: dict[str, Any]) -> Node: """Process unique stage.""" then_stage = process_stage(stage["thenStage"]) else_stage = process_stage(stage["elseStage"]) @@ -186,8 +186,8 @@ def process_branch_node(stage: dict[str, any]) -> Node: ) -def get_common_fields(json_stage: dict[str, any]) -> dict[str, any]: - """Exctract common field from json representation of SBE stage.""" +def get_common_fields(json_stage: dict[str, Any]) -> dict[str, Any]: + """Extract common field from json representation of SBE stage.""" return { "stage": json_stage["stage"], "plan_node_id": json_stage["planNodeId"], diff --git a/buildscripts/cost_model/experiment.py b/buildscripts/cost_model/experiment.py index 27d2c40678c..aae5aead4b4 100644 --- a/buildscripts/cost_model/experiment.py +++ b/buildscripts/cost_model/experiment.py @@ -98,7 +98,7 @@ from __future__ import annotations import dataclasses import bson.json_util as json -import execution_tree as sbe +import execution_tree_sbe as sbe import pandas as pd import physical_tree as abt import seaborn as sns diff --git 
a/buildscripts/cost_model/parameters_extractor.py b/buildscripts/cost_model/parameters_extractor.py index 3c76e34092e..446d2d6350c 100644 --- a/buildscripts/cost_model/parameters_extractor.py +++ b/buildscripts/cost_model/parameters_extractor.py @@ -33,7 +33,7 @@ from collections import defaultdict, deque from typing import Callable, Mapping, Sequence, TypeVar import bson.json_util as json -import execution_tree +import execution_tree_sbe import physical_tree from config import AbtCalibratorConfig from cost_estimator import CostModelParameters, ExecutionStats @@ -87,9 +87,9 @@ def find_nodes(root: Node, predicate: Callable[[Node], bool]) -> list[Node]: return result -def get_excution_stats(root: execution_tree.Node, node_id: int) -> ExecutionStats: +def get_excution_stats(root: execution_tree_sbe.Node, node_id: int) -> ExecutionStats: """Extract execution stats from the given Execution Tree for the ABT node defined with the given node_id.""" - queue: deque[execution_tree.Node] = deque() + queue: deque[execution_tree_sbe.Node] = deque() queue.append(root) execution_time: int = 0 @@ -120,7 +120,7 @@ def parse_explain(explain: Mapping[str, any], abt_types: Sequence[str]): """Extract ExecutionStats from the given explain for the given ABT types.""" try: - et = execution_tree.build_execution_tree(explain["executionStats"]) + et = execution_tree_sbe.build_execution_tree(explain["executionStats"]) pt = physical_tree.build(explain["queryPlanner"]["winningPlan"]["queryPlan"]) except Exception as exception: print(f"*** Failed to parse explain with the followinf error: {exception}") @@ -131,7 +131,7 @@ def parse_explain(explain: Mapping[str, any], abt_types: Sequence[str]): def extract_execution_stats( - et: execution_tree.Node, pt: physical_tree.Node, abt_types: Sequence[str] + et: execution_tree_sbe.Node, pt: physical_tree.Node, abt_types: Sequence[str] ) -> Mapping[str, Sequence[ExecutionStats]]: """Extract ExecutionStats from the given SBE and ABT trees for the given ABT 
types.""" diff --git a/buildscripts/cost_model/qsn_costing_parameters.py b/buildscripts/cost_model/qsn_costing_parameters.py index bae23f34d5a..a17d88a6822 100644 --- a/buildscripts/cost_model/qsn_costing_parameters.py +++ b/buildscripts/cost_model/qsn_costing_parameters.py @@ -27,9 +27,10 @@ # """Prepare parameters for QSN cost calibration.""" -from typing import Callable, Optional, TypeVar +from typing import Any, Callable, Optional, TypeVar -import execution_tree as sbe +import execution_tree_classic as classic +import execution_tree_sbe as sbe import pandas as pd import query_solution_tree as qsn from workload_execution import QueryParameters @@ -37,12 +38,18 @@ from workload_execution import QueryParameters Node = TypeVar("Node") -def parse_explain(explain: dict[str, any]) -> (qsn.Node, sbe.Node): - qsn_tree = qsn.build(explain["queryPlanner"]["winningPlan"]["queryPlan"]) +def parse_explain_sbe(explain: dict[str, Any]) -> (qsn.Node, sbe.Node): + qsn_tree = qsn.build(explain["queryPlanner"]["winningPlan"]) sbe_tree = sbe.build_execution_tree(explain["executionStats"]) return (qsn_tree, sbe_tree) +def parse_explain_classic(explain: dict[str, Any]) -> (qsn.Node, classic.Node): + qsn_tree = qsn.build(explain["queryPlanner"]["winningPlan"]) + exec_tree = classic.build_execution_tree(explain["executionStats"]) + return (qsn_tree, exec_tree) + + def find_first_node(root: Node, predicate: Callable[[Node], bool]) -> Optional[Node]: """Find the first node in the given tree which satisfy the predicate.""" if predicate(root): @@ -69,7 +76,67 @@ def find_nodes(root: Node, predicate: Callable[[Node], bool]) -> list[Node]: return result -class ParametersBuilder: +class ParametersBuilderClassic: + """Prepare data for calibration from explain outputs.""" + + def __init__(self): + self.rows = [] + + def process(self, explain: dict[str, Any], params: QueryParameters): + qsn_tree, classic_tree = parse_explain_classic(explain) + self._process(qsn_tree, classic_tree, params) + + 
def buildDataFrame(self) -> pd.DataFrame: + return pd.DataFrame( + self.rows, + columns=[ + "stage", + "execution_time", + "n_processed", + "seeks", + "note", + "keys_length_in_bytes", + "average_document_size_in_bytes", + "number_of_fields", + ], + ) + + def _process(self, qsn_node: qsn.Node, node: classic.Node, params: QueryParameters): + self.rows.append(self._process_generic(qsn_node.node_type, node, params)) + # Set strict to true, as these should always have the same tree structure. + for qsn_child, classic_child in zip(qsn_node.children, node.children, strict=True): + self._process(qsn_child, classic_child, params) + + def _process_generic(self, stage: str, node: classic.Node, params: QueryParameters): + return ParametersBuilderClassic._build_row( + stage, + params, + execution_time=node.execution_time_nanoseconds, + n_processed=node.n_processed, + seeks=node.seeks, + ) + + @staticmethod + def _build_row( + stage: str, + params: QueryParameters, + execution_time: int = None, + n_processed: int = None, + seeks: int = None, + ): + return [ + stage, + execution_time, + n_processed, + seeks, + params.note, + params.keys_length_in_bytes, + params.average_document_size_in_bytes, + params.number_of_fields, + ] + + +class ParametersBuilderSBE: """Prepare data for calibration from explain outputs.""" def __init__(self): @@ -77,8 +144,8 @@ class ParametersBuilder: self.default_processor = self._process_generic self.rows = [] - def process(self, explain: dict[str, any], params: QueryParameters): - qsn_tree, sbe_tree = parse_explain(explain) + def process(self, explain: dict[str, Any], params: QueryParameters): + qsn_tree, sbe_tree = parse_explain_sbe(explain) self._process(qsn_tree, sbe_tree, params) def buildDataFrame(self) -> pd.DataFrame: @@ -111,7 +178,7 @@ class ParametersBuilder: nodes: list[sbe.Node] = find_nodes(sbe_tree, lambda node: node.plan_node_id == node_id) if len(nodes) == 0: raise ValueError(f"Cannot find sbe nodes of {stage}") - return 
ParametersBuilder._build_row( + return ParametersBuilderSBE._build_row( stage, params, execution_time=sum([node.get_execution_time() for node in nodes]), @@ -142,636 +209,549 @@ class ParametersBuilder: if __name__ == "__main__": import json - explain = """ + explain = r""" { - "explainVersion": "2", - "queryPlanner": { - "namespace": "calibration.index_scan_10000", - "indexFilterSet": false, - "parsedQuery": { - "$or": [ - { - "$and": [ - { - "as": { - "$lt": 4 - } - }, - { - "as": { - "$gt": 10 - } - } - ] - }, - { - "as": { - "$gt": 4 - } - } - ] - }, - "planCacheShapeHash": "971E822A", - "planCacheKey": "AA772AA3", - "optimizationTimeMillis": 0, - "maxIndexedOrSolutionsReached": false, - "maxIndexedAndSolutionsReached": false, - "maxScansToExplodeReached": false, - "winningPlan": { - "isCached": false, - "queryPlan": { - "stage": "FETCH", - "planNodeId": 5, - "inputStage": { - "stage": "OR", - "planNodeId": 4, - "inputStages": [ - { - "stage": "FETCH", - "planNodeId": 2, - "filter": { - "as": { - "$gt": 10 - } - }, - "inputStage": { - "stage": "IXSCAN", - "planNodeId": 1, - "keyPattern": { - "as": 1, - "mixed1": 1 - }, - "indexName": "as_1_mixed1_1", - "isMultiKey": true, - "multiKeyPaths": { - "as": [ - "as" - ], - "mixed1": [] - }, - "isUnique": false, - "isSparse": false, - "isPartial": false, - "indexVersion": 2, - "direction": "forward", - "indexBounds": { - "as": [ - "[-inf, 4)" - ], - "mixed1": [ - "[MinKey, MaxKey]" - ] - } - } - }, - { - "stage": "IXSCAN", - "planNodeId": 3, - "keyPattern": { - "as": 1, - "mixed1": 1 - }, - "indexName": "as_1_mixed1_1", - "isMultiKey": true, - "multiKeyPaths": { - "as": [ - "as" - ], - "mixed1": [] - }, - "isUnique": false, - "isSparse": false, - "isPartial": false, - "indexVersion": 2, - "direction": "forward", - "indexBounds": { - "as": [ - "(4, inf]" - ], - "mixed1": [ - "[MinKey, MaxKey]" - ] - } - } - ] - } - } - }, - "rejectedPlans": [] - }, - "executionStats": { - "executionSuccess": true, - "nReturned": 10000, 
- "executionTimeMillis": 48, - "totalKeysExamined": 49427, - "totalDocsExamined": 10030, - "executionStages": { - "stage": "nlj", - "planNodeId": 5, - "nReturned": 10000, - "executionTimeMillisEstimate": 46, - "executionTimeMicros": 46370, - "executionTimeNanos": 46370217, - "opens": 1, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "totalDocsExamined": 10030, - "totalKeysExamined": 49427, - "collectionScans": 0, - "collectionSeeks": 10030, - "indexScans": 0, - "indexSeeks": 2, - "indexesUsed": [ - "as_1_mixed1_1", - "as_1_mixed1_1" - ], - "innerOpens": 10000, - "innerCloses": 1, - "outerProjects": [], - "outerCorrelated": [ - { - "low": 21, - "high": 0, - "unsigned": false - }, - { - "low": 22, - "high": 0, - "unsigned": false - }, - { - "low": 18, - "high": 0, - "unsigned": false - }, - { - "low": 19, - "high": 0, - "unsigned": false - }, - { - "low": 20, - "high": 0, - "unsigned": false - } - ], - "outerStage": { - "stage": "unique", - "planNodeId": 4, - "nReturned": 10000, - "executionTimeMillisEstimate": 33, - "executionTimeMicros": 33589, - "executionTimeNanos": 33589954, - "opens": 1, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "dupsTested": 10030, - "dupsDropped": 30, - "keySlots": [ - { - "low": 21, - "high": 0, - "unsigned": false - } - ], - "inputStage": { - "stage": "union", - "planNodeId": 4, - "nReturned": 10030, - "executionTimeMillisEstimate": 30, - "executionTimeMicros": 30777, - "executionTimeNanos": 30777272, - "opens": 1, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "inputSlots": [ - { - "low": 5, - "high": 0, - "unsigned": false - }, - { - "low": 6, - "high": 0, - "unsigned": false - }, - { - "low": 7, - "high": 0, - "unsigned": false - }, - { - "low": 9, - "high": 0, - "unsigned": false - }, - { - "low": 4, - "high": 0, - "unsigned": false - }, - { - "low": 16, - "high": 0, - "unsigned": false - }, - { - "low": 17, - "high": 0, - "unsigned": false - }, - { - "low": 
7, - "high": 0, - "unsigned": false - }, - { - "low": 14, - "high": 0, - "unsigned": false - }, - { - "low": 15, - "high": 0, - "unsigned": false - } - ], - "outputSlots": [ - { - "low": 18, - "high": 0, - "unsigned": false - }, - { - "low": 19, - "high": 0, - "unsigned": false - }, - { - "low": 20, - "high": 0, - "unsigned": false - }, - { - "low": 21, - "high": 0, - "unsigned": false - }, - { - "low": 22, - "high": 0, - "unsigned": false - } - ], - "inputStages": [ - { - "stage": "filter", - "planNodeId": 2, - "nReturned": 30, - "executionTimeMillisEstimate": 0, - "executionTimeMicros": 99, - "executionTimeNanos": 99405, - "opens": 1, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "numTested": 30, - "filter": "traverseF(s10, lambda(l101.0) { ((move(l101.0) > s11) ?: false) }, false) ", - "inputStage": { - "stage": "nlj", - "planNodeId": 2, - "nReturned": 30, - "executionTimeMillisEstimate": 0, - "executionTimeMicros": 93, - "executionTimeNanos": 93866, - "opens": 1, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "totalDocsExamined": 30, - "totalKeysExamined": 30, - "collectionScans": 0, - "collectionSeeks": 30, - "indexScans": 0, - "indexSeeks": 1, - "indexesUsed": [ - "as_1_mixed1_1" - ], - "innerOpens": 30, - "innerCloses": 1, - "outerProjects": [ - { - "low": 4, - "high": 0, - "unsigned": false - }, - { - "low": 5, - "high": 0, - "unsigned": false - }, - { - "low": 6, - "high": 0, - "unsigned": false - } - ], - "outerCorrelated": [ - { - "low": 3, - "high": 0, - "unsigned": false - }, - { - "low": 4, - "high": 0, - "unsigned": false - }, - { - "low": 5, - "high": 0, - "unsigned": false - }, - { - "low": 6, - "high": 0, - "unsigned": false - }, - { - "low": 7, - "high": 0, - "unsigned": false - } - ], - "outerStage": { - "stage": "unique", - "planNodeId": 1, - "nReturned": 30, - "executionTimeMillisEstimate": 0, - "executionTimeMicros": 32, - "executionTimeNanos": 32748, - "opens": 1, - "closes": 1, - 
"saveState": 49, - "restoreState": 49, - "isEOF": 1, - "dupsTested": 30, - "dupsDropped": 0, - "keySlots": [ - { - "low": 3, - "high": 0, - "unsigned": false - } - ], - "inputStage": { - "stage": "cfilter", - "planNodeId": 1, - "nReturned": 30, - "executionTimeMillisEstimate": 0, - "executionTimeMicros": 24, - "executionTimeNanos": 24336, - "opens": 1, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "numTested": 1, - "filter": "(exists(s1) && exists(s2)) ", - "inputStage": { - "stage": "ixseek", - "planNodeId": 1, - "nReturned": 30, - "executionTimeMillisEstimate": 0, - "executionTimeMicros": 20, - "executionTimeNanos": 20902, - "opens": 1, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "indexName": "as_1_mixed1_1", - "keysExamined": 30, - "seeks": 1, - "numReads": 31, - "indexKeySlot": 6, - "recordIdSlot": 3, - "snapshotIdSlot": 4, - "indexIdentSlot": 5, - "outputSlots": [], - "indexKeysToInclude": "00000000000000000000000000000000", - "seekKeyLow": "s1 ", - "seekKeyHigh": "s2 " - } - } - }, - "innerStage": { - "stage": "limit", - "planNodeId": 2, - "nReturned": 30, - "executionTimeMillisEstimate": 0, - "executionTimeMicros": 54, - "executionTimeNanos": 54643, - "opens": 30, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "limit": 1, - "inputStage": { - "stage": "seek", - "planNodeId": 2, - "nReturned": 30, - "executionTimeMillisEstimate": 0, - "executionTimeMicros": 48, - "executionTimeNanos": 48162, - "opens": 30, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 0, - "numReads": 30, - "recordSlot": 8, - "recordIdSlot": 9, - "seekRecordIdSlot": 3, - "snapshotIdSlot": 4, - "indexIdentSlot": 5, - "indexKeySlot": 6, - "indexKeyPatternSlot": 7, - "scanFieldNames": [ - "as" - ], - "scanFieldSlots": [ - { - "low": 10, - "high": 0, - "unsigned": false - } - ] - } - } - } - }, - { - "stage": "unique", - "planNodeId": 3, - "nReturned": 10000, - "executionTimeMillisEstimate": 29, - 
"executionTimeMicros": 29986, - "executionTimeNanos": 29986183, - "opens": 1, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "dupsTested": 49397, - "dupsDropped": 39397, - "keySlots": [ - { - "low": 14, - "high": 0, - "unsigned": false - } - ], - "inputStage": { - "stage": "cfilter", - "planNodeId": 3, - "nReturned": 49397, - "executionTimeMillisEstimate": 22, - "executionTimeMicros": 22086, - "executionTimeNanos": 22086929, - "opens": 1, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "numTested": 1, - "filter": "(exists(s12) && exists(s13)) ", - "inputStage": { - "stage": "ixseek", - "planNodeId": 3, - "nReturned": 49397, - "executionTimeMillisEstimate": 18, - "executionTimeMicros": 18451, - "executionTimeNanos": 18451235, - "opens": 1, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "indexName": "as_1_mixed1_1", - "keysExamined": 49397, - "seeks": 1, - "numReads": 49398, - "indexKeySlot": 17, - "recordIdSlot": 14, - "snapshotIdSlot": 15, - "indexIdentSlot": 16, - "outputSlots": [], - "indexKeysToInclude": "00000000000000000000000000000000", - "seekKeyLow": "s12 ", - "seekKeyHigh": "s13 " - } - } - } - ] - } - }, - "innerStage": { - "stage": "limit", - "planNodeId": 5, - "nReturned": 10000, - "executionTimeMillisEstimate": 10, - "executionTimeMicros": 10779, - "executionTimeNanos": 10779434, - "opens": 10000, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 1, - "limit": 1, - "inputStage": { - "stage": "seek", - "planNodeId": 5, - "nReturned": 10000, - "executionTimeMillisEstimate": 8, - "executionTimeMicros": 8835, - "executionTimeNanos": 8835311, - "opens": 10000, - "closes": 1, - "saveState": 49, - "restoreState": 49, - "isEOF": 0, - "numReads": 10000, - "recordSlot": 23, - "recordIdSlot": 24, - "seekRecordIdSlot": 21, - "snapshotIdSlot": 22, - "indexIdentSlot": 18, - "indexKeySlot": 19, - "indexKeyPatternSlot": 20, - "scanFieldNames": [], - "scanFieldSlots": [] - } - } 
- }, - "allPlansExecution": [] - }, - "command": { - "find": "index_scan_10000", - "filter": { - "$or": [ - { - "as": { - "$gt": 10, - "$lt": 4 - } - }, - { - "as": { - "$gt": 4 - } - } - ] - }, - "$db": "calibration" - }, - "serverInfo": { - "host": "ip-10-122-6-29", - "port": 27017, - "version": "8.0.0-alpha", - "gitVersion": "unknown" - }, - "serverParameters": { - "internalQueryFacetBufferSizeBytes": 104857600, - "internalQueryFacetMaxOutputDocSizeBytes": 104857600, - "internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600, - "internalDocumentSourceGroupMaxMemoryBytes": 104857600, - "internalQueryMaxBlockingSortMemoryUsageBytes": 104857600, - "internalQueryProhibitBlockingMergeOnMongoS": 0, - "internalQueryMaxAddToSetBytes": 104857600, - "internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600, - "internalQueryFrameworkControl": "trySbeEngine" - }, - "ok": 1 + "explainVersion" : "1", + "queryPlanner" : { + "namespace" : "test.and_sorted", + "parsedQuery" : { + "$or" : [ + { + "$and" : [ + { + "b" : { + "$eq" : "1" + } + }, + { + "a" : { + "$in" : [ + "1", + "2" + ] + } + } + ] + }, + { + "$and" : [ + { + "d" : { + "$eq" : "3" + } + }, + { + "e" : { + "$eq" : "3" + } + } + ] + }, + { + "$and" : [ + { + "f" : { + "$eq" : "4" + } + }, + { + "g" : { + "$eq" : "3" + } + } + ] + } + ] + }, + "indexFilterSet" : false, + "queryHash" : "AD50C8FE", + "planCacheShapeHash" : "AD50C8FE", + "planCacheKey" : "F134F6EA", + "optimizationTimeMillis" : 3, + "maxIndexedOrSolutionsReached" : false, + "maxIndexedAndSolutionsReached" : false, + "maxScansToExplodeReached" : false, + "prunedSimilarIndexes" : false, + "winningPlan" : { + "isCached" : false, + "stage" : "SUBPLAN", + "inputStage" : { + "stage" : "FETCH", + "inputStage" : { + "stage" : "SORT_MERGE", + "sortPattern" : { + "c" : 1 + }, + "inputStages" : [ + { + "stage" : "IXSCAN", + "keyPattern" : { + "a" : 1, + "b" : 1, + "c" : 1 + }, + "indexName" : "a_1_b_1_c_1", + "isMultiKey" : false, + "multiKeyPaths" 
: { + "a" : [ ], + "b" : [ ], + "c" : [ ] + }, + "isUnique" : false, + "isSparse" : false, + "isPartial" : false, + "indexVersion" : 2, + "direction" : "forward", + "indexBounds" : { + "a" : [ + "[\"1\", \"1\"]" + ], + "b" : [ + "[\"1\", \"1\"]" + ], + "c" : [ + "[MinKey, MaxKey]" + ] + } + }, + { + "stage" : "IXSCAN", + "keyPattern" : { + "a" : 1, + "b" : 1, + "c" : 1 + }, + "indexName" : "a_1_b_1_c_1", + "isMultiKey" : false, + "multiKeyPaths" : { + "a" : [ ], + "b" : [ ], + "c" : [ ] + }, + "isUnique" : false, + "isSparse" : false, + "isPartial" : false, + "indexVersion" : 2, + "direction" : "forward", + "indexBounds" : { + "a" : [ + "[\"2\", \"2\"]" + ], + "b" : [ + "[\"1\", \"1\"]" + ], + "c" : [ + "[MinKey, MaxKey]" + ] + } + }, + { + "stage" : "FETCH", + "filter" : { + "e" : { + "$eq" : "3" + } + }, + "inputStage" : { + "stage" : "IXSCAN", + "keyPattern" : { + "d" : 1, + "c" : 1 + }, + "indexName" : "d_1_c_1", + "isMultiKey" : false, + "multiKeyPaths" : { + "d" : [ ], + "c" : [ ] + }, + "isUnique" : false, + "isSparse" : false, + "isPartial" : false, + "indexVersion" : 2, + "direction" : "forward", + "indexBounds" : { + "d" : [ + "[\"3\", \"3\"]" + ], + "c" : [ + "[MinKey, MaxKey]" + ] + } + } + }, + { + "stage" : "FETCH", + "filter" : { + "g" : { + "$eq" : "3" + } + }, + "inputStage" : { + "stage" : "IXSCAN", + "keyPattern" : { + "f" : 1, + "c" : 1 + }, + "indexName" : "f_1_c_1", + "isMultiKey" : false, + "multiKeyPaths" : { + "f" : [ ], + "c" : [ ] + }, + "isUnique" : false, + "isSparse" : false, + "isPartial" : false, + "indexVersion" : 2, + "direction" : "forward", + "indexBounds" : { + "f" : [ + "[\"4\", \"4\"]" + ], + "c" : [ + "[MinKey, MaxKey]" + ] + } + } + } + ] + } + } + }, + "rejectedPlans" : [ ] + }, + "executionStats" : { + "executionSuccess" : true, + "nReturned" : 10, + "executionTimeMillis" : 5, + "totalKeysExamined" : 10, + "totalDocsExamined" : 19, + "executionStages" : { + "isCached" : false, + "stage" : "SUBPLAN", + "nReturned" : 10, + 
"executionTimeMillisEstimate" : 3, + "executionTimeMicros" : 3368, + "executionTimeNanos" : 3368499, + "works" : 25, + "advanced" : 10, + "needTime" : 14, + "needYield" : 0, + "saveState" : 0, + "restoreState" : 0, + "isEOF" : 1, + "inputStage" : { + "stage" : "FETCH", + "nReturned" : 10, + "executionTimeMillisEstimate" : 1, + "executionTimeMicros" : 1046, + "executionTimeNanos" : 1046324, + "works" : 24, + "advanced" : 10, + "needTime" : 14, + "needYield" : 0, + "saveState" : 0, + "restoreState" : 0, + "isEOF" : 1, + "docsExamined" : 10, + "alreadyHasObj" : 9, + "inputStage" : { + "stage" : "SORT_MERGE", + "nReturned" : 10, + "executionTimeMillisEstimate" : 0, + "executionTimeMicros" : 971, + "executionTimeNanos" : 971927, + "works" : 24, + "advanced" : 10, + "needTime" : 14, + "needYield" : 0, + "saveState" : 0, + "restoreState" : 0, + "isEOF" : 1, + "sortPattern" : { + "c" : 1 + }, + "dupsTested" : 10, + "dupsDropped" : 0, + "inputStages" : [ + { + "stage" : "IXSCAN", + "nReturned" : 0, + "executionTimeMillisEstimate" : 0, + "executionTimeMicros" : 111, + "executionTimeNanos" : 111205, + "works" : 1, + "advanced" : 0, + "needTime" : 0, + "needYield" : 0, + "saveState" : 0, + "restoreState" : 0, + "isEOF" : 1, + "keyPattern" : { + "a" : 1, + "b" : 1, + "c" : 1 + }, + "indexName" : "a_1_b_1_c_1", + "isMultiKey" : false, + "multiKeyPaths" : { + "a" : [ ], + "b" : [ ], + "c" : [ ] + }, + "isUnique" : false, + "isSparse" : false, + "isPartial" : false, + "indexVersion" : 2, + "direction" : "forward", + "indexBounds" : { + "a" : [ + "[\"1\", \"1\"]" + ], + "b" : [ + "[\"1\", \"1\"]" + ], + "c" : [ + "[MinKey, MaxKey]" + ] + }, + "keysExamined" : 0, + "seeks" : 1, + "dupsTested" : 0, + "dupsDropped" : 0 + }, + { + "stage" : "IXSCAN", + "nReturned" : 1, + "executionTimeMillisEstimate" : 0, + "executionTimeMicros" : 85, + "executionTimeNanos" : 85605, + "works" : 2, + "advanced" : 1, + "needTime" : 0, + "needYield" : 0, + "saveState" : 0, + "restoreState" : 0, + "isEOF" 
: 1, + "keyPattern" : { + "a" : 1, + "b" : 1, + "c" : 1 + }, + "indexName" : "a_1_b_1_c_1", + "isMultiKey" : false, + "multiKeyPaths" : { + "a" : [ ], + "b" : [ ], + "c" : [ ] + }, + "isUnique" : false, + "isSparse" : false, + "isPartial" : false, + "indexVersion" : 2, + "direction" : "forward", + "indexBounds" : { + "a" : [ + "[\"2\", \"2\"]" + ], + "b" : [ + "[\"1\", \"1\"]" + ], + "c" : [ + "[MinKey, MaxKey]" + ] + }, + "keysExamined" : 1, + "seeks" : 1, + "dupsTested" : 0, + "dupsDropped" : 0 + }, + { + "stage" : "FETCH", + "filter" : { + "e" : { + "$eq" : "3" + } + }, + "nReturned" : 8, + "executionTimeMillisEstimate" : 0, + "executionTimeMicros" : 446, + "executionTimeNanos" : 446632, + "works" : 9, + "advanced" : 8, + "needTime" : 0, + "needYield" : 0, + "saveState" : 0, + "restoreState" : 0, + "isEOF" : 1, + "docsExamined" : 8, + "alreadyHasObj" : 0, + "inputStage" : { + "stage" : "IXSCAN", + "nReturned" : 8, + "executionTimeMillisEstimate" : 0, + "executionTimeMicros" : 258, + "executionTimeNanos" : 258120, + "works" : 9, + "advanced" : 8, + "needTime" : 0, + "needYield" : 0, + "saveState" : 0, + "restoreState" : 0, + "isEOF" : 1, + "keyPattern" : { + "d" : 1, + "c" : 1 + }, + "indexName" : "d_1_c_1", + "isMultiKey" : false, + "multiKeyPaths" : { + "d" : [ ], + "c" : [ ] + }, + "isUnique" : false, + "isSparse" : false, + "isPartial" : false, + "indexVersion" : 2, + "direction" : "forward", + "indexBounds" : { + "d" : [ + "[\"3\", \"3\"]" + ], + "c" : [ + "[MinKey, MaxKey]" + ] + }, + "keysExamined" : 8, + "seeks" : 1, + "dupsTested" : 0, + "dupsDropped" : 0 + } + }, + { + "stage" : "FETCH", + "filter" : { + "g" : { + "$eq" : "3" + } + }, + "nReturned" : 1, + "executionTimeMillisEstimate" : 0, + "executionTimeMicros" : 124, + "executionTimeNanos" : 124694, + "works" : 2, + "advanced" : 1, + "needTime" : 0, + "needYield" : 0, + "saveState" : 0, + "restoreState" : 0, + "isEOF" : 1, + "docsExamined" : 1, + "alreadyHasObj" : 0, + "inputStage" : { + "stage" : 
"IXSCAN", + "nReturned" : 1, + "executionTimeMillisEstimate" : 0, + "executionTimeMicros" : 84, + "executionTimeNanos" : 84292, + "works" : 2, + "advanced" : 1, + "needTime" : 0, + "needYield" : 0, + "saveState" : 0, + "restoreState" : 0, + "isEOF" : 1, + "keyPattern" : { + "f" : 1, + "c" : 1 + }, + "indexName" : "f_1_c_1", + "isMultiKey" : false, + "multiKeyPaths" : { + "f" : [ ], + "c" : [ ] + }, + "isUnique" : false, + "isSparse" : false, + "isPartial" : false, + "indexVersion" : 2, + "direction" : "forward", + "indexBounds" : { + "f" : [ + "[\"4\", \"4\"]" + ], + "c" : [ + "[MinKey, MaxKey]" + ] + }, + "keysExamined" : 1, + "seeks" : 1, + "dupsTested" : 0, + "dupsDropped" : 0 + } + } + ] + } + } + } + }, + "queryShapeHash" : "ED0570742F8B713F6AB10101FE755DD90D901D507338F6191424BE1F16CC9C9D", + "command" : { + "find" : "and_sorted", + "filter" : { + "$or" : [ + { + "a" : { + "$in" : [ + "1", + "2" + ] + }, + "b" : "1" + }, + { + "d" : "3", + "e" : "3" + }, + { + "f" : "4", + "g" : "3" + } + ] + }, + "sort" : { + "c" : 1 + }, + "$db" : "test" + }, + "serverInfo" : { + "host" : "ip-10-122-2-255", + "port" : 27017, + "version" : "8.2.0-alpha", + "gitVersion" : "nogitversion" + }, + "serverParameters" : { + "internalQueryFacetBufferSizeBytes" : 104857600, + "internalQueryFacetMaxOutputDocSizeBytes" : 104857600, + "internalLookupStageIntermediateDocumentMaxSizeBytes" : 104857600, + "internalDocumentSourceGroupMaxMemoryBytes" : 104857600, + "internalQueryMaxBlockingSortMemoryUsageBytes" : 104857600, + "internalQueryProhibitBlockingMergeOnMongoS" : 0, + "internalQueryMaxAddToSetBytes" : 104857600, + "internalDocumentSourceSetWindowFieldsMaxMemoryBytes" : 104857600, + "internalQueryFrameworkControl" : "trySbeRestricted", + "internalQueryPlannerIgnoreIndexWithCollationForRegex" : 1 + }, + "ok" : 1 } """ explainJson = json.loads(explain) - qsn_tree, sbe_tree = parse_explain(explainJson) + qsn_tree, exec_tree = parse_explain_classic(explainJson) qsn_tree.print() - 
sbe_tree.print() + exec_tree.print() params = QueryParameters(10, 2000, "rooted-or") - builder = ParametersBuilder() + builder = ParametersBuilderClassic() builder.process(explainJson, params) df = builder.buildDataFrame() print(df) diff --git a/buildscripts/cost_model/query_solution_tree.py b/buildscripts/cost_model/query_solution_tree.py index 4dfdc4e7c02..94a6ebe2435 100644 --- a/buildscripts/cost_model/query_solution_tree.py +++ b/buildscripts/cost_model/query_solution_tree.py @@ -30,6 +30,7 @@ from __future__ import annotations from dataclasses import dataclass +from typing import Any __all__ = ["Node", "build"] @@ -39,32 +40,29 @@ class Node: """Represent Query Solution node.""" node_type: str - plan_node_id: int children: list[Node] def print(self, level=0): """Pretty print of the QSN tree.""" - print(f'{"| "*level}{self.node_type}, planNodeId: {self.plan_node_id}') + print(f'{"| "*level}{self.node_type}') for child in self.children: child.print(level + 1) -def build(optimizer_plan: dict[str, any]) -> Node: +def build(optimizer_plan: dict[str, Any]) -> Node: """Build QSN tree from query explain.""" return parse_optimizer_node(optimizer_plan) -def parse_optimizer_node(explain_node: dict[str, any]) -> Node: +def parse_optimizer_node(explain_node: dict[str, Any]) -> Node: """Recursively parse QSN from query explain's node.""" children = get_children(explain_node) - return Node( - node_type=explain_node["stage"], plan_node_id=explain_node["planNodeId"], children=children - ) + return Node(node_type=explain_node["stage"], children=children) -def get_children(explain_node: dict[str, any]) -> list[Node]: +def get_children(explain_node: dict[str, Any]) -> list[Node]: """Get children nodes of the QSN.""" if "inputStage" in explain_node: