Skip to content

Commit

Permalink
Reapply changes from #7125
Browse files Browse the repository at this point in the history
Co-authored by: Noah Holm <32292420+noppaz@users.noreply.github.com>
Co-authored by: Jeremy Cohen <jeremy@dbtlabs.com>
  • Loading branch information
noppaz authored and jtcohen6 committed Dec 24, 2024
1 parent 459d156 commit 704120c
Show file tree
Hide file tree
Showing 11 changed files with 62 additions and 20 deletions.
6 changes: 6 additions & 0 deletions .changes/unreleased/Features-20230307-134838.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Features
body: Make MAXIMUM_SEED_SIZE_MIB configurable
time: 2023-03-07T13:48:38.792321024Z
custom:
Author: noppaz acurtis-evi
Issue: 7117 7124
22 changes: 22 additions & 0 deletions core/dbt/artifacts/resources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List, Optional

from dbt.artifacts.resources.types import NodeType
from dbt_common.clients.system import convert_path
from dbt_common.dataclass_schema import dbtClassMixin


Expand Down Expand Up @@ -60,6 +61,27 @@ def from_contents(cls, contents: str, name="sha256") -> "FileHash":
checksum = hashlib.new(name, data).hexdigest()
return cls(name=name, checksum=checksum)

@classmethod
def from_path(cls, path: str, name="sha256") -> "FileHash":
"""Create a file hash from the file at given path. The hash is always the
utf-8 encoding of the contents which is stripped to give similar hashes
as `FileHash.from_contents`.
"""
path = convert_path(path)
chunk_size = 1 * 1024 * 1024
file_hash = hashlib.new(name)
with open(path, "r") as handle:

Check warning on line 73 in core/dbt/artifacts/resources/base.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/artifacts/resources/base.py#L70-L73

Added lines #L70 - L73 were not covered by tests
# Left and rightstrip start and end of contents to give identical
# results as the seed hashing implementation with from_contents
chunk = handle.read(chunk_size).lstrip()
while chunk:
next_chunk = handle.read(chunk_size)
if not next_chunk:
chunk = chunk.rstrip()
file_hash.update(chunk.encode("utf-8"))
chunk = next_chunk
return cls(name=name, checksum=file_hash.hexdigest())

Check warning on line 83 in core/dbt/artifacts/resources/base.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/artifacts/resources/base.py#L76-L83

Added lines #L76 - L83 were not covered by tests


@dataclass
class Docs(dbtClassMixin):
Expand Down
1 change: 1 addition & 0 deletions core/dbt/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def global_flags(func):
@p.log_level_file
@p.log_path
@p.macro_debugging
@p.maximum_seed_size_mib
@p.partial_parse
@p.partial_parse_file_path
@p.partial_parse_file_diff
Expand Down
8 changes: 8 additions & 0 deletions core/dbt/cli/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,14 @@
default="eager",
)

maximum_seed_size_mib = click.option(
"--maximum-seed-size-mib",
envvar="DBT_MAXIMUM_SEED_SIZE_MIB",
help="Specify max size (MiB) for seed files that will be hashed for state comparison.",
type=click.INT,
default=1,
)

lock = click.option(
"--lock",
envvar=None,
Expand Down
3 changes: 0 additions & 3 deletions core/dbt/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@

SECRET_PLACEHOLDER = "$$$DBT_SECRET_START$$${}$$$DBT_SECRET_END$$$"

MAXIMUM_SEED_SIZE = 1 * 1024 * 1024
MAXIMUM_SEED_SIZE_NAME = "1MB"

PIN_PACKAGE_URL = (
"https://docs.getdbt.com/docs/package-management#section-specifying-package-versions"
)
Expand Down
6 changes: 2 additions & 4 deletions core/dbt/contracts/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from mashumaro.types import SerializableType

from dbt.artifacts.resources.base import FileHash
from dbt.constants import MAXIMUM_SEED_SIZE
from dbt_common.dataclass_schema import StrEnum, dbtClassMixin

from .util import SourceKey
Expand Down Expand Up @@ -65,9 +64,8 @@ def absolute_path(self) -> str:
def original_file_path(self) -> str:
return os.path.join(self.searched_path, self.relative_path)

def seed_too_large(self) -> bool:
"""Return whether the file this represents is over the seed size limit"""
return os.stat(self.full_path).st_size > MAXIMUM_SEED_SIZE
def file_size(self) -> int:
return os.stat(self.full_path).st_size

Check warning on line 68 in core/dbt/contracts/files.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/contracts/files.py#L68

Added line #L68 was not covered by tests


@dataclass
Expand Down
1 change: 1 addition & 0 deletions core/dbt/contracts/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ class ProjectFlags(ExtensibleDbtClassMixin):
log_format_file: Optional[str] = None
log_level: Optional[str] = None
log_level_file: Optional[str] = None
maximum_seed_size_mib: Optional[int] = None
partial_parse: Optional[bool] = None
populate_cache: Optional[bool] = None
printer_width: Optional[int] = None
Expand Down
17 changes: 11 additions & 6 deletions core/dbt/events/types.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import json

from dbt.constants import MAXIMUM_SEED_SIZE_NAME, PIN_PACKAGE_URL
from dbt.constants import PIN_PACKAGE_URL
from dbt.events.base_types import (
DebugLevel,
DynamicLevel,
ErrorLevel,
InfoLevel,
WarnLevel,
)
from dbt.flags import get_flags
from dbt_common.events.base_types import EventLevel
from dbt_common.events.format import (
format_fancy_output_line,
Expand Down Expand Up @@ -675,10 +676,11 @@ def code(self) -> str:
return "I052"

def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was "
f"<={MAXIMUM_SEED_SIZE_NAME}, so it has changed"
f">{maximum_seed_size_name} in size. The previous file was "
f"<={maximum_seed_size_name}, so it has changed"
)
return msg

Expand All @@ -688,9 +690,10 @@ def code(self) -> str:
return "I053"

def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size at the same path, dbt "
f">{maximum_seed_size_name} in size at the same path, dbt "
f"cannot tell if it has changed: assuming they are the same"
)
return msg
Expand All @@ -701,9 +704,10 @@ def code(self) -> str:
return "I054"

def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was in "
f">{maximum_seed_size_name} in size. The previous file was in "
f"a different location, assuming it has changed"
)
return msg
Expand All @@ -714,9 +718,10 @@ def code(self) -> str:
return "I055"

def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file had a "
f">{maximum_seed_size_name} in size. The previous file had a "
f"checksum type of {self.checksum_name}, so it has changed"
)
return msg
Expand Down
1 change: 1 addition & 0 deletions core/dbt/flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def get_flag_dict():
"log_path",
"invocation_command",
"empty",
"maximum_seed_size_mib",
}
return {key: getattr(GLOBAL_FLAGS, key.upper(), None) for key in flag_attr}

Expand Down
9 changes: 6 additions & 3 deletions core/dbt/parser/read_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)
from dbt.events.types import InputFileDiffError
from dbt.exceptions import ParsingError
from dbt.flags import get_flags
from dbt.parser.common import schema_file_keys
from dbt.parser.schemas import yaml_from_file
from dbt.parser.search import filesystem_search
Expand Down Expand Up @@ -123,12 +124,14 @@ def validate_yaml(file_path, dct):

# Special processing for big seed files
def load_seed_source_file(match: FilePath, project_name) -> SourceFile:
if match.seed_too_large():
# Users can configure the maximum seed size (MiB) that will be hashed for state comparison
maximum_seed_size = get_flags().MAXIMUM_SEED_SIZE_MIB * 1024 * 1024

Check warning on line 128 in core/dbt/parser/read_files.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/parser/read_files.py#L128

Added line #L128 was not covered by tests
# maximum_seed_size = 0 means no limit
if match.file_size() > maximum_seed_size and maximum_seed_size != 0:

Check warning on line 130 in core/dbt/parser/read_files.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/parser/read_files.py#L130

Added line #L130 was not covered by tests
# We don't want to calculate a hash of this file. Use the path.
source_file = SourceFile.big_seed(match)
else:
file_contents = load_file_contents(match.absolute_path, strip=True)
checksum = FileHash.from_contents(file_contents)
checksum = FileHash.from_path(match.absolute_path)

Check warning on line 134 in core/dbt/parser/read_files.py

View check run for this annotation

Codecov / codecov/patch

core/dbt/parser/read_files.py#L134

Added line #L134 was not covered by tests
source_file = SourceFile(path=match, checksum=checksum)
source_file.contents = ""
source_file.parse_file_type = ParseFileType.Seed
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/graph/test_selector_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedExceedsLimitSamePath"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
assert not search_manifest_using_method(manifest, method, "new")
warn_or_error_patch.assert_not_called()
Expand All @@ -793,7 +793,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedExceedsLimitSamePath"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")


def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state, seed):
Expand All @@ -807,7 +807,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedIncreased"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
assert not search_manifest_using_method(manifest, method, "new")
warn_or_error_patch.assert_not_called()
Expand All @@ -820,7 +820,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedIncreased"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")


def test_select_state_changed_seed_checksum_path_to_sha(manifest, previous_state, seed):
Expand Down

0 comments on commit 704120c

Please sign in to comment.