-
Notifications
You must be signed in to change notification settings - Fork 1
Feat/db optimizations (STIT-502) #135
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
AlexAxthelm
wants to merge
10
commits into
main
Choose a base branch
from
feat/db-optimizations
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
415070f
refactor(db): Use dense entity-value-attribute model
AlexAxthelm 583dbaa
Remove pg_trgm for now
AlexAxthelm eddeb80
Merge branch 'main' into feat/db-optimizations
AlexAxthelm 887714b
style: no inner parens
AlexAxthelm fa6dc01
rename vars and update comments
AlexAxthelm ea47e94
restore source == source condition in pivot
AlexAxthelm 5d94cad
Deterministic tie break
AlexAxthelm 5f28abb
autoincrement PK
AlexAxthelm 903f66c
Raise error, not assert
AlexAxthelm 03a4fb6
Merge branch 'main' into feat/db-optimizations
AlexAxthelm File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,22 +1,31 @@ | ||
| """baseline | ||
|
|
||
| Revision ID: 6de2b873bacb | ||
| Revision ID: f3fb36006ce6 | ||
| Revises: | ||
| Create Date: 2026-06-04 12:35:31.176312 | ||
| Create Date: 2026-06-17 19:08:25.103926 | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from alembic import op | ||
| import sqlalchemy as sa | ||
| from sqlalchemy.dialects import postgresql | ||
|
|
||
| import stitch.api.db.model.types | ||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = "6de2b873bacb" | ||
| revision = "f3fb36006ce6" | ||
| down_revision = None | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
| DEFAULT_PRIORITIES = [ | ||
| {"source": "rmi", "priority": 1}, | ||
| {"source": "gem", "priority": 2}, | ||
| {"source": "wm", "priority": 3}, | ||
| {"source": "llm", "priority": 4}, | ||
| ] | ||
|
|
||
|
|
||
| def upgrade() -> None: | ||
| # ### commands auto generated by Alembic - please adjust! ### | ||
|
|
@@ -33,12 +42,7 @@ def upgrade() -> None: | |
| sa.column("source", sa.String), | ||
| sa.column("priority", sa.Integer), | ||
| ), | ||
| [ | ||
| {"source": "rmi", "priority": 1}, | ||
| {"source": "wm", "priority": 2}, | ||
| {"source": "gem", "priority": 3}, | ||
| {"source": "llm", "priority": 4}, | ||
| ], | ||
| DEFAULT_PRIORITIES, | ||
| ) | ||
| op.create_table( | ||
| "users", | ||
|
|
@@ -112,57 +116,8 @@ def upgrade() -> None: | |
| sa.Enum("gem", "wm", "rmi", "llm", native_enum=False), | ||
| nullable=False, | ||
| ), | ||
| sa.Column("owners", sa.JSON(), nullable=True), | ||
| sa.Column("operators", sa.JSON(), nullable=True), | ||
| sa.Column("source_record", sa.JSON(), nullable=False), | ||
| sa.Column("name", sa.String(), nullable=True), | ||
| sa.Column("country", sa.String(), nullable=True), | ||
| sa.Column("name_local", sa.String(), nullable=True), | ||
| sa.Column("state_province", sa.String(), nullable=True), | ||
| sa.Column("region", sa.String(), nullable=True), | ||
| sa.Column("basin", sa.String(), nullable=True), | ||
| sa.Column("reservoir_formation", sa.String(), nullable=True), | ||
| sa.Column("latitude", sa.Float(), nullable=True), | ||
| sa.Column("longitude", sa.Float(), nullable=True), | ||
| sa.Column("discovery_year", sa.Integer(), nullable=True), | ||
| sa.Column("production_start_year", sa.Integer(), nullable=True), | ||
| sa.Column("fid_year", sa.Integer(), nullable=True), | ||
| sa.Column( | ||
| "location_type", | ||
| sa.Enum("Onshore", "Offshore", "Unknown", native_enum=False), | ||
| nullable=True, | ||
| ), | ||
| sa.Column( | ||
| "production_conventionality", | ||
| sa.Enum( | ||
| "Conventional", "Unconventional", "Mixed", "Unknown", native_enum=False | ||
| ), | ||
| nullable=True, | ||
| ), | ||
| sa.Column( | ||
| "primary_hydrocarbon_group", | ||
| sa.Enum( | ||
| "Ultra-Light Oil", | ||
| "Light Oil", | ||
| "Medium Oil", | ||
| "Heavy Oil", | ||
| "Extra-Heavy Oil", | ||
| "Dry Gas", | ||
| "Wet Gas", | ||
| "Acid Gas", | ||
| "Condensate", | ||
| "Mixed", | ||
| "Unknown", | ||
| native_enum=False, | ||
| ), | ||
| nullable=True, | ||
| ), | ||
| sa.Column( | ||
| "field_status", | ||
| sa.Enum( | ||
| "Producing", "Non-Producing", "Abandoned", "Planned", native_enum=False | ||
| ), | ||
| nullable=True, | ||
| "source_record", stitch.api.db.model.types.StitchJson(), nullable=False | ||
| ), | ||
| sa.Column( | ||
| "created", | ||
|
|
@@ -309,6 +264,83 @@ def upgrade() -> None: | |
| ), | ||
| sa.PrimaryKeyConstraint("id"), | ||
| ) | ||
| op.create_table( | ||
| "og_field_resource_source_priority", | ||
| sa.Column( | ||
| "resource_id", | ||
| sa.BigInteger() | ||
| .with_variant(sa.BIGINT(), "postgresql") | ||
| .with_variant(sa.INTEGER(), "sqlite"), | ||
| nullable=False, | ||
| ), | ||
| sa.Column("source", sa.String(length=10), nullable=False), | ||
| sa.Column("priority", sa.Integer(), nullable=False), | ||
| sa.ForeignKeyConstraint( | ||
| ["resource_id"], ["og_field_resources.id"], ondelete="CASCADE" | ||
| ), | ||
| sa.ForeignKeyConstraint( | ||
| ["source"], | ||
| ["og_field_source_priority.source"], | ||
| ), | ||
| sa.PrimaryKeyConstraint("resource_id", "source"), | ||
| ) | ||
| op.create_table( | ||
| "oil_gas_field_source_values", | ||
| sa.Column( | ||
| "id", | ||
| sa.BigInteger() | ||
| .with_variant(sa.BIGINT(), "postgresql") | ||
| .with_variant(sa.INTEGER(), "sqlite"), | ||
| autoincrement=True, | ||
| nullable=False, | ||
| ), | ||
| sa.Column( | ||
| "source_pk", | ||
| sa.BigInteger() | ||
| .with_variant(sa.BIGINT(), "postgresql") | ||
| .with_variant(sa.INTEGER(), "sqlite"), | ||
| nullable=False, | ||
| ), | ||
| sa.Column("colname", sa.String(length=50), nullable=False), | ||
| sa.Column("value_text", sa.String(), nullable=True), | ||
| sa.Column( | ||
| "value_num", | ||
| sa.Float().with_variant(sa.DOUBLE_PRECISION(), "postgresql"), | ||
| nullable=True, | ||
| ), | ||
| sa.Column( | ||
| "value_json", | ||
| sa.JSON(none_as_null=True).with_variant( | ||
| postgresql.JSONB(none_as_null=True, astext_type=sa.Text()), "postgresql" | ||
| ), | ||
| nullable=True, | ||
| ), | ||
| sa.CheckConstraint( | ||
| "colname IN ('name', 'country', 'name_local', 'state_province', 'region', 'basin', 'reservoir_formation', 'location_type', 'production_conventionality', 'primary_hydrocarbon_group', 'field_status', 'latitude', 'longitude', 'discovery_year', 'production_start_year', 'fid_year', 'owners', 'operators')", | ||
| name="ck_source_value_colname", | ||
|
Comment on lines
+318
to
+320
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good constraints to add, but they could present some difficulties/gotchas if the model shifts. Mainly, this requires refreshing/recreating the constraint in subsequent migrations – it looks like that could be easy to miss. |
||
| ), | ||
| sa.CheckConstraint( | ||
| "(CASE WHEN value_text IS NOT NULL THEN 1 ELSE 0 END + CASE WHEN value_num IS NOT NULL THEN 1 ELSE 0 END + CASE WHEN value_json IS NOT NULL THEN 1 ELSE 0 END) = 1", | ||
| name="ck_source_value_exactly_one", | ||
| ), | ||
| sa.ForeignKeyConstraint( | ||
| ["source_pk"], ["oil_gas_field_sources.id"], ondelete="CASCADE" | ||
| ), | ||
| sa.PrimaryKeyConstraint("id"), | ||
| sa.UniqueConstraint("source_pk", "colname", name="uq_source_value_colname"), | ||
| ) | ||
| op.create_index( | ||
| "ix_source_value_colname_num", | ||
| "oil_gas_field_source_values", | ||
| ["colname", "value_num"], | ||
| unique=False, | ||
| ) | ||
| op.create_index( | ||
| "ix_source_value_colname_text", | ||
| "oil_gas_field_source_values", | ||
| ["colname", "value_text"], | ||
| unique=False, | ||
| ) | ||
| op.create_table( | ||
| "merge_candidate_items", | ||
| sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), | ||
|
|
@@ -341,6 +373,11 @@ def upgrade() -> None: | |
| ), | ||
| ) | ||
| # ### end Alembic commands ### | ||
| # | ||
| # NOTE: substring search (ILIKE '%term%') currently relies on standard text | ||
| # matching backed by the (colname, value_text) B-tree index -- no trigram | ||
| # acceleration, to avoid requiring the pg_trgm extension. See the deferred | ||
| # follow-up for adding a pg_trgm GIN index if substring search gets slow. | ||
|
|
||
|
|
||
| def downgrade() -> None: | ||
|
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we add a
`source_id` column here (or replace `source` w/ `source_id`), then we can embed the default priorities in code, and when a resource is created/updated we then update the priority table. This would mean that all top level resources would have at least 1 row in the
`og_field_resource_source_priority` table, and we could drop the static source priority table. It adds some complexity but also sets the groundwork for user-mediated priorities. I guess it seems like if we're going to undertake the effort to allow for priority overrides, getting more granular at the source id level gets us more flexibility to pick specific source rows where multiple source keys are present.