From a3803e017a26bba9158deaeaec6cb994104b0013 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 07:55:29 +0000 Subject: [PATCH 01/16] Update twine requirement from <6,>=4 to >=4,<7 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b4bc1f2..bf34266 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,7 +87,7 @@ dev = [ "black>=22,<25", # A deterministic code formatter # "label-studio>=1.12,<2.0", # Tool for labeling training data, exclude until pydantic upgrade "tox>=4,<5", # Python test environment manager - "twine>=4,<6", # Used to make releases to PyPI + "twine>=4,<7", # Used to make releases to PyPI ] docs = [ "doc8>=1,<2", # Ensures clean documentation formatting From 38f25606eb4389638561be4c4efd74ec4d8a8336 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 22:22:05 +0000 Subject: [PATCH 02/16] [pre-commit.ci] pre-commit autoupdate --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2aaf16a..9165ed0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: # Formatters: hooks that re-write Python and RST files ######################################################################################## - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.0 + rev: v0.8.1 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] From c3979410bd7130995281c9cb5650055632c2e069 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 6 Dec 2024 08:04:54 +0000 Subject: [PATCH 03/16] Update transformers requirement --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bf34266..73ba731 
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ "sqlalchemy>=2,<3", "timm>0.9,<2", # dependency for Hugging Face computer vision models "torch>=2.2,<3", - "transformers>=4.42.3,<=4.46.3", + "transformers>=4.42.3,<=4.47.0", "xhtml2pdf", # Convert html to PDF files ] classifiers = [ From c3a99ede3d78cb89d72f79bc909791b888f698c2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 22:04:10 +0000 Subject: [PATCH 04/16] [pre-commit.ci] pre-commit autoupdate --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9165ed0..05e8c31 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: # Formatters: hooks that re-write Python and RST files ######################################################################################## - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.1 + rev: v0.8.2 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] From 86098d8337372b8253e31183698b576422065ff6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 21:57:52 +0000 Subject: [PATCH 05/16] [pre-commit.ci] pre-commit autoupdate --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 05e8c31..22d305e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: # Formatters: hooks that re-write Python and RST files ######################################################################################## - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.2 + rev: v0.8.3 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] From 322f62416087f2872f1e636860566c483dd25e84 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 08:07:35 +0000 Subject: [PATCH 06/16] Update transformers requirement --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 73ba731..f8223b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ "sqlalchemy>=2,<3", "timm>0.9,<2", # dependency for Hugging Face computer vision models "torch>=2.2,<3", - "transformers>=4.42.3,<=4.47.0", + "transformers>=4.42.3,<=4.47.1", "xhtml2pdf", # Convert html to PDF files ] classifiers = [ From 1e82f37562e9dd8cda4e03dc7d393539244c3f13 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 07:58:32 +0000 Subject: [PATCH 07/16] Update mypy requirement from <1.14,>=1.0 to >=1.0,<1.15 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f8223b6..d47da69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ tests = [ "coverage>=7,<8", # Lets us track what code is being tested "exceptiongroup>=1,<2", "jupyter", # For integration testing Jupyter notebooks - "mypy>=1.0,<1.14", # Static type checking + "mypy>=1.0,<1.15", # Static type checking "nbconvert>=7,<8", "nbformat>=5,<6", "pre-commit>=3,<5", # Allow us to run pre-commit hooks in testing From ba8700f580ce0a1f5b965876fa51de7745268d2c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 22:12:57 +0000 Subject: [PATCH 08/16] [pre-commit.ci] pre-commit autoupdate --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22d305e..a2ee500 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: # Formatters: hooks that re-write Python and RST files 
######################################################################################## - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.3 + rev: v0.8.4 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] From d0f7d86a6d4e0a5672eada74fbc50a6426d8bbd1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 Jan 2025 22:51:44 +0000 Subject: [PATCH 09/16] [pre-commit.ci] pre-commit autoupdate --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a2ee500..4d1c10d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: # Formatters: hooks that re-write Python and RST files ######################################################################################## - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.4 + rev: v0.8.6 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] From ce8b1097fb312447cca6683a3e6406a294929352 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 15 Jan 2025 12:11:28 -0500 Subject: [PATCH 10/16] Hook up final splink notebook asset --- .../models/sec_eia_record_linkage/__init__.py | 10 +- .../notebooks/splink-sec-eia.ipynb | 3804 +++++++++++++++++ .../transform_sec_input.py | 28 +- 3 files changed, 3831 insertions(+), 11 deletions(-) create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/notebooks/splink-sec-eia.ipynb diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py index 3350449..64ed894 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py @@ -30,11 +30,9 @@ eia_assets = load_assets_from_modules([transform_eia_input]) sec_assets = load_assets_from_modules([transform_sec_input]) -eia_input_table_production_job = 
model_jobs.create_production_model_job( - "eia_input_table_creation", transform_eia_input.production_assets -) -sec_input_table_production_job = model_jobs.create_production_model_job( - "sec_input_table_creation", transform_sec_input.production_assets +record_linkage_job = model_jobs.create_production_model_job( + "sec_eia_record_linkage", + transform_eia_input.production_assets + transform_sec_input.production_assets, ) # Create year_quarter partitions @@ -63,7 +61,7 @@ sec_assets + eia_assets + [basic_10k_company_info, ex21_company_ownership_info, sec10k_filing_metadata], - jobs=[eia_input_table_production_job, sec_input_table_production_job], + jobs=[record_linkage_job], resources={ "cloud_interface": cloud_interface_resource, "mlflow_interface": mlflow_interface_resource, diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/notebooks/splink-sec-eia.ipynb b/src/mozilla_sec_eia/models/sec_eia_record_linkage/notebooks/splink-sec-eia.ipynb new file mode 100644 index 0000000..1263b43 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/notebooks/splink-sec-eia.ipynb @@ -0,0 +1,3804 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9b8224d4-7596-45b7-bfb5-028f29a96f3d", + "metadata": {}, + "source": [ + "# Load upstream assets\n", + "The tables `core_eia__parents_and_subsidiaries` and `_core_sec_10k__filers` are input assets to the record linkage process. This notebook will produce a table called `core_sec_10k__filers`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e136eeca-133b-4f61-8e37-5e1ea8c99683", + "metadata": {}, + "outputs": [], + "source": [ + "import dagstermill" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cad4e652-441c-48ec-b1d2-c7627dd566d3", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. 
Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", + "2025-01-15 10:01:12 -0500 - dagster - DEBUG - system - Loading file from: gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet using PandasParquetIOManager...\n", + "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", + "2025-01-15 10:01:14 -0500 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/transformed_basic_10k using PickledObjectFilesystemIOManager...\n" + ] + } + ], + "source": [ + "from mozilla_sec_eia.models.sec_eia_record_linkage import defs\n", + "\n", + "clean_eia_df = defs.load_asset_value(\"core_eia__parents_and_subsidiaries\")\n", + "clean_basic_10k_df = defs.load_asset_value(\"transformed_basic_10k\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1107fe42-197c-4fea-9c48-06d08699af0b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.metrics import (\n", + " accuracy_score,\n", + " confusion_matrix,\n", + " precision_score,\n", + " recall_score,\n", + ")\n", + "from splink import DuckDBAPI, Linker, SettingsCreator, block_on\n", + "from splink.blocking_analysis import (\n", + " count_comparisons_from_blocking_rule,\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + " n_largest_blocks,\n", + ")\n", + "from splink.exploratory import completeness_chart, profile_columns\n", + "\n", + "from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import (\n", + " BLOCKING_RULES,\n", + " MATCH_COLS,\n", + " SHARED_COLS,\n", + " address_comparison,\n", + " city_comparison,\n", + " 
company_name_comparison,\n", + " deterministic_blocking_rules,\n", + " state_comparison,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db", + "metadata": {}, + "source": [ + "# Preprocess SEC and EIA\n", + "\n", + "Does it make more sense to do a direct match on company name after\n", + "the SEC basic info to EIA match is done? And if there's a conflicting SEC match (one basic info and one Ex. 21) then review it manually?" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7d2d103a-2bbd-4974-b770-44626bdc5111", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df = clean_basic_10k_df[clean_basic_10k_df.files_10k][SHARED_COLS]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27", + "metadata": {}, + "outputs": [], + "source": [ + "eia_match_df = clean_eia_df[SHARED_COLS]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e754b2ef-5a0d-4582-8694-047528dfd339", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_match_df.record_id.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "38ad3504-2cde-455f-8896-6a435677541c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eia_match_df.record_id.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "856c14d8-3250-4650-a2db-3808b4718f19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Note that sec_company_id isn't unique here because we are keeping each unique company name and address pair\n", + 
"clean_basic_10k_df.sec_company_id.is_unique" + ] + }, + { + "cell_type": "markdown", + "id": "b18fef7e-c316-4c90-b2bc-04706401135e", + "metadata": {}, + "source": [ + "There should probably be no duplicate record, but if there are, keep the most recent version of that record." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "842fa02e-5202-445c-b728-72bce42e740d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 20821\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eia_match_df.duplicated(subset=MATCH_COLS).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b53e6244-f0ca-4256-bc09-9c3264675389", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 61026\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_match_df.duplicated(subset=MATCH_COLS).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e4d54448-0c2f-452b-931c-ff79a5cc3669", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=MATCH_COLS, keep=\"first\")\n", + "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=MATCH_COLS, keep=\"first\")" + ] + }, + { + "cell_type": "markdown", + "id": "46d967d4-3722-437d-b2f0-37cbac17624f", + "metadata": {}, + "source": [ + "# Link SEC and EIA" + ] + }, + { + "cell_type": "markdown", + "id": "509988b1-ed2c-41b3-9334-f44ae599cf4f", + "metadata": {}, + "source": [ + "## Exploratory Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05", + "metadata": {}, + "outputs": [], + "source": [ + "db_api = DuckDBAPI()" + ] + }, + { + "cell_type": 
"code", + "execution_count": 18, + "id": "4bab1568-6a55-427c-9a78-e44db8b0584d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completeness_chart(sec_match_df, db_api=db_api)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "6b9479e3-e836-4407-a2b6-926c185065a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completeness_chart(eia_match_df, db_api=db_api)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile_columns(sec_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile_columns(eia_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "69f5fc54-f479-495c-86fc-48accda883d0", + "metadata": {}, + "source": [ + "## Blocking" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'number_of_comparisons_generated_pre_filter_conditions': 487944,\n", + " 'number_of_comparisons_to_be_scored_post_filter_conditions': 487944,\n", + " 'filter_conditions_identified': '',\n", + " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n", + " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# useful for experimenting with a new blocking rule\n", + "counts = count_comparisons_from_blocking_rule(\n", + " table_or_tables=[sec_match_df, eia_match_df],\n", + " blocking_rule=BLOCKING_RULES[0],\n", + " link_type=\"link_only\",\n", + " unique_id_column_name=\"record_id\",\n", + " db_api=db_api,\n", + ")\n", + "\n", + "counts" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
key_0count_lcount_rblock_count
0INTR4457633820
1AMRK8513832338
2FRST8163629376
\n", + "
" + ], + "text/plain": [ + " key_0 count_l count_r block_count\n", + "0 INTR 445 76 33820\n", + "1 AMRK 851 38 32338\n", + "2 FRST 816 36 29376" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = n_largest_blocks(\n", + " table_or_tables=[sec_match_df, eia_match_df],\n", + " blocking_rule=BLOCKING_RULES[0],\n", + " link_type=\"link_only\",\n", + " db_api=db_api,\n", + " n_largest=3\n", + ")\n", + "\n", + "result.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "4e1a9844-5d98-4cac-a083-eef134f083ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=[sec_match_df, eia_match_df],\n", + " blocking_rules=BLOCKING_RULES,\n", + " db_api=db_api,\n", + " unique_id_column_name=\"record_id\",\n", + " link_type=\"link_only\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "377b0017-e46f-4d06-8cb5-af2b7725fc0e", + "metadata": {}, + "source": [ + "## Create Model" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'NameComparison' of \"company_name_no_legal\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'company_name_no_legal is NULL' with SQL rule: \"company_name_no_legal_l\" IS NULL OR \"company_name_no_legal_r\" IS NULL\n", + " - 'Exact match on company_name_no_legal' with SQL rule: \"company_name_no_legal_l\" = \"company_name_no_legal_r\"\n", + " - 'Jaro-Winkler distance of company_name_no_legal >= 0.95' with SQL rule: jaro_winkler_similarity(\"company_name_no_legal_l\", \"company_name_no_legal_r\") >= 0.95\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "4298a288-c306-4d75-9d72-e5b8f87774ce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'LevenshteinAtThresholds' of \"street_address\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'street_address is NULL' with SQL rule: \"street_address_l\" IS NULL OR \"street_address_r\" IS NULL\n", + " - 'Exact match on 
street_address' with SQL rule: \"street_address_l\" = \"street_address_r\"\n", + " - 'Levenshtein distance of street_address <= 1' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 1\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "print(address_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "afdd5872-bc29-406f-bd0a-d5f4436f6794", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'ExactMatch' of \"state\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'state is NULL' with SQL rule: \"state_l\" IS NULL OR \"state_r\" IS NULL\n", + " - 'Exact match on state' with SQL rule: \"state_l\" = \"state_r\"\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "print(state_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "90596d17-edb4-4ed1-9306-ea6c33ad00c6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'NameComparison' of \"city\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'city is NULL' with SQL rule: \"city_l\" IS NULL OR \"city_r\" IS NULL\n", + " - 'Exact match on city' with SQL rule: \"city_l\" = \"city_r\"\n", + " - 'Jaro-Winkler distance of city >= 0.9' with SQL rule: jaro_winkler_similarity(\"city_l\", \"city_r\") >= 0.9\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407", + "metadata": {}, + "outputs": [], + "source": [ + "settings = SettingsCreator(\n", + " 
link_type=\"link_only\",\n", + " unique_id_column_name=\"record_id\",\n", + " comparisons=[\n", + " company_name_comparison,\n", + " address_comparison,\n", + " state_comparison,\n", + " city_comparison\n", + " ],\n", + " blocking_rules_to_generate_predictions=BLOCKING_RULES,\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "36cae876-783d-4bff-89df-9d30cc5e60d6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 2.37e-06.\n", + "This means that amongst all possible pairwise record comparisons, one in 421,176.28 are expected to match. With 1,270,622,346 total possible comparisons, we expect a total of around 3,016.84 matching pairs\n" + ] + } + ], + "source": [ + "linker.training.estimate_probability_two_random_records_match(deterministic_blocking_rules, recall=0.95)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "633e259501c64a8a948349d6e585b092", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9994447a3ef04fb190cf0a135bba903a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n", + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - company_name_no_legal (no m values are trained).\n", + " - street_address (no m values are trained).\n", + " - state (no m values are trained).\n", + " - city (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=1e8)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n", + "Estimating the m probabilities of the model by blocking on:\n", + "(l.\"company_name\" = r.\"company_name\") AND (l.\"company_name\" = r.\"company_name\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - company_name_no_legal\n", + " - street_address\n", + " - state\n", + " - city\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + "\n", + "WARNING:\n", + "Level Jaro-Winkler distance of company_name_no_legal >= 0.95 on comparison company_name_no_legal not observed in dataset, unable to train m value\n", + "\n", + "WARNING:\n", + "Level All other comparisons on comparison company_name_no_legal not observed in dataset, unable to train m value\n", + "\n", + "Iteration 1: Largest change in params was 0.702 in the m_probability of street_address, level `All other comparisons`\n", + "Iteration 2: Largest change in params was 0.284 in probability_two_random_records_match\n", + "Iteration 3: Largest change in params was 0.282 in probability_two_random_records_match\n", + "Iteration 4: Largest change in params was 0.000534 in probability_two_random_records_match\n", + 
"Iteration 5: Largest change in params was 1.09e-07 in probability_two_random_records_match\n", + "\n", + "EM converged after 5 iterations\n", + "m probability not trained for company_name_no_legal - Jaro-Winkler distance of company_name_no_legal >= 0.95 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n", + "m probability not trained for company_name_no_legal - All other comparisons (comparison vector value: 0). This usually means the comparison level was never observed in the training data.\n", + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - company_name_no_legal (some m values are not trained).\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on(\"company_name\", \"company_name\")\n", + "training_session_fname_sname = (\n", + " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "9581aa18-3352-429a-86c4-6078bcf13a55", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n", + "Estimating the m probabilities of the model by blocking on:\n", + "(l.\"street_address\" = r.\"street_address\") AND (l.\"street_address\" = r.\"street_address\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - company_name_no_legal\n", + " - state\n", + " - city\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - street_address\n", + "\n", + "Iteration 1: Largest change in params was -0.969 in the m_probability of company_name_no_legal, level `Exact match on company_name_no_legal`\n", + "Iteration 2: Largest change in params was 0.457 in probability_two_random_records_match\n", + "Iteration 3: Largest change in params was 0.041 in 
probability_two_random_records_match\n", + "Iteration 4: Largest change in params was 0.0429 in the m_probability of city, level `All other comparisons`\n", + "Iteration 5: Largest change in params was 0.0184 in probability_two_random_records_match\n", + "Iteration 6: Largest change in params was 0.00693 in probability_two_random_records_match\n", + "Iteration 7: Largest change in params was 0.0026 in probability_two_random_records_match\n", + "Iteration 8: Largest change in params was 0.00099 in probability_two_random_records_match\n", + "Iteration 9: Largest change in params was 0.000379 in probability_two_random_records_match\n", + "Iteration 10: Largest change in params was 0.000145 in probability_two_random_records_match\n", + "Iteration 11: Largest change in params was 5.59e-05 in probability_two_random_records_match\n", + "\n", + "EM converged after 11 iterations\n", + "\n", + "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on(\"street_address\", \"street_address\")\n", + "training_session_fname_sname = (\n", + " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "61298aa2-dbd4-4f2a-9c25-5f831d226d13", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "f365f59e-e4d0-44f3-a1fb-62e0d63d7ba3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.m_u_parameters_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "fedb78e1-ee73-4d1e-8a96-3b27f6561a91", + "metadata": {}, + "outputs": [], + "source": [ + "# you could save the model weights like this\n", + "settings = linker.misc.save_model_to_json(\n", + " \"model_unsupervised_0.json\", overwrite=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "31f9d73d-cfa4-41fa-906f-c8501a29283b", + "metadata": {}, + "source": [ + "## Make Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "94e96441-89b6-4516-aa6a-4d1593ce03be", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Blocking time: 0.22 seconds\n", + "Predict time: 0.40 seconds\n" + ] + } + ], + "source": [ + "df_predictions = linker.inference.predict()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0", + "metadata": {}, + "outputs": [], + "source": [ + "preds_df = df_predictions.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_no_legal_lcompany_name_no_legal_rgamma_company_name_no_legaltf_company_name_no_legal_ltf_company_name_no_legal_rbf_company_name_no_legalbf_tf_adj_company_name_no_legalstreet_address_lstreet_address_rgamma_street_addresstf_street_address_ltf_street_address_rbf_street_addressbf_tf_adj_street_addressstate_lstate_rgamma_statetf_state_ltf_state_rbf_statebf_tf_adj_statecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rmatch_key
463293-22.9671561.219543e-07__splink__input_table_0__splink__input_table_15942320092west coast entertainmentwest line solar00.0000240.0000120.9860461.000000po box 14002180 south 1300 east00.0000240.0001100.8816571.000000paut00.0294210.0105490.1990221.000000langhornesalt lake city00.0001830.0057720.2968661.000000WST KST ENTRTNMNTWST LN SLR0
296844-22.9671561.219543e-07__splink__input_table_0__splink__input_table_13682612221morgan j p andmorgantown energy associates00.0000120.0000120.9860461.00000060 wall st555 beechurst ave00.0008430.0000120.8816571.000000nywv00.1201540.0017700.1990221.000000new yorkmorgantown00.0907950.0000860.2968661.000000MRKN J P ANTMRKNTN ENRJ ASXTS0
296843-22.9671561.219543e-07__splink__input_table_0__splink__input_table_13682512221morgan groupmorgantown energy associates00.0000120.0000120.9860461.0000002746 old u s 20 w555 beechurst ave00.0000120.0000120.8816571.000000inwv00.0082990.0017700.1990221.000000elkhartmorgantown00.0002200.0000860.2968661.000000MRKN KRPMRKNTN ENRJ ASXTS0
296842-22.9671561.219543e-07__splink__input_table_0__splink__input_table_13682412221morgan group holdingmorgantown energy associates00.0000120.0000120.9860461.000000401 theodore fremd ave555 beechurst ave00.0000240.0000120.8816571.000000nywv00.1201540.0017700.1990221.000000ryemorgantown00.0002930.0000860.2968661.000000MRKN KRP HLTNKMRKNTN ENRJ ASXTS0
296841-22.9671561.219543e-07__splink__input_table_0__splink__input_table_13682312221morgan creek energymorgantown energy associates00.0000240.0000120.9860461.000000quorum555 beechurst ave00.0000120.0000120.8816571.000000txwv00.0798410.0017700.1990221.000000dallasmorgantown00.0138420.0000860.2968661.000000MRKN KRK ENRJMRKNTN ENRJ ASXTS0
..................................................................................................................
30412527.5193321.000000e+00__splink__input_table_0__splink__input_table_13981613109northwestern public servicenorthwestern public service20.0000730.000073416213.6769190.01657833 third st se33 third st se20.0000370.00003711404.7441470.262779sdsd10.0019300.00193015.43227927.240566huronhuron20.0000730.00007397.46483395.630090NR0WSTRN PBLK SRFSNR0WSTRN PBLK SRFS0
21455227.5378291.000000e+00__splink__input_table_0__splink__input_table_1246508047green mountain powergreen mountain power20.0000370.000037416213.6769190.033156163 acorn ln163 acorn ln20.0000370.00003711404.7441470.262779vtvt10.0015250.00152515.43227934.490071colchestercolchester20.0001830.00018397.46483338.252036KRN MNTN PWRKRN MNTN PWR0
43782327.7590731.000000e+00__splink__input_table_0__splink__input_table_15884219906wausau paper millswausau paper mills20.0000240.000024416213.6769190.049733one clarks isone clarks is20.0000240.00002411404.7441470.394168wiwi10.0088270.00882715.4322795.956503wausauwausau20.0000610.00006197.464833114.756107WS PPR MLSWS PPR MLS0
38148227.8893611.000000e+00__splink__input_table_0__splink__input_table_15156717450st joseph light and powerst joseph light and power20.0000240.000024416213.6769190.049733520 francis st520 francis st20.0000240.00002411404.7441470.394168momo10.0100820.01008215.4322795.215572st josephst joseph20.0000490.00004997.464833143.445134ST JSF LT ANT PWRST JSF LT ANT PWR0
13731529.2223271.000000e+00__splink__input_table_0__splink__input_table_1205886741fibermarkfibermark20.0000370.000037416213.6769190.033156161 wellington rd161 wellington rd20.0000240.00002411404.7441470.394168vtvt10.0015250.00152515.43227934.490071brattleborobrattleboro20.0000860.00008697.46483381.968648FBRMRKFBRMRK0
\n", + "

590549 rows × 37 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n", + "463293 -22.967156 1.219543e-07 __splink__input_table_0 __splink__input_table_1 59423 20092 west coast entertainment west line solar 0 0.000024 0.000012 0.986046 1.000000 po box 1400 2180 south 1300 east 0 0.000024 0.000110 0.881657 1.000000 pa ut 0 0.029421 0.010549 0.199022 1.000000 langhorne salt lake city 0 0.000183 0.005772 0.296866 1.000000 WST KST ENTRTNMNT WST LN SLR 0\n", + "296844 -22.967156 1.219543e-07 __splink__input_table_0 __splink__input_table_1 36826 12221 morgan j p and morgantown energy associates 0 0.000012 0.000012 0.986046 1.000000 60 wall st 555 beechurst ave 0 0.000843 0.000012 0.881657 1.000000 ny wv 0 0.120154 0.001770 0.199022 1.000000 new york morgantown 0 0.090795 0.000086 0.296866 1.000000 MRKN J P ANT MRKNTN ENRJ ASXTS 0\n", + "296843 -22.967156 1.219543e-07 __splink__input_table_0 __splink__input_table_1 36825 12221 morgan group morgantown energy associates 0 0.000012 0.000012 0.986046 1.000000 2746 old u s 20 w 555 beechurst ave 0 0.000012 0.000012 0.881657 1.000000 in wv 0 0.008299 0.001770 0.199022 1.000000 elkhart morgantown 0 0.000220 0.000086 0.296866 1.000000 MRKN KRP MRKNTN ENRJ ASXTS 0\n", + "296842 -22.967156 1.219543e-07 __splink__input_table_0 __splink__input_table_1 36824 12221 morgan group holding morgantown energy associates 0 0.000012 0.000012 0.986046 1.000000 401 theodore fremd ave 
555 beechurst ave 0 0.000024 0.000012 0.881657 1.000000 ny wv 0 0.120154 0.001770 0.199022 1.000000 rye morgantown 0 0.000293 0.000086 0.296866 1.000000 MRKN KRP HLTNK MRKNTN ENRJ ASXTS 0\n", + "296841 -22.967156 1.219543e-07 __splink__input_table_0 __splink__input_table_1 36823 12221 morgan creek energy morgantown energy associates 0 0.000024 0.000012 0.986046 1.000000 quorum 555 beechurst ave 0 0.000012 0.000012 0.881657 1.000000 tx wv 0 0.079841 0.001770 0.199022 1.000000 dallas morgantown 0 0.013842 0.000086 0.296866 1.000000 MRKN KRK ENRJ MRKNTN ENRJ ASXTS 0\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "304125 27.519332 1.000000e+00 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 416213.676919 0.016578 33 third st se 33 third st se 2 0.000037 0.000037 11404.744147 0.262779 sd sd 1 0.001930 0.001930 15.432279 27.240566 huron huron 2 0.000073 0.000073 97.464833 95.630090 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0\n", + "214552 27.537829 1.000000e+00 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 416213.676919 0.033156 163 acorn ln 163 acorn ln 2 0.000037 0.000037 11404.744147 0.262779 vt vt 1 0.001525 0.001525 15.432279 34.490071 colchester colchester 2 0.000183 0.000183 97.464833 38.252036 KRN MNTN PWR KRN MNTN PWR 0\n", + "437823 27.759073 1.000000e+00 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 416213.676919 0.049733 one clarks is one clarks is 2 0.000024 0.000024 11404.744147 0.394168 wi wi 1 0.008827 0.008827 15.432279 5.956503 wausau wausau 2 0.000061 0.000061 97.464833 114.756107 WS PPR MLS WS PPR MLS 0\n", + "381482 27.889361 1.000000e+00 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph 
light and power st joseph light and power 2 0.000024 0.000024 416213.676919 0.049733 520 francis st 520 francis st 2 0.000024 0.000024 11404.744147 0.394168 mo mo 1 0.010082 0.010082 15.432279 5.215572 st joseph st joseph 2 0.000049 0.000049 97.464833 143.445134 ST JSF LT ANT PWR ST JSF LT ANT PWR 0\n", + "137315 29.222327 1.000000e+00 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 416213.676919 0.033156 161 wellington rd 161 wellington rd 2 0.000024 0.000024 11404.744147 0.394168 vt vt 1 0.001525 0.001525 15.432279 34.490071 brattleboro brattleboro 2 0.000086 0.000086 97.464833 81.968648 FBRMRK FBRMRK 0\n", + "\n", + "[590549 rows x 37 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df.sort_values(by=\"match_probability\")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "c0b292c8-26ed-407a-866e-75851577d567", + "metadata": {}, + "outputs": [], + "source": [ + "# join on utility_id_eia and CIK\n", + "preds_validation_df = preds_df.merge(clean_basic_10k_df[[\"record_id\", \"sec_company_id\", \"central_index_key\", \"company_name_raw\"]],\n", + " how=\"left\",\n", + " left_on=\"record_id_l\",\n", + " right_on=\"record_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0", + "metadata": {}, + "outputs": [], + "source": [ + "preds_validation_df = preds_validation_df.merge(clean_eia_df[[\"record_id\", \"utility_id_eia\"]],\n", + " how=\"left\",\n", + " left_on=\"record_id_r\",\n", + " right_on=\"record_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "5103190c-3775-427f-a8f2-cc8a8f79892b", + "metadata": {}, + "outputs": [], + "source": [ + "preds_validation_df = preds_validation_df.sort_values(\n", + " by=[\"sec_company_id\", \"utility_id_eia\", \"match_probability\"], ascending=False\n", + 
").drop_duplicates(subset=[\"sec_company_id\", \"utility_id_eia\"], keep=\"first\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_no_legal_lcompany_name_no_legal_rgamma_company_name_no_legaltf_company_name_no_legal_ltf_company_name_no_legal_rbf_company_name_no_legalbf_tf_adj_company_name_no_legalstreet_address_lstreet_address_rgamma_street_addresstf_street_address_ltf_street_address_rbf_street_addressbf_tf_adj_street_addressstate_lstate_rgamma_statetf_state_ltf_state_rbf_statebf_tf_adj_statecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rmatch_keyrecord_id_xsec_company_idcentral_index_keycompany_name_rawrecord_id_yutility_id_eia
90773.8130650.933579__splink__input_table_0__splink__input_table_1146926293craneentergy nuclear power marketing00.0000120.0000120.9860461.0100 first stamford pl100 first stamford pl20.0001220.00012211404.7441470.078834ctct10.0209740.02097415.4322792.506898stamfordstamford20.0039620.00396297.4648331.770928KRNENTRJ NKLR PWR MRKTNK11469200019440130001944013crane co629355243
4670804.6338110.961280__splink__input_table_0__splink__input_table_1177525535dte electric securitization funding idte sustainable generation00.0000120.0000120.9860461.0one energy plzone energy plz20.0003300.00033011404.7441470.029198mimi10.0151590.01515915.4322793.468588detroitdetroit20.0011490.00114997.4648336.104048TT ELKTRK SKRTSXN FNTNK ITT SSTNBL JNRXN11775200018760680001876068dte electric securitization funding i llc553564331
1154964.6338110.961280__splink__input_table_0__splink__input_table_1177525522dte electric securitization funding idte electric00.0000120.0000370.9860461.0one energy plzone energy plz20.0003300.00033011404.7441470.029198mimi10.0151590.01515915.4322793.468588detroitdetroit20.0011490.00114997.4648336.104048TT ELKTRK SKRTSXN FNTNK ITT ELKTRK01775200018760680001876068dte electric securitization funding i llc55225109
\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", + "9077 3.813065 0.933579 __splink__input_table_0 __splink__input_table_1 14692 6293 crane entergy nuclear power marketing 0 0.000012 0.000012 0.986046 1.0 100 first stamford pl 100 first stamford pl 2 0.000122 0.000122 11404.744147 0.078834 ct ct 1 0.020974 0.020974 15.432279 2.506898 stamford stamford 2 0.003962 0.003962 97.464833 1.770928 KRN ENTRJ NKLR PWR MRKTNK 1 14692 0001944013 0001944013 crane co 6293 55243\n", + "467080 4.633811 0.961280 __splink__input_table_0 __splink__input_table_1 17752 5535 dte electric securitization funding i dte sustainable generation 0 0.000012 0.000012 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 11404.744147 0.029198 mi mi 1 0.015159 0.015159 15.432279 3.468588 detroit detroit 2 0.001149 0.001149 97.464833 6.104048 TT ELKTRK SKRTSXN FNTNK I TT SSTNBL JNRXN 1 17752 0001876068 0001876068 dte electric securitization funding i llc 5535 64331\n", + "115496 4.633811 0.961280 __splink__input_table_0 __splink__input_table_1 17752 5522 dte electric securitization funding i dte electric 0 0.000012 0.000037 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 11404.744147 0.029198 mi mi 1 0.015159 0.015159 15.432279 3.468588 detroit detroit 2 0.001149 0.001149 97.464833 6.104048 TT ELKTRK 
SKRTSXN FNTNK I TT ELKTRK 0 17752 0001876068 0001876068 dte electric securitization funding i llc 5522 5109" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_validation_df[preds_validation_df.match_probability > .9].head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "07fbec17-cef2-4b9c-a005-1623c65c5e20", + "metadata": {}, + "source": [ + "Figure out what to do about this validation CSV, maybe it should be part of package data? It's not a very big sample size and it's imperfect, so the metrics gained from it should be taken with a grain of salt." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "11190456-12a9-49df-b863-7a6f674e39eb", + "metadata": {}, + "outputs": [], + "source": [ + "from mozilla_sec_eia.library.validation_helpers import load_validation_data\n", + "\n", + "validation_df = load_validation_data(\"sec_eia_validation_set.csv\")\n", + "validation_df = validation_df.astype({\"central_index_key\": str})" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81", + "metadata": {}, + "outputs": [], + "source": [ + "validation_df[\"central_index_key\"] = validation_df[\"central_index_key\"].str.zfill(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = validation_df.merge(\n", + " preds_validation_df[[\"record_id_l\", \"record_id_r\", \"central_index_key\", \"utility_id_eia\", \"match_probability\", \"gamma_company_name_no_legal\"]].drop_duplicates(keep=\"first\"),\n", + " how=\"left\",\n", + " on=[\"central_index_key\", \"utility_id_eia\"],\n", + " indicator=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "4d45f339-7a5b-466a-81f5-c71e425a77df", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df[\"predicted_match\"] = 
merged_df[\"_merge\"].map({\"both\": 1, \"left_only\": 0})" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "182732ea-39c5-4fb4-9429-5f7aed781ef5", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df[\"predicted_match\"] = merged_df[\"predicted_match\"].where(\n", + " (merged_df.match_probability > .95),\n", + " 0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
central_index_keyutility_id_eiasec_company_nameeia_company_namematchrecord_id_lrecord_id_rmatch_probabilitygamma_company_name_no_legal_mergepredicted_match
00000003153195alabama power coNaN11701.0478.01.0000002.0both1.0
1000186894158702fluence energy, inc.Fluence021792.06889.00.0165900.0both0.0
200000410917140georgia power coNaN123416.07653.00.9999972.0both1.0
300000221984062columbus southern power co /oh/Columbus Southern Power Co113310.04281.00.9999811.0both1.0
400013261605416duke energy corpNaN117793.05564.00.9272502.0both0.0
5000003037154905duke energy carolinas, llcDuke Energy Carolinas LLC117790.05558.00.9999872.0both1.0
6000086944657140berkshire realty co inc /deBerkshire Wind Power Cooperative Corp07449.01712.00.0019120.0both0.0
7000009212218195southern cosouthern co services inc050962.017068.00.0072530.0both0.0
8000009212217650southern coSouthern Power Co050963.017089.00.0342260.0both0.0
9000007548814328pacific gas & electric coNaN141598.013933.00.9999482.0both1.0
1000010312966526firstenergy corpFirstEnergy021579.06776.00.9999982.0both1.0
11000103129654776firstenergy corpFirstEnergy Nuclear Generation Corp021579.06780.00.9865400.0both1.0
1200010312966458firstenergy corpFirst Energy Services021579.06763.00.0854520.0both0.0
13000103129632208firstenergy corpFirst Energy Corp1NaNNaNNaNNaNleft_only0.0
14000010012224211tucson electric power coNaN155725.018901.01.0000002.0both1.0
15000009627118454tampa electric coNaN153604.018180.00.9910552.0both1.0
1600007159575248dominion energy, incNaN117484.05386.00.9999852.0both1.0
17000101387159883nrg energy, incNRG Energy Gas & Wind Holdings Inc040084.013240.00.2989820.0both0.0
18000101387113377nrg energy incNRG Energy Inc140084.013243.00.9998052.0both1.0
19000078881613994oglethorpe power corpNaN140576.013515.01.0000002.0both1.0
2000000186753266central maine power coNaN110876.03424.01.0000002.0both1.0
21000103220861296sempra energySempra Generation149303.016270.00.5593700.0both0.0
220000004904488american electric power co incAmerican Electric Power Inc12926.0793.00.9960752.0both1.0
2300007159575248dominion energy, incDominion Energy Inc.117484.05386.00.9999852.0both1.0
\n", + "
" + ], + "text/plain": [ + " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", + "0 0000003153 195 alabama power co NaN 1 1701.0 478.0 1.000000 2.0 both 1.0\n", + "1 0001868941 58702 fluence energy, inc. Fluence 0 21792.0 6889.0 0.016590 0.0 both 0.0\n", + "2 0000041091 7140 georgia power co NaN 1 23416.0 7653.0 0.999997 2.0 both 1.0\n", + "3 0000022198 4062 columbus southern power co /oh/ Columbus Southern Power Co 1 13310.0 4281.0 0.999981 1.0 both 1.0\n", + "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927250 2.0 both 0.0\n", + "5 0000030371 54905 duke energy carolinas, llc Duke Energy Carolinas LLC 1 17790.0 5558.0 0.999987 2.0 both 1.0\n", + "6 0000869446 57140 berkshire realty co inc /de Berkshire Wind Power Cooperative Corp 0 7449.0 1712.0 0.001912 0.0 both 0.0\n", + "7 0000092122 18195 southern co southern co services inc 0 50962.0 17068.0 0.007253 0.0 both 0.0\n", + "8 0000092122 17650 southern co Southern Power Co 0 50963.0 17089.0 0.034226 0.0 both 0.0\n", + "9 0000075488 14328 pacific gas & electric co NaN 1 41598.0 13933.0 0.999948 2.0 both 1.0\n", + "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986540 0.0 both 1.0\n", + "12 0001031296 6458 firstenergy corp First Energy Services 0 21579.0 6763.0 0.085452 0.0 both 0.0\n", + "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n", + "14 0000100122 24211 tucson electric power co NaN 1 55725.0 18901.0 1.000000 2.0 both 1.0\n", + "15 0000096271 18454 tampa electric co NaN 1 53604.0 18180.0 0.991055 2.0 both 1.0\n", + "16 0000715957 5248 dominion energy, inc NaN 1 17484.0 5386.0 0.999985 2.0 both 1.0\n", + "17 0001013871 59883 nrg energy, inc NRG Energy Gas & Wind Holdings Inc 0 40084.0 13240.0 0.298982 
0.0 both 0.0\n", + "18 0001013871 13377 nrg energy inc NRG Energy Inc 1 40084.0 13243.0 0.999805 2.0 both 1.0\n", + "19 0000788816 13994 oglethorpe power corp NaN 1 40576.0 13515.0 1.000000 2.0 both 1.0\n", + "20 0000018675 3266 central maine power co NaN 1 10876.0 3424.0 1.000000 2.0 both 1.0\n", + "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559370 0.0 both 0.0\n", + "22 0000004904 488 american electric power co inc American Electric Power Inc 1 2926.0 793.0 0.996075 2.0 both 1.0\n", + "23 0000715957 5248 dominion energy, inc Dominion Energy Inc. 1 17484.0 5386.0 0.999985 2.0 both 1.0" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df.head(50)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea", + "metadata": {}, + "outputs": [], + "source": [ + "precision = precision_score(merged_df[\"match\"], merged_df[\"predicted_match\"])\n", + "recall = recall_score(merged_df[\"match\"], merged_df[\"predicted_match\"])\n", + "accuracy = accuracy_score(merged_df[\"match\"], merged_df[\"predicted_match\"])\n", + "# roc_auc = roc_auc_score(merged_df['match'], merged_df['match_probability'])\n", + "\n", + "# Confusion matrix\n", + "conf_matrix = confusion_matrix(merged_df[\"match\"], merged_df[\"predicted_match\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.8666666666666667, 0.8125, 0.7916666666666666)" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "precision, recall, accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "08932be5-b90c-440d-9efb-156cb4d63c93", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Predicted NegativePredicted Positive
Negative62
Positive313
\n", + "
" + ], + "text/plain": [ + " Predicted Negative Predicted Positive\n", + "Negative 6 2\n", + "Positive 3 13" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(\n", + " conf_matrix,\n", + " index=[\"Negative\", \"Positive\"],\n", + " columns=[\"Predicted Negative\", \"Predicted Positive\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "025c80e9-5055-4eaa-a873-38b910cd7f94", + "metadata": {}, + "outputs": [], + "source": [ + "incorrect_df = merged_df[merged_df.match != merged_df.predicted_match]" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
central_index_keyutility_id_eiasec_company_nameeia_company_namematchrecord_id_lrecord_id_rmatch_probabilitygamma_company_name_no_legal_mergepredicted_match
400013261605416duke energy corpNaN117793.05564.00.9272502.0both0.0
1000010312966526firstenergy corpFirstEnergy021579.06776.00.9999982.0both1.0
11000103129654776firstenergy corpFirstEnergy Nuclear Generation Corp021579.06780.00.9865400.0both1.0
13000103129632208firstenergy corpFirst Energy Corp1NaNNaNNaNNaNleft_only0.0
21000103220861296sempra energySempra Generation149303.016270.00.5593700.0both0.0
\n", + "
" + ], + "text/plain": [ + " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", + "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927250 2.0 both 0.0\n", + "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986540 0.0 both 1.0\n", + "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n", + "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559370 0.0 both 0.0" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "incorrect_df" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "c425a676-aa6e-4d8f-b814-931da392c2ff", + "metadata": {}, + "outputs": [], + "source": [ + "recs_to_view = []\n", + "for idx, rec in incorrect_df.iterrows():\n", + " full_rec = preds_validation_df[\n", + " (preds_validation_df.record_id_l == rec.record_id_l) &\n", + " (preds_validation_df.record_id_r == rec.record_id_r)\n", + " ].squeeze()\n", + " if full_rec.empty:\n", + " continue\n", + " recs_to_view.append(full_rec.to_dict())" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "ff55f2cb-7ce1-4697-99e7-bf22918f7ed1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.waterfall_chart(recs_to_view, filter_nulls=True)" + ] + }, + { + "cell_type": "markdown", + "id": "a2ba43b6-a664-462a-823f-e3f08585bb51", + "metadata": {}, + "source": [ + "# Save good predictions\n", + "Make the predictions one to one. First, keep the highest probability EIA utility ID for each SEC company. Then, keep the highest probability SEC company for each EIA utility" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "92172e2f-39ba-49e3-8312-98597256ca4f", + "metadata": {}, + "outputs": [], + "source": [ + "one_to_one_preds = preds_validation_df[preds_validation_df.match_probability >= .95].sort_values(\n", + " by=\"match_probability\", ascending=False\n", + ").drop_duplicates(\n", + " subset=\"sec_company_id\", keep=\"first\"\n", + ").drop_duplicates(\n", + " subset=\"utility_id_eia\", keep=\"first\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "07ca81ae-1b26-4cd3-ade6-75381028028a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "529" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(one_to_one_preds)" + ] + }, + { + "cell_type": "markdown", + "id": "c3db3175-7cf3-497c-8f22-e68a6c9c6af2", + "metadata": {}, + "source": [ + "# Add `utility_id_eia` onto the SEC table to create output table" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "361b3e30-e823-4137-9062-6a00eae537fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_no_legal_lcompany_name_no_legal_rgamma_company_name_no_legaltf_company_name_no_legal_ltf_company_name_no_legal_rbf_company_name_no_legalbf_tf_adj_company_name_no_legalstreet_address_lstreet_address_rgamma_street_addresstf_street_address_ltf_street_address_rbf_street_addressbf_tf_adj_street_addressstate_lstate_rgamma_statetf_state_ltf_state_rbf_statebf_tf_adj_statecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rmatch_keyrecord_id_xsec_company_idcentral_index_keycompany_name_rawrecord_id_yutility_id_eia
13731529.2223271.000000__splink__input_table_0__splink__input_table_1205886741fibermarkfibermark20.0000370.000037416213.6769190.033156161 wellington rd161 wellington rd20.0000240.00002411404.7441470.394168vtvt10.0015250.00152515.43227934.490071brattleborobrattleboro20.0000860.00008697.46483381.968648FBRMRKFBRMRK02058800008875910000887591fibermark inc67416309
38148227.8893611.000000__splink__input_table_0__splink__input_table_15156717450st joseph light and powerst joseph light and power20.0000240.000024416213.6769190.049733520 francis st520 francis st20.0000240.00002411404.7441470.394168momo10.0100820.01008215.4322795.215572st josephst joseph20.0000490.00004997.464833143.445134ST JSF LT ANT PWRST JSF LT ANT PWR05156700000862510000086251st joseph light & power co1745017881
43782327.7590731.000000__splink__input_table_0__splink__input_table_15884219906wausau paper millswausau paper mills20.0000240.000024416213.6769190.049733one clarks isone clarks is20.0000240.00002411404.7441470.394168wiwi10.0088270.00882715.4322795.956503wausauwausau20.0000610.00006197.464833114.756107WS PPR MLSWS PPR MLS05884200001050760000105076wausau paper mills co1990620190
21455227.5378291.000000__splink__input_table_0__splink__input_table_1246508047green mountain powergreen mountain power20.0000370.000037416213.6769190.033156163 acorn ln163 acorn ln20.0000370.00003711404.7441470.262779vtvt10.0015250.00152515.43227934.490071colchestercolchester20.0001830.00018397.46483338.252036KRN MNTN PWRKRN MNTN PWR02465000000437040000043704green mountain power corp80477601
30412527.5193321.000000__splink__input_table_0__splink__input_table_13981613109northwestern public servicenorthwestern public service20.0000730.000073416213.6769190.01657833 third st se33 third st se20.0000370.00003711404.7441470.262779sdsd10.0019300.00193015.43227927.240566huronhuron20.0000730.00007397.46483395.630090NR0WSTRN PBLK SRFSNR0WSTRN PBLK SRFS03981600000730880000073088northwestern public service co1310913809
....................................................................................................................................
87934.3276380.952560__splink__input_table_0__splink__input_table_1586257565warrantechgeneration bridge ii connecticut00.0000240.0000120.9860461.000000300 atlantic st300 atlantic st20.0000860.00008611404.7441470.112619ctct10.0209740.02097415.4322792.506898stamfordstamford20.0039620.00396297.4648331.770928WRNTXJNRXN BRJ KNKTKT15862500007355710000735571warrantech corp756565448
75034.3276380.952560__splink__input_table_0__splink__input_table_1423897567peerless systemsgeneration bridge ii new york00.0000490.0000120.9860461.000000300 atlantic st300 atlantic st20.0000860.00008611404.7441470.112619ctct10.0209740.02097415.4322792.506898stamfordstamford20.0039620.00396297.4648331.770928PRLS SSTMSJNRXN BRJ N YRK14238900008978930000897893peerless systems corp756765417
4642584.2734000.950832__splink__input_table_0__splink__input_table_1162614089airplanes us trustpasadena statutory trust00.0000120.0000120.9860461.0000001100 north market st1100 north market st20.0000610.00006111404.7441470.157667dede10.0117040.01170415.4322794.492404wilmingtonwilmington20.0103210.01032197.4648330.679835ARPLNS US TRSTPSTN STTTR TRST1162600010045400001004540airplanes us trust1408961235
4641734.2734000.950832__splink__input_table_0__splink__input_table_116507605aisystemsgenon sabine delaware00.0000240.0000120.9860461.0000002711 centerville rd2711 centerville rd20.0000610.00006111404.7441470.157667dede10.0117040.01170415.4322794.492404wilmingtonwilmington20.0103210.01032197.4648330.679835ASSTMSJNN SBN TLWR1165000013287690001328769aisystems, inc.760556922
91804.2734000.950832__splink__input_table_0__splink__input_table_11917416368enovisshannon wind00.0000120.0000240.9860461.0000002711 centerville rd2711 centerville rd20.0000610.00006111404.7441470.157667dede10.0117040.01170415.4322794.492404wilmingtonwilmington20.0103210.01032197.4648330.679835ENFSXNN WNT11917400014208000001420800enovis corp1636858872
\n", + "

529 rows × 43 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", + "137315 29.222327 1.000000 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 416213.676919 0.033156 161 wellington rd 161 wellington rd 2 0.000024 0.000024 11404.744147 0.394168 vt vt 1 0.001525 0.001525 15.432279 34.490071 brattleboro brattleboro 2 0.000086 0.000086 97.464833 81.968648 FBRMRK FBRMRK 0 20588 0000887591 0000887591 fibermark inc 6741 6309\n", + "381482 27.889361 1.000000 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 416213.676919 0.049733 520 francis st 520 francis st 2 0.000024 0.000024 11404.744147 0.394168 mo mo 1 0.010082 0.010082 15.432279 5.215572 st joseph st joseph 2 0.000049 0.000049 97.464833 143.445134 ST JSF LT ANT PWR ST JSF LT ANT PWR 0 51567 0000086251 0000086251 st joseph light & power co 17450 17881\n", + "437823 27.759073 1.000000 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 416213.676919 0.049733 one clarks is one clarks is 2 0.000024 0.000024 11404.744147 0.394168 wi wi 1 0.008827 0.008827 15.432279 5.956503 wausau wausau 2 0.000061 0.000061 97.464833 114.756107 WS PPR MLS WS PPR MLS 0 58842 0000105076 
0000105076 wausau paper mills co 19906 20190\n", + "214552 27.537829 1.000000 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 416213.676919 0.033156 163 acorn ln 163 acorn ln 2 0.000037 0.000037 11404.744147 0.262779 vt vt 1 0.001525 0.001525 15.432279 34.490071 colchester colchester 2 0.000183 0.000183 97.464833 38.252036 KRN MNTN PWR KRN MNTN PWR 0 24650 0000043704 0000043704 green mountain power corp 8047 7601\n", + "304125 27.519332 1.000000 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 416213.676919 0.016578 33 third st se 33 third st se 2 0.000037 0.000037 11404.744147 0.262779 sd sd 1 0.001930 0.001930 15.432279 27.240566 huron huron 2 0.000073 0.000073 97.464833 95.630090 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0 39816 0000073088 0000073088 northwestern public service co 13109 13809\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "8793 4.327638 0.952560 __splink__input_table_0 __splink__input_table_1 58625 7565 warrantech generation bridge ii connecticut 0 0.000024 0.000012 0.986046 1.000000 300 atlantic st 300 atlantic st 2 0.000086 0.000086 11404.744147 0.112619 ct ct 1 0.020974 0.020974 15.432279 2.506898 stamford stamford 2 0.003962 0.003962 97.464833 1.770928 WRNTX JNRXN BRJ KNKTKT 1 58625 0000735571 0000735571 warrantech corp 7565 65448\n", + "7503 4.327638 0.952560 __splink__input_table_0 __splink__input_table_1 42389 7567 peerless systems generation bridge ii new york 0 0.000049 0.000012 0.986046 1.000000 300 atlantic st 300 atlantic st 2 0.000086 0.000086 11404.744147 0.112619 ct ct 1 0.020974 0.020974 15.432279 2.506898 stamford stamford 2 0.003962 0.003962 97.464833 1.770928 PRLS SSTMS JNRXN BRJ N YRK 1 42389 0000897893 0000897893 peerless systems corp 7567 65417\n", + "464258 4.273400 0.950832 __splink__input_table_0 __splink__input_table_1 1626 14089 airplanes us trust pasadena statutory trust 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 11404.744147 0.157667 de de 1 0.011704 0.011704 15.432279 4.492404 wilmington wilmington 2 0.010321 0.010321 97.464833 0.679835 ARPLNS US TRST PSTN STTTR TRST 1 1626 0001004540 0001004540 airplanes us trust 14089 61235\n", + "464173 4.273400 0.950832 __splink__input_table_0 __splink__input_table_1 1650 7605 aisystems genon sabine delaware 0 0.000024 0.000012 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 11404.744147 0.157667 de de 1 0.011704 0.011704 15.432279 4.492404 wilmington wilmington 2 0.010321 0.010321 97.464833 0.679835 ASSTMS JNN SBN TLWR 1 1650 0001328769 0001328769 aisystems, inc. 
7605 56922\n", + "9180 4.273400 0.950832 __splink__input_table_0 __splink__input_table_1 19174 16368 enovis shannon wind 0 0.000012 0.000024 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 11404.744147 0.157667 de de 1 0.011704 0.011704 15.432279 4.492404 wilmington wilmington 2 0.010321 0.010321 97.464833 0.679835 ENFS XNN WNT 1 19174 0001420800 0001420800 enovis corp 16368 58872\n", + "\n", + "[529 rows x 43 columns]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "one_to_one_preds" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "1d3e41bd-f92a-4f77-a0a7-0bd24f7ea70c", + "metadata": {}, + "outputs": [], + "source": [ + "out_df = clean_basic_10k_df.merge(\n", + " one_to_one_preds[[\"sec_company_id\", \"utility_id_eia\"]],\n", + " how=\"left\",\n", + " on=\"sec_company_id\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "cce2b383-48b3-4efd-977a-0c734b0e3ec2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "utility_id_eia\n", + "True 59900\n", + "False 1126\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out_df.utility_id_eia.isnull().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "7385664a-4a40-44e8-b7bf-917a623be158", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['record_id', 'company_name', 'street_address', 'filename', 'phone_number', 'central_index_key', 'city', 'company_name_raw', 'date_of_name_change', 'film_number', 'fiscal_year_end', 'form_type', 'former_conformed_name', 'irs_number', 'organization_name', 'sec_act', 'sec_file_number', 'standard_industrial_classification', 'state', 'state_of_incorporation', 'street_address_2', 'zip_code', 'report_date', 'report_year', 'location_of_inc', 'company_name_no_legal', 
'company_name_mphone', 'files_10k', 'sec_company_id', 'utility_id_eia'], dtype='object')" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c9f50d5-f612-4887-b3ef-1ba810973d1a", + "metadata": {}, + "outputs": [], + "source": [ + "dagstermill.yield_result(out_df, output_name=\"sec_10k_filers_matched_df\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bfccfa0-17eb-4373-8a89-b578ff00349e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index 666f010..29e4595 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -7,7 +7,8 @@ import numpy as np import pandas as pd -from dagster import AssetIn, asset +from dagster import AssetIn, asset, file_relative_path +from dagstermill import define_dagstermill_asset from mozilla_sec_eia.library.record_linkage_utils import ( expand_street_name_abbreviations, @@ -172,6 +173,12 @@ def match_ex21_subsidiaries_to_filer_company( lambda row: len(set(row["loc_tokens_sec"]) & set(row["loc_tokens_ex21"])), axis=1, ) + merged_df = merged_df.fillna( + { + "report_year_sec": 0, + "report_year_ex21": 0, + } + ) # get the difference in report years 
merged_df["report_year_diff"] = merged_df.apply( lambda row: abs(int(row["report_year_sec"]) - int(row["report_year_ex21"])), @@ -316,7 +323,7 @@ def transform_basic10k_table( "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"), }, ) -def core_sec_10k__filers( +def transformed_basic_10k( basic_10k_dfs: dict[str, pd.DataFrame], sec10k_filing_metadata_dfs: dict[str, pd.DataFrame], ) -> pd.DataFrame: @@ -330,11 +337,21 @@ def core_sec_10k__filers( sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values()) basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata) out_df = basic_10k_df.fillna(np.nan).reset_index(names="record_id") - # match EIA utilities to filers - # TODO: Here we conduct the match to EIA and add on a column with utility_id_eia + return out_df +core_sec_10k__filers = define_dagstermill_asset( + "core_sec_10k__filers", + notebook_path=file_relative_path(__file__, "./notebooks/splink-sec-eia.ipynb"), + ins={ + "clean_eia_df": AssetIn("core_eia__parents_and_subsidiaries"), + "clean_basic_10k_df": AssetIn("transformed_basic_10k"), + }, + save_notebook_on_failure=True, +) + + @asset( ins={ "sec_10k_filers_matched_df": AssetIn("core_sec_10k__filers"), @@ -386,7 +403,8 @@ def out_sec_10k__parents_and_subsidiaries( production_assets = [ - core_sec_10k__filers, + transformed_basic_10k, transformed_ex21_subsidiary_table, + core_sec_10k__filers, out_sec_10k__parents_and_subsidiaries, ] From 4474239ec0561afaef259a779eafea7c8ff0d7d9 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 15 Jan 2025 15:20:42 -0500 Subject: [PATCH 11/16] Get notebook asset working correctly --- .../notebooks/splink-sec-eia.ipynb | 32 +++---------------- .../transform_sec_input.py | 7 ++-- 2 files changed, 10 insertions(+), 29 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/notebooks/splink-sec-eia.ipynb b/src/mozilla_sec_eia/models/sec_eia_record_linkage/notebooks/splink-sec-eia.ipynb index 1263b43..4da6330 100644 
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/notebooks/splink-sec-eia.ipynb +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/notebooks/splink-sec-eia.ipynb @@ -15,9 +15,7 @@ "id": "e136eeca-133b-4f61-8e37-5e1ea8c99683", "metadata": {}, "outputs": [], - "source": [ - "import dagstermill" - ] + "source": [] }, { "cell_type": "code", @@ -3742,39 +3740,19 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 66, "id": "7385664a-4a40-44e8-b7bf-917a623be158", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['record_id', 'company_name', 'street_address', 'filename', 'phone_number', 'central_index_key', 'city', 'company_name_raw', 'date_of_name_change', 'film_number', 'fiscal_year_end', 'form_type', 'former_conformed_name', 'irs_number', 'organization_name', 'sec_act', 'sec_file_number', 'standard_industrial_classification', 'state', 'state_of_incorporation', 'street_address_2', 'zip_code', 'report_date', 'report_year', 'location_of_inc', 'company_name_no_legal', 'company_name_mphone', 'files_10k', 'sec_company_id', 'utility_id_eia'], dtype='object')" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "out_df.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c9f50d5-f612-4887-b3ef-1ba810973d1a", - "metadata": {}, "outputs": [], "source": [ - "dagstermill.yield_result(out_df, output_name=\"sec_10k_filers_matched_df\")" + "# Hacky write directly to GCS\n", + "out_df.to_parquet(\"gs://sec10k-outputs/v2/core_sec_10k__filers.parquet\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "9bfccfa0-17eb-4373-8a89-b578ff00349e", + "id": "2a1aa1c1-c497-439a-a66f-71d308772fce", "metadata": {}, "outputs": [], "source": [] diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index 29e4595..bbdf271 100644 --- 
a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -354,13 +354,13 @@ def transformed_basic_10k( @asset( ins={ - "sec_10k_filers_matched_df": AssetIn("core_sec_10k__filers"), "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), "clean_eia_df": AssetIn("core_eia__parents_and_subsidiaries"), }, + deps=["core_sec_10k__filers"], + io_manager_key="pandas_parquet_io_manager", ) def out_sec_10k__parents_and_subsidiaries( - sec_10k_filers_matched_df: pd.DataFrame, clean_ex21_df: pd.DataFrame, clean_eia_df: pd.DataFrame, ) -> pd.DataFrame: @@ -370,6 +370,9 @@ def out_sec_10k__parents_and_subsidiaries( filing companies. Create an sec_company_id for subsidiaries that aren't linked to a CIK. """ + sec_10k_filers_matched_df = pd.read_parquet( + "gs://sec10k-outputs/v2/core_sec_10k__filers.parquet" + ) ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company( basic10k_df=sec_10k_filers_matched_df, ex21_df=clean_ex21_df ) From be2f6a534b5a0928c119c5e0dd5ed23295eaf080 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Wed, 15 Jan 2025 21:57:20 -0800 Subject: [PATCH 12/16] add pandera schemas --- src/mozilla_sec_eia/models/sec10k/entities.py | 119 ++++++++++++++++++ .../transform_eia_input.py | 2 + .../transform_sec_input.py | 5 + 3 files changed, 126 insertions(+) diff --git a/src/mozilla_sec_eia/models/sec10k/entities.py b/src/mozilla_sec_eia/models/sec10k/entities.py index 2ee5b23..9198259 100644 --- a/src/mozilla_sec_eia/models/sec10k/entities.py +++ b/src/mozilla_sec_eia/models/sec10k/entities.py @@ -63,7 +63,126 @@ class Ex21Layout(pa.DataFrameModel): ) +class Sec10kCoreTable(pa.DataFrameModel): + """Define table structure for core SEC companies table.""" + + sec_company_id: Series[str] = pa.Field( + description="Assigned identifier for the company." 
+ ) + filename: Series[str] = pa.Field(description="Name of extracted filing.") + central_index_key: Series[str] = pa.Field( + description="Identifier of the company in SEC database." + ) + report_date: Series[pa.DateTime] = pa.Field( + description="Report date of the record." + ) + company_name: Series[str] = pa.Field( + description="Cleaned name of the company with legal terms expanded." + ) + utility_id_eia: Series[int] = pa.Field( + description="EIA utility identifier for the company. Matched via record linkage model.", + nullable=True, + ) + street_address: Series[str] = pa.Field( + description="Street address of the company.", nullable=True + ) + street_address_2: Series[str] = pa.Field( + description="Secondary street address of the company.", nullable=True + ) + phone_number: Series[str] = pa.Field( + description="Phone number of company.", nullable=True + ) + city: Series[str] = pa.Field( + description="The city where the company is located.", nullable=True + ) + state: Series[str] = pa.Field( + description="Two letter state code where the company is located.", nullable=True + ) + state_of_incorporation: Series[str] = pa.Field( + description="Two letter state code where the company is located.", nullable=True + ) + zip_code: Series[str] = pa.Field( + description="5 digit zip code where the company is located.", nullable=True + ) + company_name_raw: Series[str] = pa.Field( + description="The raw company name.", nullable=True + ) + date_of_name_change: Series[pa.DateTime] = pa.Field( + description="Date of last name change of the company.", nullable=True + ) + location_of_inc: Series[str] = pa.Field( + description="Cleaned location of incorporation of the company.", nullable=True + ) + company_name_no_legal: Series[str] = pa.Field( + description="Company name with legal terms stripped, e.g. LLC", nullable=True + ) + company_name_mphone: Series[str] = pa.Field( + description="Metaphone of the company name, could used for record linkage." 
+ ) + files_10k: Series[bool] = pa.Field( + description="Indicates whether the company files a 10-K." + ) + + +class Sec10kOutputTable(pa.DataFrameModel): + """Define table structure for output parents and subsidiaries table.""" + + parent_company_cik: Series[str] = pa.Field( + description="CIK of the company's parent company.", nullable=True + ) + own_per: Series[float] = pa.Field( + description="Parent company's ownership percentage of the company.", + nullable=True, + ) + + +class EiaCompanies(pa.DataFrameModel): + """Define table structure for EIA owner and operator companies table.""" + + company_name: Series[str] = pa.Field( + description="Cleaned name of the owner or operator company with legal terms expanded." + ) + street_address: Series[str] = pa.Field( + description="Street address of the company.", nullable=True + ) + street_address_2: Series[str] = pa.Field( + description="Secondary street address of the company.", nullable=True + ) + utility_id_eia: Series[int] = pa.Field( + description="EIA utility identifier for the company.", coerce=True + ) + company_name_raw: Series[str] = pa.Field(description="The raw company name.") + # TODO: What type for type expression? + report_date: Series[pa.DateTime] = pa.Field( + description="Report date of the record." + ) + report_year: Series[int] = pa.Field( + description="Report year of the record.", coerce=True + ) + city: Series[str] = pa.Field( + description="The city where the company is located.", nullable=True + ) + state: Series[str] = pa.Field( + description="Two letter state code where the company is located.", nullable=True + ) + zip_code: Series[str] = pa.Field( + description="5 digit zip code where the company is located.", nullable=True + ) + phone_number: Series[str] = pa.Field( + description="Phone number of company.", nullable=True + ) + company_name_no_legal: Series[str] = pa.Field( + description="Company name with legal terms stripped, e.g. 
LLC", nullable=True + ) + company_name_mphone: Series[str] = pa.Field( + description="Metaphone of the company name, could used for record linkage." + ) + + ex21_extract_type = pandera_schema_to_dagster_type(Ex21CompanyOwnership) basic_10k_extract_type = pandera_schema_to_dagster_type(Basic10kCompanyInfo) sec10k_extract_metadata_type = pandera_schema_to_dagster_type(Sec10kExtractionMetadata) ex21_layout_type = pandera_schema_to_dagster_type(Ex21Layout) +eia_layout_type = pandera_schema_to_dagster_type(EiaCompanies) +sec10k_output_layout_type = pandera_schema_to_dagster_type(Sec10kOutputTable) +sec10k_core_layout_type = pandera_schema_to_dagster_type(Sec10kCoreTable) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py index b12ac71..4da37eb 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py @@ -10,6 +10,7 @@ flatten_companies_across_time, transform_company_name, ) +from mozilla_sec_eia.models.sec10k.entities import eia_layout_type from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS EIA_COL_MAP = { @@ -77,6 +78,7 @@ def harvest_eia861_utilities(): @asset( name="core_eia__parents_and_subsidiaries", io_manager_key="pandas_parquet_io_manager", + dagster_type=eia_layout_type, ) # TODO: add Dagster asset inputs for PUDL inputs instead of reading from AWS? 
def eia_rl_input_table(): diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index bbdf271..930a29d 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -16,6 +16,9 @@ flatten_companies_across_time, transform_company_name, ) +from mozilla_sec_eia.models.sec10k.entities import ( + sec10k_output_layout_type, +) from mozilla_sec_eia.models.sec10k.utils.cloud import ( convert_ex21_id_to_filename, ) @@ -359,6 +362,7 @@ def transformed_basic_10k( }, deps=["core_sec_10k__filers"], io_manager_key="pandas_parquet_io_manager", + dagster_type=sec10k_output_layout_type, ) def out_sec_10k__parents_and_subsidiaries( clean_ex21_df: pd.DataFrame, @@ -373,6 +377,7 @@ def out_sec_10k__parents_and_subsidiaries( sec_10k_filers_matched_df = pd.read_parquet( "gs://sec10k-outputs/v2/core_sec_10k__filers.parquet" ) + sec_10k_filers_matched_df = sec_10k_filers_matched_df.drop(columns="record_id") ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company( basic10k_df=sec_10k_filers_matched_df, ex21_df=clean_ex21_df ) From a443c0abdcaaa26939a7c0ce06997c54914b892f Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 16 Jan 2025 11:07:49 -0500 Subject: [PATCH 13/16] Add pyarrow schemas for final tables --- src/mozilla_sec_eia/models/schema.py | 134 ++++++++++++++++++ src/mozilla_sec_eia/models/sec10k/entities.py | 4 +- .../models/sec_eia_record_linkage/entities.py | 20 +++ 3 files changed, 156 insertions(+), 2 deletions(-) create mode 100644 src/mozilla_sec_eia/models/schema.py create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/entities.py diff --git a/src/mozilla_sec_eia/models/schema.py b/src/mozilla_sec_eia/models/schema.py new file mode 100644 index 0000000..68852ec --- /dev/null +++ b/src/mozilla_sec_eia/models/schema.py @@ -0,0 +1,134 @@ +"""Define 
schema for PUDL models.""" + +import pyarrow as pa + +PUDL_MODELS_SCHEMA = { + "core_sec10k__filings": pa.schema( + [ + pa.field( + "filename", + pa.string(), + metadata={ + "description": "Name of filing as provided by SEC data portal." + }, + ), + pa.field( + "cik", + pa.string(), + metadata={ + "description": "SEC Central Index Key, which uniquely identifies corporations." + }, + ), + pa.field( + "company_name", + pa.string(), + metadata={"description": "Name of company submitting filing."}, + ), + pa.field( + "form_type", + pa.string(), + metadata={"description": "Specific version of SEC 10k filed."}, + ), + pa.field( + "date_filed", + pa.date64(), + metadata={"description": "Date filing was submitted."}, + ), + pa.field( + "exhibit_21_version", + pa.string(), + metadata={ + "description": "Version of exhibit 21 submitted (if applicable)." + }, + ), + pa.field( + "year_quarter", + pa.string(), + metadata={"description": "Year quarter filing applies to."}, + ), + ], + metadata={"description": "Metadata describing all submitted SEC 10k filings."}, + ), + "core_sec10k__exhibit_21_company_ownership": pa.schema( + [ + pa.field( + "filename", + pa.string(), + metadata={ + "description": "Name of filing as provided by SEC data portal." + }, + ), + pa.field( + "subsidiary", + pa.string(), + metadata={"description": "Name of subsidiary company."}, + ), + pa.field( + "location", + pa.string(), + metadata={"description": "Location of subsidiary company."}, + ), + pa.field( + "ownership_percentage", + pa.string(), + metadata={ + "description": "Percentage of subsidiary company owned by parent." + }, + ), + pa.field( + "year_quarter", + pa.string(), + metadata={"description": "Year quarter filing applies to."}, + ), + ], + metadata={ + "description": "Company ownership data extracted from Exhibit 21 attachments to SEC 10k filings." 
+ }, + ), + "core_sec10k__company_information": pa.schema( + [ + pa.field( + "filename", + pa.string(), + metadata={ + "description": "Name of filing as provided by SEC data portal." + }, + ), + pa.field( + "filer_count", + pa.int64(), + metadata={ + "description": "Index company information as some filings contain information for multiple companies." + }, + ), + pa.field( + "block", + pa.string(), + metadata={"description": "Title of block of data."}, + ), + pa.field( + "block_count", + pa.int64(), + metadata={ + "description": "Some blocks are repeated, `block_count` defines the index of the data block." + }, + ), + pa.field( + "key", + pa.string(), + metadata={"description": "Key within block."}, + ), + pa.field( + "value", + pa.string(), + metadata={"description": "String value of data point."}, + ), + pa.field( + "year_quarter", + pa.string(), + metadata={"description": "Year quarter filing applies to."}, + ), + ], + metadata={"description": "Company information extracted from SEC 10k filings."}, + ), +} diff --git a/src/mozilla_sec_eia/models/sec10k/entities.py b/src/mozilla_sec_eia/models/sec10k/entities.py index 2ee5b23..ff523e9 100644 --- a/src/mozilla_sec_eia/models/sec10k/entities.py +++ b/src/mozilla_sec_eia/models/sec10k/entities.py @@ -25,11 +25,11 @@ class Basic10kCompanyInfo(pa.DataFrameModel): """Define table structure for extracted basic 10k data.""" filename: Index[str] = pa.Field(description="Name of extracted filing.") - filer_count: Index[str] = pa.Field( + filer_count: Index[int] = pa.Field( description="Some filings have multiple blocks of company data." 
) block: Index[str] = pa.Field(description="Block of company data.") - block_count: Index[str] = pa.Field(description="Some blocks occur multiple times.") + block_count: Index[int] = pa.Field(description="Some blocks occur multiple times.") key: Index[str] = pa.Field(description="Key within block.") value: Series[str] = pa.Field(description="Company info fact.") diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/entities.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/entities.py new file mode 100644 index 0000000..4dbefa4 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/entities.py @@ -0,0 +1,20 @@ +"""Define schemas for tables using pandera.""" + +import pandera as pa +from pandera.typing import Series + + +class CoreSec10kFilers(pa.DataFrameModel): + """.""" + + _id: Series[str] = pa.Field(alias="id", description="ID of extracted filing.") + subsidiary: Series[str] = pa.Field(description="Name of subsidiary company.") + loc: Series[str] = pa.Field( + description="Location of subsidiary company.", nullable=True + ) + #: Use str to avoid conversion errors + own_per: Series[str] = pa.Field( + description="Percent ownership of subsidiary company.", + nullable=True, + coerce=True, + ) From 19d206e3569882bd36b89a9a6415bbccd5f41d54 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Mon, 20 Jan 2025 09:55:24 -0800 Subject: [PATCH 14/16] fix pandera schemas --- src/mozilla_sec_eia/models/sec10k/entities.py | 196 +++++++++--------- 1 file changed, 102 insertions(+), 94 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/entities.py b/src/mozilla_sec_eia/models/sec10k/entities.py index 9198259..183ca29 100644 --- a/src/mozilla_sec_eia/models/sec10k/entities.py +++ b/src/mozilla_sec_eia/models/sec10k/entities.py @@ -4,6 +4,53 @@ from dagster_pandera import pandera_schema_to_dagster_type from pandera.typing import Index, Series +sec_company_id = pa.Field(description="Assigned identifier for the company.") +filename = 
pa.Field(description="Name of extracted filing.") +central_index_key = pa.Field(description="Identifier of the company in SEC database.") +report_date = pa.Field(description="Report date of the record.") +company_name = pa.Field( + description="Cleaned name of the company with legal terms expanded." +) +utility_id_eia = pa.Field( + description="EIA utility identifier for the company. Matched via record linkage model.", + nullable=True, + coerce=True, +) +street_address = pa.Field(description="Street address of the company.", nullable=True) +street_address_2 = pa.Field( + description="Secondary street address of the company.", nullable=True +) +phone_number = pa.Field(description="Phone number of company.", nullable=True) +city = pa.Field(description="The city where the company is located.", nullable=True) +state = pa.Field( + description="Two letter state code where the company is located.", nullable=True +) +state_of_incorporation = pa.Field( + description="Two letter state code where the company is located.", nullable=True +) +zip_code = pa.Field( + description="5 digit zip code where the company is located.", nullable=True +) +company_name_raw = pa.Field(description="The raw company name.", nullable=True) +date_of_name_change = pa.Field( + description="Date of last name change of the company.", nullable=True +) +former_conformed_name = pa.Field(description="Former name of the company.") +standard_industrial_classification = pa.Field( + description="The company's type of business." +) +location_of_inc = pa.Field( + description="Cleaned location of incorporation of the company.", nullable=True +) +company_name_no_legal = pa.Field( + description="Company name with legal terms stripped, e.g. LLC", nullable=True +) +company_name_mphone = pa.Field( + description="Metaphone of the company name, could be used for record linkage." 
+) +irs_number = pa.Field(description="ID of the company with the IRS.") +files_10k = pa.Field(description="Indicates whether the company files a 10-K.") + class Ex21CompanyOwnership(pa.DataFrameModel): """Define table structure for extracted EX 21 data.""" @@ -24,7 +71,7 @@ class Ex21CompanyOwnership(pa.DataFrameModel): class Basic10kCompanyInfo(pa.DataFrameModel): """Define table structure for extracted basic 10k data.""" - filename: Index[str] = pa.Field(description="Name of extracted filing.") + filename: Index[str] = filename filer_count: Index[str] = pa.Field( description="Some filings have multiple blocks of company data." ) @@ -44,7 +91,7 @@ class Config: class Sec10kExtractionMetadata(pa.DataFrameModel): """Define table structure extraction metadata.""" - filename: Index[str] = pa.Field(description="Name of extracted filing.") + filename: Index[str] = filename success: Series[bool] = pa.Field( description="Indicates whether filing was successfully extracted.", coerce=True ) @@ -56,7 +103,7 @@ class Sec10kExtractionMetadata(pa.DataFrameModel): class Ex21Layout(pa.DataFrameModel): """Define table structure for ex21 layout classification.""" - filename: Index[str] = pa.Field(description="Name of extracted filing.") + filename: Index[str] = filename paragraph: Series[bool] = pa.Field( description="Indicates whether ex21 is formatted as a paragraph or not.", coerce=True, @@ -66,67 +113,51 @@ class Ex21Layout(pa.DataFrameModel): class Sec10kCoreTable(pa.DataFrameModel): """Define table structure for core SEC companies table.""" - sec_company_id: Series[str] = pa.Field( - description="Assigned identifier for the company." - ) - filename: Series[str] = pa.Field(description="Name of extracted filing.") - central_index_key: Series[str] = pa.Field( - description="Identifier of the company in SEC database." - ) - report_date: Series[pa.DateTime] = pa.Field( - description="Report date of the record." 
- ) - company_name: Series[str] = pa.Field( - description="Cleaned name of the company with legal terms expanded." - ) - utility_id_eia: Series[int] = pa.Field( - description="EIA utility identifier for the company. Matched via record linkage model.", - nullable=True, - ) - street_address: Series[str] = pa.Field( - description="Street address of the company.", nullable=True - ) - street_address_2: Series[str] = pa.Field( - description="Secondary street address of the company.", nullable=True - ) - phone_number: Series[str] = pa.Field( - description="Phone number of company.", nullable=True - ) - city: Series[str] = pa.Field( - description="The city where the company is located.", nullable=True - ) - state: Series[str] = pa.Field( - description="Two letter state code where the company is located.", nullable=True - ) - state_of_incorporation: Series[str] = pa.Field( - description="Two letter state code where the company is located.", nullable=True - ) - zip_code: Series[str] = pa.Field( - description="5 digit zip code where the company is located.", nullable=True - ) - company_name_raw: Series[str] = pa.Field( - description="The raw company name.", nullable=True - ) - date_of_name_change: Series[pa.DateTime] = pa.Field( - description="Date of last name change of the company.", nullable=True - ) - location_of_inc: Series[str] = pa.Field( - description="Cleaned location of incorporation of the company.", nullable=True - ) - company_name_no_legal: Series[str] = pa.Field( - description="Company name with legal terms stripped, e.g. LLC", nullable=True - ) - company_name_mphone: Series[str] = pa.Field( - description="Metaphone of the company name, could used for record linkage." - ) - files_10k: Series[bool] = pa.Field( - description="Indicates whether the company files a 10-K." 
- ) + sec_company_id: Series[str] = sec_company_id + company_name: Series[str] = company_name + filename: Series[str] = filename + report_date: Series[pa.DateTime] = report_date + central_index_key: Series[str] = central_index_key + utility_id_eia: Series[int] = utility_id_eia + street_address: Series[str] = street_address + street_address_2: Series[str] = street_address_2 + phone_number: Series[str] = phone_number + city: Series[str] = city + state: Series[str] = state + state_of_incorporation: Series[str] = state_of_incorporation + zip_code: Series[str] = zip_code + company_name_raw: Series[str] = company_name_raw + company_name_no_legal: Series[str] = company_name_no_legal + company_name_mphone: Series[str] = company_name_mphone + date_of_name_change: Series[pa.DateTime] = date_of_name_change + former_conformed_name: Series[str] = former_conformed_name + standard_industrial_classification: Series[str] = standard_industrial_classification + location_of_inc: Series[str] = location_of_inc + irs_number: Series[str] = irs_number + files_10k: Series[bool] = files_10k class Sec10kOutputTable(pa.DataFrameModel): """Define table structure for output parents and subsidiaries table.""" + sec_company_id: Series[str] = sec_company_id + company_name: Series[str] = company_name + filename: Series[str] = filename + report_date: Series[pa.DateTime] = report_date + central_index_key: Series[str] = central_index_key + utility_id_eia: Series[int] = utility_id_eia + street_address: Series[str] = street_address + street_address_2: Series[str] = street_address_2 + city: Series[str] = city + state: Series[str] = state + company_name_raw: Series[str] = company_name_raw + date_of_name_change: Series[pa.DateTime] = date_of_name_change + former_conformed_name: Series[str] = former_conformed_name + standard_industrial_classification: Series[str] = standard_industrial_classification + state_of_incorporation: Series[str] = state_of_incorporation + location_of_inc: Series[str] = 
location_of_inc + irs_number: Series[str] = irs_number + files_10k: Series[bool] = files_10k parent_company_cik: Series[str] = pa.Field( description="CIK of the company's parent company.", nullable=True ) @@ -139,44 +170,21 @@ class Sec10kOutputTable(pa.DataFrameModel): class EiaCompanies(pa.DataFrameModel): """Define table structure for EIA owner and operator companies table.""" - company_name: Series[str] = pa.Field( - description="Cleaned name of the owner or operator company with legal terms expanded." - ) - street_address: Series[str] = pa.Field( - description="Street address of the company.", nullable=True - ) - street_address_2: Series[str] = pa.Field( - description="Secondary street address of the company.", nullable=True - ) - utility_id_eia: Series[int] = pa.Field( - description="EIA utility identifier for the company.", coerce=True - ) - company_name_raw: Series[str] = pa.Field(description="The raw company name.") - # TODO: What type for type expression? - report_date: Series[pa.DateTime] = pa.Field( - description="Report date of the record." 
- ) + company_name: Series[str] = company_name + street_address: Series[str] = street_address + street_address_2: Series[str] = street_address_2 + utility_id_eia: Series[int] = utility_id_eia + company_name_raw: Series[str] = company_name_raw + report_date: Series[pa.DateTime] = report_date report_year: Series[int] = pa.Field( description="Report year of the record.", coerce=True ) - city: Series[str] = pa.Field( - description="The city where the company is located.", nullable=True - ) - state: Series[str] = pa.Field( - description="Two letter state code where the company is located.", nullable=True - ) - zip_code: Series[str] = pa.Field( - description="5 digit zip code where the company is located.", nullable=True - ) - phone_number: Series[str] = pa.Field( - description="Phone number of company.", nullable=True - ) - company_name_no_legal: Series[str] = pa.Field( - description="Company name with legal terms stripped, e.g. LLC", nullable=True - ) - company_name_mphone: Series[str] = pa.Field( - description="Metaphone of the company name, could used for record linkage." 
- ) + city: Series[str] = city + state: Series[str] = state + zip_code: Series[str] = zip_code + phone_number: Series[str] = phone_number + company_name_no_legal: Series[str] = company_name_no_legal + company_name_mphone: Series[str] = company_name_mphone ex21_extract_type = pandera_schema_to_dagster_type(Ex21CompanyOwnership) From ef32facc043b835d6383eedbe46d1f0a559a42f0 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Tue, 21 Jan 2025 13:30:55 -0800 Subject: [PATCH 15/16] fix type checks --- src/mozilla_sec_eia/models/sec10k/entities.py | 31 ++++++++++--------- .../sec10k/ex_21/ex21_validation_helpers.py | 4 +++ .../transform_sec_input.py | 12 ++++++- .../sec_eia_validation_set.csv | 25 +++++++++++++++ 4 files changed, 57 insertions(+), 15 deletions(-) create mode 100644 src/mozilla_sec_eia/package_data/validation_data/sec_eia_validation_set.csv diff --git a/src/mozilla_sec_eia/models/sec10k/entities.py b/src/mozilla_sec_eia/models/sec10k/entities.py index 3ad387b..d18222c 100644 --- a/src/mozilla_sec_eia/models/sec10k/entities.py +++ b/src/mozilla_sec_eia/models/sec10k/entities.py @@ -35,9 +35,11 @@ date_of_name_change = pa.Field( description="Date of last name change of the company.", nullable=True ) -former_conformed_name = pa.Field(description="Former name of the company.") +former_conformed_name = pa.Field( + description="Former name of the company.", nullable=True +) standard_industrial_classification = pa.Field( - description="The company's type of business." + description="The company's type of business.", nullable=True ) location_of_inc = pa.Field( description="Cleaned location of incorporation of the company.", nullable=True @@ -48,8 +50,14 @@ company_name_mphone = pa.Field( description="Metaphone of the company name, could be used for record linkage." 
) -irs_number = pa.Field(description="ID of the company with the IRS.") +irs_number = pa.Field(description="ID of the company with the IRS.", nullable=True) files_10k = pa.Field(description="Indicates whether the company files a 10-K.") +#: Use str to avoid conversion errors +own_per = pa.Field( + description="Parent company's percent ownership of the company. String extracted from SEC 10-K Ex. 21 attachment.", + nullable=True, + coerce=True, +) class Ex21CompanyOwnership(pa.DataFrameModel): @@ -61,11 +69,7 @@ class Ex21CompanyOwnership(pa.DataFrameModel): description="Location of subsidiary company.", nullable=True ) #: Use str to avoid conversion errors - own_per: Series[str] = pa.Field( - description="Percent ownership of subsidiary company.", - nullable=True, - coerce=True, - ) + own_per: Series[str] = own_per class Basic10kCompanyInfo(pa.DataFrameModel): @@ -144,8 +148,10 @@ class Sec10kOutputTable(pa.DataFrameModel): company_name: Series[str] = company_name filename: Series[str] = filename report_date: Series[pa.DateTime] = report_date - central_index_key: Series[str] = central_index_key - utility_id_eia: Series[int] = utility_id_eia + central_index_key: Series[str] = pa.Field( + description="Identifier of the company in SEC database.", nullable=True + ) + utility_id_eia: Series[float] = utility_id_eia street_address: Series[str] = street_address street_address_2: Series[str] = street_address_2 city: Series[str] = city @@ -161,10 +167,7 @@ class Sec10kOutputTable(pa.DataFrameModel): parent_company_cik: Series[str] = pa.Field( description="CIK of the company's parent company.", nullable=True ) - own_per: Series[float] = pa.Field( - description="Parent company's ownership percentage of the company.", - nullable=True, - ) + own_per: Series[str] = own_per class EiaCompanies(pa.DataFrameModel): diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py b/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py index 002a027..edc562b 
100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py @@ -107,6 +107,10 @@ def clean_extracted_df(extracted_df): r"[^a-zA-Z&,\s]", "", regex=True ) if "own_per" in extracted_df.columns: + # enforce single decimal points + extracted_df["own_per"] = extracted_df["own_per"].str.replace( + r"\.+", ".", regex=True + ) # remove special chars and letters extracted_df["own_per"] = extracted_df["own_per"].str.replace( r"[^\d.]", "", regex=True diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index 930a29d..a8550cd 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -406,7 +406,17 @@ def out_sec_10k__parents_and_subsidiaries( logger.info( f"Ex. 21 subsidiary names matched to an EIA utility name: {len(ex21_non_filing_subs_df["utility_id_eia"].unique())}" ) - out_df = pd.concat([sec_10k_filers_matched_df, ex21_non_filing_subs_df]) + out_df = pd.concat( + [sec_10k_filers_matched_df, ex21_non_filing_subs_df] + ).reset_index(drop=True) + out_df = out_df.astype( + { + "report_date": "datetime64[ns]", + "utility_id_eia": "int64", + "date_of_name_change": "datetime64[ns]", + }, + errors="ignore", + ) return out_df diff --git a/src/mozilla_sec_eia/package_data/validation_data/sec_eia_validation_set.csv b/src/mozilla_sec_eia/package_data/validation_data/sec_eia_validation_set.csv new file mode 100644 index 0000000..01f12b5 --- /dev/null +++ b/src/mozilla_sec_eia/package_data/validation_data/sec_eia_validation_set.csv @@ -0,0 +1,25 @@ +central_index_key,utility_id_eia,sec_company_name,eia_company_name,match +3153,195,alabama power co,,1 +1868941,58702,"fluence energy, inc.",Fluence,0 +41091,7140,georgia power co,,1 +22198,4062,columbus southern power co 
/oh/,Columbus Southern Power Co,1 +1326160,5416,duke energy corp,,1 +30371,54905,"duke energy carolinas, llc",Duke Energy Carolinas LLC,1 +869446,57140,berkshire realty co inc /de,Berkshire Wind Power Cooperative Corp,0 +92122,18195,southern co,southern co services inc,0 +92122,17650,southern co,Southern Power Co,0 +75488,14328,pacific gas & electric co,,1 +1031296,6526,firstenergy corp,FirstEnergy,0 +1031296,54776,firstenergy corp,FirstEnergy Nuclear Generation Corp,0 +1031296,6458,firstenergy corp,First Energy Services,0 +1031296,32208,firstenergy corp,First Energy Corp,1 +100122,24211,tucson electric power co,,1 +96271,18454,tampa electric co,,1 +715957,5248,"dominion energy, inc",,1 +1013871,59883,"nrg energy, inc",NRG Energy Gas & Wind Holdings Inc,0 +1013871,13377,nrg energy inc,NRG Energy Inc,1 +788816,13994,oglethorpe power corp,,1 +18675,3266,central maine power co,,1 +1032208,61296,sempra energy,Sempra Generation,1 +4904,488,american electric power co inc,American Electric Power Inc,1 +715957,5248,"dominion energy, inc",Dominion Energy Inc.,1 From 678a54d20f4792c829d61a12ea7d2be5f363f85b Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 23 Jan 2025 10:32:25 -0500 Subject: [PATCH 16/16] Fix metadata concat --- .../sec_eia_record_linkage/transform_sec_input.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index a8550cd..0ee868d 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -73,9 +73,8 @@ def _add_report_year_to_sec(sec_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFr """ sec_df = sec_df.merge(md[["filename", "date_filed"]], how="left", on=["filename"]) sec_df = sec_df.rename(columns={"date_filed": "report_date"}) - sec_df.loc[:, "report_year"] = ( - 
sec_df["report_date"].astype("datetime64[ns]").dt.year - ) + sec_df = sec_df.astype({"report_date": "datetime64[ns]"}) + sec_df.loc[:, "report_year"] = sec_df["report_date"].dt.year return sec_df @@ -264,7 +263,9 @@ def transformed_ex21_subsidiary_table( ) -> pd.DataFrame: """Transform Ex. 21 table of subsidiaries before combining with basic 10k table.""" ex21_df = pd.concat(ex21_dfs.values()) - sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values()) + sec10k_filing_metadata = pd.concat( + [df.reset_index() for df in sec10k_filing_metadata_dfs.values()] + ) ex21_df.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_df) ex21_df = ex21_df.drop(columns=["id"]) @@ -337,7 +338,9 @@ def transformed_basic_10k( to EIA utilities. """ basic_10k_df = pd.concat(basic_10k_dfs.values()) - sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values()) + sec10k_filing_metadata = pd.concat( + [df.reset_index() for df in sec10k_filing_metadata_dfs.values()] + ) basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata) out_df = basic_10k_df.fillna(np.nan).reset_index(names="record_id")