diff --git a/examples/atm/atm_clinvar.tsv b/examples/atm/atm_clinvar.tsv new file mode 100644 index 0000000..42f2999 --- /dev/null +++ b/examples/atm/atm_clinvar.tsv @@ -0,0 +1,35 @@ +rsid gene chromosome position ref alt clnrevstat clnsig clnvc +rs786203606 ATM 11 108227626 T C reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs746235533 ATM 11 108227691 C T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs747855862 ATM 11 108235669 G A reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs876658159 ATM 11 108235805 G A reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs772821016 ATM 11 108244873 C T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs876660485 ATM 11 108250861 C T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs1555070980 ATM 11 108250907 T G reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs772926890 ATM 11 108251073 G T reviewed_by_expert_panel Pathogenic single_nucleotide_variant + ATM 11 108257479 A T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs780619951 ATM 11 108259022 C T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs750663117 ATM 11 108272531 G A reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs780240314 ATM 11 108272556 T G reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs1060501687 ATM 11 108272782 G T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs786201957 ATM 11 108279555 C T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs587776551 ATM 11 108281168 G A reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs1060501551 ATM 11 108289683 A T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs1131691159 ATM 11 108299886 G C reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs587779844 ATM 11 108301698 C T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs1060501571 ATM 11 108301706 G 
A reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs786204751 ATM 11 108304693 C T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs775036118 ATM 11 108307914 C T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs774925473 ATM 11 108309110 A G reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs2136011029 ATM 11 108310159 G C reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs1591789046 ATM 11 108317386 T A reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs904589402 ATM 11 108319978 C G reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs587780638 ATM 11 108325544 G A reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs587782403 ATM 11 108327643 A C reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs28904921 ATM 11 108329202 T G reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs371638537 ATM 11 108335959 A T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs778269655 ATM 11 108343260 G A reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs587781363 ATM 11 108345797 C T reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs1060501700 ATM 11 108347277 A C reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs17174393 ATM 11 108353881 G A reviewed_by_expert_panel Pathogenic single_nucleotide_variant +rs121434219 ATM 11 108365476 C T reviewed_by_expert_panel Pathogenic single_nucleotide_variant diff --git a/examples/atm/atm_dev.ipynb b/examples/atm/atm_dev.ipynb new file mode 100644 index 0000000..a7b1e77 --- /dev/null +++ b/examples/atm/atm_dev.ipynb @@ -0,0 +1,531 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ATM\n", + "\n", + "This notebook shows how to develop a classifier with embedded tests in Jupyter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from bioscript import optional_int, optional_str, write_tsv\n", + "from bioscript.classifier import GenotypeClassifier\n", + "from bioscript.types import VariantCall\n", + "from bioscript import assets_dir" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "ASSETS_DIR = assets_dir()\n", + "RESULT_HEADERS = [\n", + " \"participant_id\",\n", + " \"filename\",\n", + " \"gene\",\n", + " \"rsid\",\n", + " \"chromosome\",\n", + " \"position\",\n", + " \"genotype\",\n", + " \"ref\",\n", + " \"alt\",\n", + " \"variant_type\",\n", + " \"match_type\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:\n", + " \"\"\"Generate VariantCall objects from ClinVar DataFrame.\"\"\"\n", + " vcs: list[VariantCall] = []\n", + " for _, row in df.iterrows():\n", + " vcs.append(\n", + " VariantCall(\n", + " rsid=optional_str(row[\"rsid\"]),\n", + " ref=optional_str(row[\"ref\"]),\n", + " alt=optional_str(row[\"alt\"]),\n", + " chromosome=optional_str(row[\"chromosome\"]),\n", + " position=optional_int(row[\"position\"]),\n", + " gene=optional_str(row.get(\"gene\"), upper=True),\n", + " )\n", + " )\n", + " return vcs" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def get_vcs() -> list[VariantCall]:\n", + " \"\"\"Load ATM variant calls from ClinVar TSV files.\"\"\"\n", + " data_files = [ASSETS_DIR / name for name in [\"atm_clinvar.tsv\"]]\n", + " dfs = [pd.read_csv(f, sep=\"\\t\") for f in data_files]\n", + " df = pd.concat(dfs, ignore_index=True)\n", + " print(f\"Loaded {len(df)} variants from ATM and ATM\")\n", + " return generate_variant_calls(df)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "class ATMClassifier(GenotypeClassifier):\n", + " def classify(self, matches):\n", + " \"\"\"Classify ATM variants and write results to TSV files.\"\"\"\n", + " if not matches.all_matches:\n", + " print(\"No variant matches were found.\", flush=True)\n", + "\n", + " # Get categorized matches as report rows\n", + " ref_rows, var_rows, no_rows = matches.categorize_report_rows(\n", + " self.participant_id, self.filename\n", + " )\n", + "\n", + " if self.debug:\n", + " write_tsv(f\"{self.output_basename}_ref.tsv\", ref_rows)\n", + " write_tsv(f\"{self.output_basename}_no.tsv\", no_rows)\n", + "\n", + " write_tsv(f\"{self.output_basename}.tsv\", var_rows, headers=RESULT_HEADERS)\n", + " \n", + " # Return variant rows for testing\n", + " return var_rows" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "__bioscript__ = {\n", + " \"variant_calls\": get_vcs,\n", + " \"classifier\": ATMClassifier,\n", + " \"name\": \"ATM\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tests\n", + "\n", + "Write tests using the test_* function convention:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# from bioscript import VariantFixture\n", + "# from bioscript.types import MatchList\n", + "# import os\n", + "\n", + "# # Create test fixtures for BRCA1 and BRCA2 variants\n", + "# fixture = VariantFixture(\n", + "# [\n", + "# {\"rsid\": \"rs80357336\", \"chromosome\": \"17\", \"position\": 43045711},\n", + "# {\"rsid\": \"rs886040303\", \"chromosome\": \"17\", \"position\": 43045728},\n", + "# {\"rsid\": \"rs397509295\", \"chromosome\": \"17\", \"position\": 43045729},\n", + "# {\"rsid\": \"rs80358650\", \"chromosome\": \"13\", \"position\": 32316463},\n", + "# {\"rsid\": \"rs397507571\", \"chromosome\": \"13\", \"position\": 32316470},\n", + "# 
{\"rsid\": \"rs80358622\", \"chromosome\": \"13\", \"position\": 32316497},\n", + "# ],\n", + "# assembly=\"GRCh38\",\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# def test_brca1_heterozygous_variants():\n", + "# \"\"\"Test detection of heterozygous BRCA1 variants.\"\"\"\n", + "# # Create test data with heterozygous variants (one alt allele)\n", + "# variants = fixture([\"GC\", \"GA\", \"GT\", \"GG\", \"GG\", \"GG\"])\n", + " \n", + "# # Create mini variant call list for testing\n", + "# test_vcs = [\n", + "# VariantCall(rsid=\"rs80357336\", ref=\"G\", alt=\"C\", chromosome=\"17\", position=43045711, gene=\"BRCA1\"),\n", + "# VariantCall(rsid=\"rs886040303\", ref=\"G\", alt=\"A\", chromosome=\"17\", position=43045728, gene=\"BRCA1\"),\n", + "# VariantCall(rsid=\"rs397509295\", ref=\"G\", alt=\"T\", chromosome=\"17\", position=43045729, gene=\"BRCA1\"),\n", + "# ]\n", + " \n", + "# matches = MatchList(variant_calls=test_vcs).match_rows(variants)\n", + "# classifier = BRCAClassifier(participant_id=\"TEST_HET\", name=\"BRCA\", filename=\"test.txt\")\n", + "# result = classifier(matches)\n", + " \n", + "# assert len(result) == 3, f\"Expected 3 variant rows, got {len(result)}\"\n", + "# assert all(row[\"gene\"] == \"BRCA1\" for row in result), \"All variants should be BRCA1\"\n", + "# assert all(row[\"match_type\"] == \"VARIANT_CALL\" for row in result), \"All should be variant calls\"\n", + " \n", + "# # Cleanup output file\n", + "# os.remove(\"result_BRCA_TEST_HET.tsv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# def test_brca2_homozygous_variant():\n", + "# \"\"\"Test detection of homozygous BRCA2 variant.\"\"\"\n", + "# # Create test data with one homozygous variant (two alt alleles)\n", + "# variants = fixture([\"GG\", \"GG\", \"GG\", \"AA\", \"GG\", \"GG\"])\n", + " \n", + "# test_vcs = [\n", + "# 
VariantCall(rsid=\"rs80358650\", ref=\"G\", alt=\"A\", chromosome=\"13\", position=32316463, gene=\"BRCA2\"),\n", + "# ]\n", + "\n", + "# matches = MatchList(variant_calls=test_vcs).match_rows(variants)\n", + "# classifier = BRCAClassifier(participant_id=\"TEST_HOM\", name=\"BRCA\", filename=\"test.txt\")\n", + "# result = classifier(matches)\n", + " \n", + "# assert len(result) == 1, f\"Expected 1 variant row, got {len(result)}\"\n", + "# assert result[0][\"gene\"] == \"BRCA2\", \"Variant should be BRCA2\"\n", + "# assert result[0][\"genotype\"] == \"AA\", \"Should be homozygous AA\"\n", + " \n", + "# # Cleanup output file\n", + "# os.remove(\"result_BRCA_TEST_HOM.tsv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# def test_no_variants():\n", + "# \"\"\"Test classifier with no matching variants.\"\"\"\n", + "# # All reference genotypes\n", + "# variants = fixture([\"GG\", \"GG\", \"GG\", \"GG\", \"GG\", \"GG\"])\n", + " \n", + "# test_vcs = [\n", + "# VariantCall(rsid=\"rs80357336\", ref=\"G\", alt=\"C\", chromosome=\"17\", position=43045711, gene=\"BRCA1\"),\n", + "# ]\n", + " \n", + "# matches = MatchList(variant_calls=test_vcs).match_rows(variants)\n", + "# classifier = BRCAClassifier(participant_id=\"TEST_REF\", name=\"BRCA\", filename=\"test.txt\")\n", + "# result = classifier(matches)\n", + " \n", + "# assert len(result) == 0, f\"Expected 0 variant rows, got {len(result)}\"\n", + " \n", + "# # Cleanup output file\n", + "# os.remove(\"result_BRCA_TEST_REF.tsv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Tests in Jupyter\n", + "\n", + "You can run tests directly in the notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# # Run tests\n", + "# test_brca1_heterozygous_variants()\n", + "# test_brca2_homozygous_variant()\n", + "# test_no_variants()\n", + "# print(\"✓ All tests 
passed!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export to Python Module\n", + "\n", + "Export this notebook to a Python file:\n", + "\n", + "```bash\n", + "bioscript export atm_dev.ipynb -o classify_atm.py\n", + "```\n", + "\n", + "Or in Python:\n", + "\n", + "```python\n", + "from bioscript import export_from_notebook\n", + "export_from_notebook(\"atm_dev.ipynb\", \"classify_atm.py\")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('classify_atm.py')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bioscript import export_from_notebook\n", + "export_from_notebook(\"atm_dev.ipynb\", \"classify_atm.py\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "Testing: classify_atm.py\n", + "============================================================\n", + "Running tests with pytest: classify_atm.py\n", + "\u001b[1m============================= test session starts ==============================\u001b[0m\n", + "platform darwin -- Python 3.12.7, pytest-8.4.2, pluggy-1.6.0 -- /Users/madhavajay/dev/bioscript/workspace1/.venv/bin/python3\n", + "cachedir: .pytest_cache\n", + "rootdir: /Users/madhavajay/dev/bioscript/workspace1/examples/atm\n", + "plugins: anyio-4.11.0\n", + "collected 0 items \u001b[0m\n", + "\n", + "\u001b[33m============================ \u001b[33mno tests ran\u001b[0m\u001b[33m in 0.03s\u001b[0m\u001b[33m =============================\u001b[0m\n" + ] + } + ], + "source": [ + "!bioscript test classify_atm.py" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"BioVaultProject(name='atm-classifier', author='madhava@openmined.org', workflow='workflow.nf', template=, version='0.1.0', assets=['classify_atm.py', 'atm_clinvar.tsv'], parameters=[], inputs=[Input(name='participants', type='List[GenotypeRecord]', description='CSV/TSV with participant_id and genotype_file columns', format='csv', path=None, mapping={'participant_id': 'participant_id', 'genotype_file': 'genotype_file'}, cli_flag=None)], outputs=[Output(name='classification_result', type='File', description='ATM variant classification (aggregated)', format='tsv', path='result_ATM.tsv', cli_flag=None)], processes=[ProcessDefinition(name='atm_classifier', script='classify_atm.py', container='ghcr.io/openmined/bioscript:0.1.4', kind='bioscript')], docker_image='ghcr.io/openmined/bioscript:0.1.4', docker_platform='linux/amd64')" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bioscript import export_bioscript_workflow\n", + "\n", + "project = export_bioscript_workflow(\n", + " script_path='./classify_atm.py',\n", + " workflow_name='atm-classifier',\n", + " author='madhava@openmined.org',\n", + " target_dir='./',\n", + " assets={\n", + " \"atm_clinvar.tsv\",\n", + " },\n", + " inputs=[\n", + " {\n", + " 'name': 'participants',\n", + " 'type': 'List[GenotypeRecord]',\n", + " 'description': 'CSV/TSV with participant_id and genotype_file columns',\n", + " 'format': 'csv',\n", + " 'mapping': {\n", + " 'participant_id': 'participant_id',\n", + " 'genotype_file': 'genotype_file',\n", + " }\n", + " }\n", + " ],\n", + " outputs=[\n", + " {\n", + " 'name': 'classification_result',\n", + " 'type': 'File',\n", + " 'description': 'ATM variant classification (aggregated)',\n", + " 'format': 'tsv',\n", + " 'path': 'result_ATM.tsv',\n", + " },\n", + " ],\n", + ")\n", + "project\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"BioVaultPipeline(name='atm-classifier', inputs={'samplesheet': 'List[GenotypeRecord]'}, steps=[PipelineStep(step_id='atm', uses='./', with_args={'participants': 'inputs.samplesheet'}, publish={'classification_result': 'File(result_ATM.tsv)'}, store={'counts_sql': SQLStore(source='classification_result', table_name='atm_{run_id}', destination='SQL()', participant_column='participant_id', key_column='participant_id')})])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bioscript import export_bioscript_pipeline, PipelineStep, SQLStore\n", + "\n", + "pipeline = export_bioscript_pipeline(\n", + " pipeline_name='atm-classifier',\n", + " target_dir='./atm-classifier',\n", + " inputs={\n", + " 'samplesheet': 'List[GenotypeRecord]',\n", + " },\n", + " steps=[\n", + " PipelineStep(\n", + " step_id='atm',\n", + " uses='./',\n", + " with_args={\n", + " 'participants': 'inputs.samplesheet',\n", + " },\n", + " publish={\n", + " 'classification_result': 'File(result_ATM.tsv)',\n", + " },\n", + " store={\n", + " 'counts_sql': SQLStore(\n", + " source='classification_result',\n", + " table_name='atm_{run_id}',\n", + " destination='SQL()',\n", + " key_column='participant_id',\n", + " ),\n", + " },\n", + " ),\n", + " ],\n", + ")\n", + "pipeline\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[bioscript] Current working directory: /Users/madhavajay/dev/bioscript/workspace1/examples/atm\n", + "[bioscript] Provided SNP file argument: carika.txt\n", + "[bioscript] Provided path absolute? False\n", + "[bioscript] Resolved SNP path: /Users/madhavajay/dev/bioscript/workspace1/examples/atm/carika.txt\n", + "[bioscript] Resolved exists? 
True\n", + "[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, __pycache__, atm-classifier, atm_clinvar.tsv, atm_dev.ipynb, carika.txt, classify_atm.py, result_ATM_X.tsv, result_ATM_X_no.tsv, result_ATM_X_ref.tsv\n", + "[bioscript] Using resolved SNP path: /Users/madhavajay/dev/bioscript/workspace1/examples/atm/carika.txt\n", + "Loaded 34 variants from ATM and ATM\n", + "participant_id=X\n", + "ATM_count=0\n" + ] + } + ], + "source": [ + "!bioscript classify classify_atm.py --file carika.txt --participant_id=\"X\" --debug" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[36m__pycache__\u001b[m\u001b[m \u001b[1m\u001b[36matm-classifier\u001b[m\u001b[m result_ATM_X_no.tsv\n", + "atm_clinvar.tsv carika.txt result_ATM_X_ref.tsv\n", + "atm_dev.ipynb classify_atm.py result_ATM_X.tsv\n" + ] + } + ], + "source": [ + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "participant_id\tfilename\tgene\trsid\tchromosome\tposition\tgenotype\tref\talt\tvariant_type\tmatch_type\n" + ] + } + ], + "source": [ + "!cat result_ATM_X.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/2_filter.ipynb 
b/notebooks/2_filter.ipynb index 2e50405..52178ee 100644 --- a/notebooks/2_filter.ipynb +++ b/notebooks/2_filter.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "cd584ffd-b0ad-45d8-9c41-ea667b4f11bf", "metadata": {}, "outputs": [], @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "4985decc-0576-4762-b760-f19daad67855", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "f3ea7e37-bcc4-4b07-a778-679669044935", "metadata": {}, "outputs": [], @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "42774c1a-7fc8-414a-a7e8-04eed400329a", "metadata": {}, "outputs": [], @@ -91,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "0299df2b-bfab-4333-98cb-007f35b94820", "metadata": {}, "outputs": [], @@ -146,10 +146,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "1537a3b3-c7ed-4051-915d-5a6b30fe59b7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '6' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '7' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '11' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '12' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '13' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '14' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '15' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '16' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '17' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '18' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '20' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '21' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig '22' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'X' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CLNSIG unique values: ['Affects', 'Benign', 'Benign/Likely_benign', 'Conflicting_classifications_of_pathogenicity', 'Established_risk_allele', 'Likely_benign', 'Likely_pathogenic', 'Likely_pathogenic,_low_penetrance', 'Likely_pathogenic/Likely_pathogenic,_low_penetrance', 'Likely_pathogenic/Likely_risk_allele', 'Likely_pathogenic/Pathogenic,_low_penetrance', 'Likely_risk_allele', 'Pathogenic', 'Pathogenic,_low_penetrance', 'Pathogenic/Likely_pathogenic', 'Pathogenic/Likely_pathogenic,_low_penetrance', 'Pathogenic/Likely_pathogenic/Likely_risk_allele', 'Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance', 'Pathogenic/Likely_risk_allele', 'Pathogenic/Pathogenic,_low_penetrance', 'Uncertain_risk_allele', 'Uncertain_significance', 'Uncertain_significance/Uncertain_risk_allele', 'association', 'association_not_found', 'confers_sensitivity', 'drug_response', 'no_classification_for_the_single_variant', 'no_classifications_from_unflagged_records', 'not_provided', 'other', 'protective', 'risk_factor']\n", + "CLNREVSTAT unique values: ['criteria_provided,_conflicting_classifications', 'criteria_provided,_multiple_submitters,_no_conflicts', 'criteria_provided,_single_submitter', 'no_assertion_criteria_provided', 'no_classification_for_the_single_variant', 'no_classification_provided', 'no_classifications_from_unflagged_records', 'practice_guideline', 'reviewed_by_expert_panel']\n", + "CLNVC unique values: ['Deletion', 'Duplication', 'Indel', 'Insertion', 'Inversion', 'Microsatellite', 'Variation', 'single_nucleotide_variant']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[W::vcf_parse] Contig 'Y' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'MT' is not defined in the header. 
(Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_113889.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187633.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187661.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NT_187693.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n", + "[W::vcf_parse] Contig 'NW_009646201.1' is not defined in the header. (Quick workaround: index the file with tabix.)\n" + ] + } + ], "source": [ "clnsigs, revstats, clnvc_values = collect_unique_clnsig_revstat_clnvc(f\"{DOWNLOAD_PATH}/clinvar.vcf\")\n", "\n", @@ -168,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "6d7f45d8-eb18-4afe-93d7-04d62738e4a8", "metadata": {}, "outputs": [], @@ -208,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "4b0bf4da-cd44-4176-bca6-58ba022f461e", "metadata": {}, "outputs": [], @@ -228,7 +280,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "c815a94a-9e21-4e54-a300-6150f7f6f033", "metadata": {}, "outputs": [], @@ -304,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "2f7eed10-8733-49fa-a321-3e3eaa390f37", "metadata": {}, "outputs": [], @@ -327,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "64703926-b7ab-43da-a154-b0d8f20ee4e9", "metadata": {}, "outputs": [], @@ -439,10 +491,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "413c2131-f9d4-4899-8826-fd96776c4404", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 503 BRCA1 variants\n" + ] + } + ], "source": [ "df_brca1 = 
clinvar_df_for_gene_filtered(\n", " vcf_path=vcf_path,\n", @@ -456,10 +516,276 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "a8042731-20a6-4663-ab28-2802f37081de", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSIDREFALTRSRS_prefixedCLNSIGGENEINFOCLNVCCLNREVSTATORIGINALLELEID
0174304571155630GC80357336rs80357336PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170297
1174304571155629GT80357336rs80357336PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170296
21743045728266562GA886040303rs886040303PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel1261566
3174304572955622GT397509295rs397509295PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170289
4174304573455620GA80356873rs80356873PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170287
..........................................
498174312404255638GA397509299rs397509299PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170305
499174312404437664AG80356929rs80356929PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel146220
500174312406354902GA80357134rs80357134PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel169569
501174312408955746AC397509332rs397509332PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170413
502174312409455072CA80357475rs80357475PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel169739
\n", + "

503 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " CHROM POS ID REF ALT RS RS_prefixed CLNSIG \\\n", + "0 17 43045711 55630 G C 80357336 rs80357336 Pathogenic \n", + "1 17 43045711 55629 G T 80357336 rs80357336 Pathogenic \n", + "2 17 43045728 266562 G A 886040303 rs886040303 Pathogenic \n", + "3 17 43045729 55622 G T 397509295 rs397509295 Pathogenic \n", + "4 17 43045734 55620 G A 80356873 rs80356873 Pathogenic \n", + ".. ... ... ... .. .. ... ... ... \n", + "498 17 43124042 55638 G A 397509299 rs397509299 Pathogenic \n", + "499 17 43124044 37664 A G 80356929 rs80356929 Pathogenic \n", + "500 17 43124063 54902 G A 80357134 rs80357134 Pathogenic \n", + "501 17 43124089 55746 A C 397509332 rs397509332 Pathogenic \n", + "502 17 43124094 55072 C A 80357475 rs80357475 Pathogenic \n", + "\n", + " GENEINFO CLNVC CLNREVSTAT ORIGIN \\\n", + "0 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "1 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "2 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "3 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "4 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + ".. ... ... ... ... \n", + "498 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "499 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "500 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "501 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "502 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "\n", + " ALLELEID \n", + "0 70297 \n", + "1 70296 \n", + "2 261566 \n", + "3 70289 \n", + "4 70287 \n", + ".. ... 
\n", + "498 70305 \n", + "499 46220 \n", + "500 69569 \n", + "501 70413 \n", + "502 69739 \n", + "\n", + "[503 rows x 13 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_brca1" ] @@ -474,7 +800,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "d3bb4475-8736-4306-91b6-103feeaf3ea1", "metadata": {}, "outputs": [], @@ -519,37 +845,172 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "59746fa3-0992-4f21-b03c-8dff8c6d0ed3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLNREVSTATcount
0reviewed_by_expert_panel503
\n", + "
" + ], + "text/plain": [ + " CLNREVSTAT count\n", + "0 reviewed_by_expert_panel 503" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "count_clnrevstat(df_brca1)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "63a76827-0e22-42bb-9a7f-da227caf5597", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLNSIGcount
0Pathogenic503
\n", + "
" + ], + "text/plain": [ + " CLNSIG count\n", + "0 Pathogenic 503" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "count_clnsig(df_brca1)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "821150c0-6ee7-4950-ade6-363b2f796906", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLNVCcount
0single_nucleotide_variant503
\n", + "
" + ], + "text/plain": [ + " CLNVC count\n", + "0 single_nucleotide_variant 503" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "count_clnvc(df_brca1)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "7cceea3b-960e-47b6-b057-526ed529182d", "metadata": {}, "outputs": [], @@ -582,17 +1043,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "9cdf0988-c79f-45db-874c-fcb6fe0f6529", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== CLNREVSTAT (Review Status) ===\n", + " CLNREVSTAT count\n", + "reviewed_by_expert_panel 503\n", + "\n", + "\n", + "=== CLNSIG (Clinical Significance) ===\n", + " CLNSIG count\n", + "Pathogenic 503\n", + "\n", + "\n", + "=== CLNVC (Variant Class) ===\n", + " CLNVC count\n", + "single_nucleotide_variant 503\n", + "\n", + "\n" + ] + } + ], "source": [ "summarize_clinvar_counts(df_brca1)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "e040046b-98a0-4fc9-b4f1-b213aa10cbf9", "metadata": {}, "outputs": [], @@ -682,10 +1165,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "0055838d-c80a-4a17-9688-662e75686dc3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 503 BRCA1 variants (now including Duplications)\n" + ] + } + ], "source": [ "df_brca1 = clinvar_df_for_gene_filtered(\n", " vcf_path=vcf_path,\n", @@ -699,40 +1190,388 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "aabf5576-5f3f-408c-b5ee-13e7f6f5b7bd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSIDREFALTRSRS_prefixedCLNSIGGENEINFOCLNVCCLNREVSTATORIGINALLELEID
0174304571155630GC80357336rs80357336PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170297
1174304571155629GT80357336rs80357336PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170296
21743045728266562GA886040303rs886040303PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel1261566
3174304572955622GT397509295rs397509295PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170289
4174304573455620GA80356873rs80356873PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170287
..........................................
498174312404255638GA397509299rs397509299PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170305
499174312404437664AG80356929rs80356929PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel146220
500174312406354902GA80357134rs80357134PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel169569
501174312408955746AC397509332rs397509332PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel170413
502174312409455072CA80357475rs80357475PathogenicBRCA1:672single_nucleotide_variantreviewed_by_expert_panel169739
\n", + "

503 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " CHROM POS ID REF ALT RS RS_prefixed CLNSIG \\\n", + "0 17 43045711 55630 G C 80357336 rs80357336 Pathogenic \n", + "1 17 43045711 55629 G T 80357336 rs80357336 Pathogenic \n", + "2 17 43045728 266562 G A 886040303 rs886040303 Pathogenic \n", + "3 17 43045729 55622 G T 397509295 rs397509295 Pathogenic \n", + "4 17 43045734 55620 G A 80356873 rs80356873 Pathogenic \n", + ".. ... ... ... .. .. ... ... ... \n", + "498 17 43124042 55638 G A 397509299 rs397509299 Pathogenic \n", + "499 17 43124044 37664 A G 80356929 rs80356929 Pathogenic \n", + "500 17 43124063 54902 G A 80357134 rs80357134 Pathogenic \n", + "501 17 43124089 55746 A C 397509332 rs397509332 Pathogenic \n", + "502 17 43124094 55072 C A 80357475 rs80357475 Pathogenic \n", + "\n", + " GENEINFO CLNVC CLNREVSTAT ORIGIN \\\n", + "0 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "1 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "2 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "3 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "4 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + ".. ... ... ... ... \n", + "498 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "499 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "500 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "501 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "502 BRCA1:672 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "\n", + " ALLELEID \n", + "0 70297 \n", + "1 70296 \n", + "2 261566 \n", + "3 70289 \n", + "4 70287 \n", + ".. ... 
\n", + "498 70305 \n", + "499 46220 \n", + "500 69569 \n", + "501 70413 \n", + "502 69739 \n", + "\n", + "[503 rows x 13 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_brca1" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "6e44c741-c628-4c43-95c0-dd31a96ff754", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[clinvar] exported 503 rows to ./work/brca1_clinvar.tsv\n" + ] + } + ], "source": [ "export_df = export_clinvar_tsv(df_brca1, \"./work/brca1_clinvar.tsv\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "9ab5eeb9-571f-4eac-8ac6-86111589aecd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== CLNREVSTAT (Review Status) ===\n", + " CLNREVSTAT count\n", + "reviewed_by_expert_panel 503\n", + "\n", + "\n", + "=== CLNSIG (Clinical Significance) ===\n", + " CLNSIG count\n", + "Pathogenic 503\n", + "\n", + "\n", + "=== CLNVC (Variant Class) ===\n", + " CLNVC count\n", + "single_nucleotide_variant 503\n", + "\n", + "\n" + ] + } + ], "source": [ "summarize_clinvar_counts(df_brca1)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "83714e50-70d1-4006-9078-248229ad8340", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSIDREFALTRSRS_prefixedCLNSIGGENEINFOCLNVCCLNREVSTATORIGINALLELEID
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [CHROM, POS, ID, REF, ALT, RS, RS_prefixed, CLNSIG, GENEINFO, CLNVC, CLNREVSTAT, ORIGIN, ALLELEID]\n", + "Index: []" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# 1) Either REF or ALT has length > 1\n", "df_brca1[(df_brca1['REF'].str.len().gt(1).fillna(False)) | (df_brca1['ALT'].str.len().gt(1).fillna(False))]" @@ -740,10 +1579,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "3decd44d-44c4-44ca-a53b-071529eae15e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSIDREFALTRSRS_prefixedCLNSIGGENEINFOCLNVCCLNREVSTATORIGINALLELEID
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [CHROM, POS, ID, REF, ALT, RS, RS_prefixed, CLNSIG, GENEINFO, CLNVC, CLNREVSTAT, ORIGIN, ALLELEID]\n", + "Index: []" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# 2) Both > 1 AND the same length\n", "df_brca1[\n", @@ -763,10 +1654,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "e18f2941-4a68-47ff-829c-57aadbf43458", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 503 BRCA1 variants (now including Duplications)\n" + ] + } + ], "source": [ "df_brca2 = clinvar_df_for_gene_filtered(\n", " vcf_path=vcf_path,\n", @@ -780,20 +1679,338 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "767d36a7-a3f9-431c-a9b3-5948eecb107b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSIDREFALTRSRS_prefixedCLNSIGGENEINFOCLNVCCLNREVSTATORIGINALLELEID
0133231646351579GA80358650rs80358650PathogenicBRCA2:675single_nucleotide_variantreviewed_by_expert_panel166247
1133231647051063GT397507571rs397507571PathogenicBRCA2:675single_nucleotide_variantreviewed_by_expert_panel165731
2133231649751527GT80358622rs80358622PathogenicBRCA2:675single_nucleotide_variantreviewed_by_expert_panel166195
3133231652852161GT81002796rs81002796PathogenicBRCA2:675single_nucleotide_variantreviewed_by_expert_panel166829
4133231908052285TA397507902rs397507902PathogenicBRCA2:675single_nucleotide_variantreviewed_by_expert_panel166953
..........................................
6191332398252267170CT886040849rs886040849PathogenicBRCA2:675single_nucleotide_variantreviewed_by_expert_panel1261555
6201332398349267174TA886040852rs886040852PathogenicBRCA2:675single_nucleotide_variantreviewed_by_expert_panel1261558
621133239839652911CT80359247rs80359247PathogenicBRCA2:675single_nucleotide_variantreviewed_by_expert_panel367579
6221332398437267177CA4987049rs4987049PathogenicBRCA2:675single_nucleotide_variantreviewed_by_expert_panel1261561
623133239843752916CG4987049rs4987049PathogenicBRCA2:675single_nucleotide_variantreviewed_by_expert_panel167584
\n", + "

624 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " CHROM POS ID REF ALT RS RS_prefixed CLNSIG \\\n", + "0 13 32316463 51579 G A 80358650 rs80358650 Pathogenic \n", + "1 13 32316470 51063 G T 397507571 rs397507571 Pathogenic \n", + "2 13 32316497 51527 G T 80358622 rs80358622 Pathogenic \n", + "3 13 32316528 52161 G T 81002796 rs81002796 Pathogenic \n", + "4 13 32319080 52285 T A 397507902 rs397507902 Pathogenic \n", + ".. ... ... ... .. .. ... ... ... \n", + "619 13 32398252 267170 C T 886040849 rs886040849 Pathogenic \n", + "620 13 32398349 267174 T A 886040852 rs886040852 Pathogenic \n", + "621 13 32398396 52911 C T 80359247 rs80359247 Pathogenic \n", + "622 13 32398437 267177 C A 4987049 rs4987049 Pathogenic \n", + "623 13 32398437 52916 C G 4987049 rs4987049 Pathogenic \n", + "\n", + " GENEINFO CLNVC CLNREVSTAT ORIGIN \\\n", + "0 BRCA2:675 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "1 BRCA2:675 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "2 BRCA2:675 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "3 BRCA2:675 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "4 BRCA2:675 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + ".. ... ... ... ... \n", + "619 BRCA2:675 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "620 BRCA2:675 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "621 BRCA2:675 single_nucleotide_variant reviewed_by_expert_panel 3 \n", + "622 BRCA2:675 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "623 BRCA2:675 single_nucleotide_variant reviewed_by_expert_panel 1 \n", + "\n", + " ALLELEID \n", + "0 66247 \n", + "1 65731 \n", + "2 66195 \n", + "3 66829 \n", + "4 66953 \n", + ".. ... 
\n", + "619 261555 \n", + "620 261558 \n", + "621 67579 \n", + "622 261561 \n", + "623 67584 \n", + "\n", + "[624 rows x 13 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_brca2" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "c7e6ef31-0dd9-4dfa-a456-68607b33015b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSIDREFALTRSRS_prefixedCLNSIGGENEINFOCLNVCCLNREVSTATORIGINALLELEID
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [CHROM, POS, ID, REF, ALT, RS, RS_prefixed, CLNSIG, GENEINFO, CLNVC, CLNREVSTAT, ORIGIN, ALLELEID]\n", + "Index: []" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# 2) Both > 1 AND the same length\n", "df_brca2[\n", @@ -805,20 +2022,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "94b2913b-979e-408b-b5f7-be93d14b0410", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== CLNREVSTAT (Review Status) ===\n", + " CLNREVSTAT count\n", + "reviewed_by_expert_panel 624\n", + "\n", + "\n", + "=== CLNSIG (Clinical Significance) ===\n", + " CLNSIG count\n", + "Pathogenic 624\n", + "\n", + "\n", + "=== CLNVC (Variant Class) ===\n", + " CLNVC count\n", + "single_nucleotide_variant 624\n", + "\n", + "\n" + ] + } + ], "source": [ "summarize_clinvar_counts(df_brca2)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "865b4784-5ea5-47b6-af60-fb12de70d60c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[clinvar] exported 624 rows to ./work/brca2_clinvar.tsv\n" + ] + } + ], "source": [ "export_df = export_clinvar_tsv(df_brca2, \"./work/brca2_clinvar.tsv\")" ] @@ -1217,9 +2464,762 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "73856a12-509a-4eb6-a3ae-1e773e1d4989", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 503 ATM variants (now including Duplications)\n" + ] + } + ], + "source": [ + "df_atm = clinvar_df_for_gene_filtered(\n", + " vcf_path=vcf_path,\n", + " gene_name=\"ATM\",\n", + " allowed_types=[\"single_nucleotide_variant\"],\n", + " allowed_clnsig=[\"Pathogenic\"],\n", + " allowed_revstat=[\"practice_guideline\", \"reviewed_by_expert_panel\"]\n", + 
")\n", + "print(f\"Found {len(df_brca1)} ATM variants (now including Duplications)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "e8a55c16-ff93-44b9-9e21-8867277e1713", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CHROMPOSIDREFALTRSRS_prefixedCLNSIGGENEINFOCLNVCCLNREVSTATORIGINALLELEID
011108227626187275TC786203606rs786203606PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1183068
111108227691232248CT746235533rs746235533PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel3233887
211108235669231535GA747855862rs747855862PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1233910
311108235805634428GA876658159rs876658159PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel5622397
411108244873216024CT772821016rs772821016PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1212837
511108250861233553CT876660485rs876660485PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1233978
611108250907453367TG1555070980rs1555070980PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1461289
711108251073220555GT772926890rs772926890PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1222042
8111082574794056347ATNoneNonePathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel14170285
911108259022216021CT780619951rs780619951PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel19212851
1011108272531231277GA750663117rs750663117PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1234071
1111108272556556315TG780240314rs780240314PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1545982
1211108272782407699GT1060501687rs1060501687PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1397887
1311108279555185137CT786201957rs786201957PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1183231
14111082811683035GA587776551rs587776551PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel118074
1511108289683407482AT1060501551rs1060501551PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1397772
1611108299886857860GC1131691159rs1131691159PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1852591
1711108301698127403CT587779844rs587779844PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1132860
1811108301706407510GA1060501571rs1060501571PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1398353
1911108304693189177CT786204751rs786204751PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel1186801
2011108307914482526CT775036118rs775036118PathogenicATM:472single_nucleotide_variantreviewed_by_expert_panel3475938
21111083091103021AG774925473rs774925473PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel118060
22111083101592780229GC2136011029rs2136011029PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel12943970
2311108317386826252TA1591789046rs1591789046PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel1810562
24111083199781713223CG904589402rs904589402PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel11770735
2511108325544135775GA587780638rs587780638PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel1139487
2611108327643142355AC587782403rs587782403PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel1152069
27111083292023023TG28904921rs28904921PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel1118062
2811108335959135780AT371638537rs371638537PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel17139492
2911108343260189104GA778269655rs778269655PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel1186804
3011108345797140907CT587781363rs587781363PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel1150621
3111108347277407718AC1060501700rs1060501700PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel1398370
3211108353881127463GA17174393rs17174393PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel1132920
33111083654763029CT121434219rs121434219PathogenicATM:472|C11orf65:160140single_nucleotide_variantreviewed_by_expert_panel1918068
\n", + "
" + ], + "text/plain": [ + " CHROM POS ID REF ALT RS RS_prefixed CLNSIG \\\n", + "0 11 108227626 187275 T C 786203606 rs786203606 Pathogenic \n", + "1 11 108227691 232248 C T 746235533 rs746235533 Pathogenic \n", + "2 11 108235669 231535 G A 747855862 rs747855862 Pathogenic \n", + "3 11 108235805 634428 G A 876658159 rs876658159 Pathogenic \n", + "4 11 108244873 216024 C T 772821016 rs772821016 Pathogenic \n", + "5 11 108250861 233553 C T 876660485 rs876660485 Pathogenic \n", + "6 11 108250907 453367 T G 1555070980 rs1555070980 Pathogenic \n", + "7 11 108251073 220555 G T 772926890 rs772926890 Pathogenic \n", + "8 11 108257479 4056347 A T None None Pathogenic \n", + "9 11 108259022 216021 C T 780619951 rs780619951 Pathogenic \n", + "10 11 108272531 231277 G A 750663117 rs750663117 Pathogenic \n", + "11 11 108272556 556315 T G 780240314 rs780240314 Pathogenic \n", + "12 11 108272782 407699 G T 1060501687 rs1060501687 Pathogenic \n", + "13 11 108279555 185137 C T 786201957 rs786201957 Pathogenic \n", + "14 11 108281168 3035 G A 587776551 rs587776551 Pathogenic \n", + "15 11 108289683 407482 A T 1060501551 rs1060501551 Pathogenic \n", + "16 11 108299886 857860 G C 1131691159 rs1131691159 Pathogenic \n", + "17 11 108301698 127403 C T 587779844 rs587779844 Pathogenic \n", + "18 11 108301706 407510 G A 1060501571 rs1060501571 Pathogenic \n", + "19 11 108304693 189177 C T 786204751 rs786204751 Pathogenic \n", + "20 11 108307914 482526 C T 775036118 rs775036118 Pathogenic \n", + "21 11 108309110 3021 A G 774925473 rs774925473 Pathogenic \n", + "22 11 108310159 2780229 G C 2136011029 rs2136011029 Pathogenic \n", + "23 11 108317386 826252 T A 1591789046 rs1591789046 Pathogenic \n", + "24 11 108319978 1713223 C G 904589402 rs904589402 Pathogenic \n", + "25 11 108325544 135775 G A 587780638 rs587780638 Pathogenic \n", + "26 11 108327643 142355 A C 587782403 rs587782403 Pathogenic \n", + "27 11 108329202 3023 T G 28904921 rs28904921 Pathogenic \n", + "28 11 108335959 135780 A T 
371638537 rs371638537 Pathogenic \n", + "29 11 108343260 189104 G A 778269655 rs778269655 Pathogenic \n", + "30 11 108345797 140907 C T 587781363 rs587781363 Pathogenic \n", + "31 11 108347277 407718 A C 1060501700 rs1060501700 Pathogenic \n", + "32 11 108353881 127463 G A 17174393 rs17174393 Pathogenic \n", + "33 11 108365476 3029 C T 121434219 rs121434219 Pathogenic \n", + "\n", + " GENEINFO CLNVC \\\n", + "0 ATM:472 single_nucleotide_variant \n", + "1 ATM:472 single_nucleotide_variant \n", + "2 ATM:472 single_nucleotide_variant \n", + "3 ATM:472 single_nucleotide_variant \n", + "4 ATM:472 single_nucleotide_variant \n", + "5 ATM:472 single_nucleotide_variant \n", + "6 ATM:472 single_nucleotide_variant \n", + "7 ATM:472 single_nucleotide_variant \n", + "8 ATM:472 single_nucleotide_variant \n", + "9 ATM:472 single_nucleotide_variant \n", + "10 ATM:472 single_nucleotide_variant \n", + "11 ATM:472 single_nucleotide_variant \n", + "12 ATM:472 single_nucleotide_variant \n", + "13 ATM:472 single_nucleotide_variant \n", + "14 ATM:472 single_nucleotide_variant \n", + "15 ATM:472 single_nucleotide_variant \n", + "16 ATM:472 single_nucleotide_variant \n", + "17 ATM:472 single_nucleotide_variant \n", + "18 ATM:472 single_nucleotide_variant \n", + "19 ATM:472 single_nucleotide_variant \n", + "20 ATM:472 single_nucleotide_variant \n", + "21 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "22 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "23 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "24 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "25 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "26 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "27 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "28 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "29 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "30 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "31 ATM:472|C11orf65:160140 
single_nucleotide_variant \n", + "32 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "33 ATM:472|C11orf65:160140 single_nucleotide_variant \n", + "\n", + " CLNREVSTAT ORIGIN ALLELEID \n", + "0 reviewed_by_expert_panel 1 183068 \n", + "1 reviewed_by_expert_panel 3 233887 \n", + "2 reviewed_by_expert_panel 1 233910 \n", + "3 reviewed_by_expert_panel 5 622397 \n", + "4 reviewed_by_expert_panel 1 212837 \n", + "5 reviewed_by_expert_panel 1 233978 \n", + "6 reviewed_by_expert_panel 1 461289 \n", + "7 reviewed_by_expert_panel 1 222042 \n", + "8 reviewed_by_expert_panel 1 4170285 \n", + "9 reviewed_by_expert_panel 19 212851 \n", + "10 reviewed_by_expert_panel 1 234071 \n", + "11 reviewed_by_expert_panel 1 545982 \n", + "12 reviewed_by_expert_panel 1 397887 \n", + "13 reviewed_by_expert_panel 1 183231 \n", + "14 reviewed_by_expert_panel 1 18074 \n", + "15 reviewed_by_expert_panel 1 397772 \n", + "16 reviewed_by_expert_panel 1 852591 \n", + "17 reviewed_by_expert_panel 1 132860 \n", + "18 reviewed_by_expert_panel 1 398353 \n", + "19 reviewed_by_expert_panel 1 186801 \n", + "20 reviewed_by_expert_panel 3 475938 \n", + "21 reviewed_by_expert_panel 1 18060 \n", + "22 reviewed_by_expert_panel 1 2943970 \n", + "23 reviewed_by_expert_panel 1 810562 \n", + "24 reviewed_by_expert_panel 1 1770735 \n", + "25 reviewed_by_expert_panel 1 139487 \n", + "26 reviewed_by_expert_panel 1 152069 \n", + "27 reviewed_by_expert_panel 11 18062 \n", + "28 reviewed_by_expert_panel 17 139492 \n", + "29 reviewed_by_expert_panel 1 186804 \n", + "30 reviewed_by_expert_panel 1 150621 \n", + "31 reviewed_by_expert_panel 1 398370 \n", + "32 reviewed_by_expert_panel 1 132920 \n", + "33 reviewed_by_expert_panel 19 18068 " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_atm" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "fa39484a-c4c3-4c7c-a3fe-4dc6ac1c4a3b", + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "[clinvar] exported 34 rows to ./work/atm_clinvar.tsv\n" + ] + } + ], + "source": [ + "export_df = export_clinvar_tsv(df_atm, \"./work/atm_clinvar.tsv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "322d2c3b-9523-4f67-bee0-c3c4398cf85b", + "metadata": {}, "outputs": [], "source": [] }