Skip to content

Commit 3ac8308

Browse files
feat: citation generator and updated pipeline
1 parent d72455d commit 3ac8308

7 files changed

Lines changed: 612 additions & 551 deletions

File tree

data/annotations/PMC11730665.json

Lines changed: 115 additions & 106 deletions
Large diffs are not rendered by default.

data/annotations/PMC5728534.json

Lines changed: 82 additions & 405 deletions
Large diffs are not rendered by default.

notebooks/table_generation.ipynb

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -255,11 +255,11 @@
255255
],
256256
"source": [
257257
"# Test the new PharmacogenomicTableGenerator component\n",
258-
"from src.components.pharmacogenomic_table import PharmacogenomicTableGenerator\n",
258+
"from src.components.annotation_table import AnnotationTableGenerator\n",
259259
"import json\n",
260260
"\n",
261261
"# Initialize the generator\n",
262-
"table_generator = PharmacogenomicTableGenerator(pmcid=\"PMC11730665\", model=\"gpt-4.1\")\n",
262+
"table_generator = AnnotationTableGenerator(pmcid=\"PMC11730665\", model=\"gpt-4.1\")\n",
263263
"\n",
264264
"# Generate structured JSON directly\n",
265265
"print(\"=== Generating structured JSON directly ===\")\n",
@@ -273,8 +273,54 @@
273273
]
274274
},
275275
{
276-
"cell_type": "markdown",
276+
"cell_type": "code",
277+
"execution_count": 14,
278+
"metadata": {},
279+
"outputs": [
280+
{
281+
"name": "stderr",
282+
"output_type": "stream",
283+
"text": [
284+
"Generating 1 Responses: 0%| | 0/1 [00:00<?, ?it/s]\u001b[32m2025-07-29 10:09:48.996\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msrc.article_parser\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m43\u001b[0m - \u001b[1mGetting article text from PMCID: PMC5728534\u001b[0m\n",
285+
"\u001b[32m2025-07-29 10:09:48.999\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msrc.article_parser\u001b[0m:\u001b[36mremove_references_section\u001b[0m:\u001b[36m90\u001b[0m - \u001b[1mRemoved References section from article text\u001b[0m\n",
286+
"Generating 1 Responses: 100%|██████████| 1/1 [00:11<00:00, 11.16s/it]"
287+
]
288+
},
289+
{
290+
"name": "stdout",
291+
"output_type": "stream",
292+
"text": [
293+
"| Gene | Polymorphism | Relationship/Effect | p-value |\n",
294+
"|------|-------------|---------------------|---------|\n",
295+
"| OCT1 (SLC22A1) | OCT1*5, OCT1*6, OCT1*12, OCT1*13 | These alleles completely lacked ranitidine uptake (loss of function for ranitidine transport). | Not explicitly stated, but effect is described as complete loss. |\n",
296+
"| OCT1 (SLC22A1) | OCT1*2, OCT1*3, OCT1*4, OCT1*10 | These alleles had vmax values for ranitidine uptake decreased by more than 50% compared to reference. | *OCT1*2: p<0.001; *OCT1*3: p<0.001; *OCT1*4: p<0.001; *OCT1*10: p<0.05 (from Table 2) |\n",
297+
"| OCT1 (SLC22A1) | OCT1*8 | This allele showed an increase of vmax for ranitidine uptake by 25% (not statistically significant). | P = 0.5 (not significant) |\n",
298+
"| OCT1 (SLC22A1) | OCT1*1A, OCT1*1C, OCT1*1D, OCT1*7, OCT1*9, OCT1*11 | These alleles showed no significant difference in ranitidine uptake compared to the reference allele. | Not significant (exact values not given) |\n",
299+
"| OCT1 (SLC22A1) | OCT1*2 (Met420del) | Ranitidine was on average two-fold more potent in inhibiting morphine uptake in the OCT1*2 allele compared to the reference allele (IC50 for morphine uptake: 19.5 μM for OCT1*2 vs 45.5 μM for reference). | Not explicitly stated, but difference is described as significant. |\n",
300+
"| OCT2 (SLC22A2) | Ala270Ser | OCT2 showed only a limited uptake of ranitidine that was not significantly affected by the Ala270Ser polymorphism (9% reduction, not significant). | Not significant (exact value not given) |\n"
301+
]
302+
},
303+
{
304+
"name": "stderr",
305+
"output_type": "stream",
306+
"text": [
307+
"\n"
308+
]
309+
}
310+
],
311+
"source": [
312+
"from src.components.annotation_table import AnnotationTableGenerator\n",
313+
"import json\n",
314+
"table_gen_2 = AnnotationTableGenerator(pmcid=\"PMC5728534\", model=\"gpt-4.1\")\n",
315+
"\n",
316+
"table_gen_2.print_table_markdown(table_gen_2.generate_table_json())"
317+
]
318+
},
319+
{
320+
"cell_type": "code",
321+
"execution_count": null,
277322
"metadata": {},
323+
"outputs": [],
278324
"source": []
279325
}
280326
],

pixi.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ download-articles = "python -m src.fetch_articles.article_downloader"
1818
download-data = "gdown --fuzzy https://drive.google.com/file/d/1qtQWvi0x_k5_JofgrfsgkWzlIdb6isr9/view && unzip autogkb-data.zip && rm autogkb-data.zip"
1919
setup-repo = "pixi install && pixi run download-data"
2020
copy-markdown = "python -m src.copy_markdown"
21+
annotation-pipeline = "python -m src.components.annotation_pipeline"
2122

2223
[dependencies]
2324
seaborn = ">=0.13.2,<0.14"
Lines changed: 25 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,49 @@
1-
from src.components.all_associations import get_all_associations, AssociationType
2-
from src.components.drug_annotation import get_drug_annotation
3-
from src.components.phenotype_annotation import get_phenotype_annotation
4-
from src.components.functional_annotation import get_functional_annotation
1+
from src.components.annotation_table import AnnotationTableGenerator
2+
from src.components.citation_generator import CitationGenerator
53
from src.components.study_parameters import get_study_parameters
64
from src.utils import get_article_text, is_pmcid, get_title
7-
from typing import Optional
85
from loguru import logger
96
from pathlib import Path
107
import os
118

129
class AnnotationPipeline:
13-
def __init__(self, pmcid: str):
10+
def __init__(self, pmcid: str, citation_approach: str = "lm"):
1411
if not is_pmcid(pmcid):
1512
logger.error(f"Invalid PMCID: {pmcid}")
1613
self.pmcid = pmcid
14+
self.citation_approach = citation_approach
1715
self.article_text = get_article_text(pmcid)
1816
self.title = get_title(self.article_text)
19-
self.all_associations = []
2017
self.study_parameters = {}
21-
self.drug_annotations = []
22-
self.phenotye_annotations = []
23-
self.functional_annotations = []
18+
self.annotations = None
2419

2520
def print_info(self):
26-
logger.info(f"Found {len(self.all_associations)} associations")
27-
logger.info(f"Created {len(self.drug_annotations)} Drug Annotations")
28-
logger.info(f"Created {len(self.phenotye_annotations)} Phenotype Annotations")
29-
logger.info(
30-
f"Created {len(self.functional_annotations)} Functional Annotations"
31-
)
21+
annotation_count = len(self.annotations.relationships) if self.annotations else 0
22+
23+
logger.info(f"Created {annotation_count} Annotations")
3224

3325
def generate_final_structure(self):
3426
return {
3527
"pmcid": self.pmcid,
3628
"title": self.title,
3729
"study_parameters": self.study_parameters,
38-
"drug_annotations": self.drug_annotations,
39-
"phenotype_annotations": self.phenotye_annotations,
40-
"functional_annotations": self.functional_annotations,
30+
"annotations": self.annotations,
4131
}
4232

4333
def run(self, save_path: str = "data/annotations"):
4434
logger.info("Getting Study Parameters")
4535
self.study_parameters = get_study_parameters(self.article_text)
4636

47-
logger.info("Getting All Associations")
48-
self.all_associations = get_all_associations(self.article_text)
37+
# Generate annotations using AnnotationTableGenerator
38+
annotation_generator = AnnotationTableGenerator(self.pmcid)
39+
40+
logger.info("Generating Annotations")
41+
self.annotations = annotation_generator.generate_table_json()
4942

50-
for association in self.all_associations:
51-
if association.association_type == AssociationType.DRUG:
52-
self.drug_annotations.append(get_drug_annotation(association))
53-
if association.association_type == AssociationType.PHENOTYPE:
54-
self.phenotye_annotations.append(get_phenotype_annotation(association))
55-
if association.association_type == AssociationType.FUNCTIONAL:
56-
self.functional_annotations.append(
57-
get_functional_annotation(association)
58-
)
43+
# Generate citations for annotations
44+
citation_generator = CitationGenerator(self.pmcid, approach=self.citation_approach)
45+
logger.info(f"Adding Citations to Annotations using {self.citation_approach} approach")
46+
self.annotations = citation_generator.add_citations_to_annotations(self.annotations)
5947

6048
self.print_info()
6149

@@ -89,14 +77,14 @@ def copy_markdown(pmcid: str):
8977
if __name__ == "__main__":
9078
pmcids = [
9179
"PMC5728534",
92-
# "PMC11730665",
93-
# "PMC5712579",
94-
# "PMC4737107",
95-
# "PMC5749368"
80+
"PMC11730665",
81+
"PMC5712579",
82+
"PMC4737107",
83+
"PMC5749368"
9684
]
9785
for pmcid in pmcids:
9886
logger.info(f"Processing {pmcid}")
99-
pipeline = AnnotationPipeline(pmcid)
87+
pipeline = AnnotationPipeline(pmcid, citation_approach="local")
10088
pipeline.run()
101-
# for pmcid in pmcids:
102-
# copy_markdown(pmcid)
89+
for pmcid in pmcids:
90+
copy_markdown(pmcid)

src/components/annotation_table.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ class AnnotationRelationship(BaseModel):
99
polymorphism: str = Field(description="Genetic polymorphism/variant")
1010
relationship_effect: str = Field(description="Relationship or effect description")
1111
p_value: str = Field(description="Statistical p-value")
12+
citations: List[str] = Field(default=[], description="List of supporting sentences from the text")
1213

1314

1415
class AnnotationTable(BaseModel):

0 commit comments

Comments
 (0)