diff --git a/compass/scripts/download.py b/compass/scripts/download.py index 4d33b300..f888d490 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -687,13 +687,14 @@ async def filter_ordinance_docs( model_configs[LLMTasks.DEFAULT], ), ) + sources_as_str = "\n\t- ".join( + [doc.attrs.get("source", "Unknown source") for doc in docs] + ) logger.info( - "%d document(s) remaining after jurisdiction filter for %s\n\t- %s", + "%d document(s) remaining after jurisdiction filter for %s %s", len(docs), jurisdiction.full_name, - "\n\t- ".join( - [doc.attrs.get("source", "Unknown source") for doc in docs] - ), + f"\n\t- {sources_as_str}" if sources_as_str else "", ) COMPASS_PB.update_jurisdiction_task( diff --git a/compass/utilities/jurisdictions.py b/compass/utilities/jurisdictions.py index 1a305385..e728b79d 100644 --- a/compass/utilities/jurisdictions.py +++ b/compass/utilities/jurisdictions.py @@ -3,7 +3,8 @@ import logging from warnings import warn import importlib.resources -from functools import cached_property +from functools import cached_property, lru_cache +import unicodedata import numpy as np import pandas as pd @@ -125,6 +126,28 @@ def full_name_the_prefixed(self): return self.full_name + @cached_property + def short_name_with_state(self): + """str: Comma-separated short jurisdiction name""" + if self.subdivision_name: + name_parts = [self.full_subdivision_phrase] + else: + name_parts = [self.full_county_phrase] + + name_parts.append(self.state) + return ", ".join(filter(None, name_parts)) + + @cached_property + def short_name_with_state_the_prefixed(self): + """str: Short jurisdiction name maybe prefixed with ``the``""" + if not self.subdivision_name: + return self.short_name_with_state + + if self.type.casefold() in _JURISDICTION_TYPES_AS_PREFIXES: + return f"the {self.short_name_with_state}" + + return self.short_name_with_state + @cached_property def full_subdivision_phrase(self): """str: Subdivision phrase for the jurisdiction or empty str""" @@ -189,8 +212,16 @@ def load_all_jurisdiction_info(): Notes ----- Missing values are normalized to ``None`` to simplify downstream - serialization. + serialization. A shallow copy is returned on each call so callers + can safely mutate their local view without affecting the cached + source data. """ + return _load_all_jurisdiction_info_cached().copy() + + +@lru_cache(maxsize=1) +def _load_all_jurisdiction_info_cached(): + """Load and cache canonical jurisdiction metadata""" return pd.concat( pd.read_csv(fp).replace({np.nan: None}) for fp in KNOWN_JURISDICTIONS_REGISTRY @@ -227,6 +258,62 @@ def jurisdiction_websites(jurisdiction_info=None): } +def load_jurisdictions_from_subdivision_names( + subdivision_names, state=None, jurisdiction_info=None +): + """Load known jurisdictions matching subdivision names + + Parameters + ---------- + subdivision_names : str or iterable of str + One or more subdivision names to match against the canonical + jurisdiction data. + state : str, optional + Optional state name used to filter matching jurisdictions. + Matching uses the same normalized comparison applied to the + subdivision names. By default, ``None``. + jurisdiction_info : pandas.DataFrame, optional + DataFrame containing jurisdiction info. If ``None``, this info + is loaded using :func:`load_all_jurisdiction_info`. By default, + ``None``. + + Returns + ------- + pandas.DataFrame + Rows from the canonical jurisdiction data whose subdivision + names match the requested names. + """ + if jurisdiction_info is None: + jurisdiction_info = load_all_jurisdiction_info() + + if subdivision_names is None: + subdivision_names = [] + elif isinstance(subdivision_names, str): + subdivision_names = [subdivision_names] + + normalized_names = { + normalized_name + for name in subdivision_names + if (normalized_name := _normalize_jurisdiction_name(name)) + } + if not normalized_names: + return jurisdiction_info.iloc[0:0].copy() # empty df + + subdivision_mask = ( + jurisdiction_info["Subdivision"] + .map(_normalize_jurisdiction_name) + .isin(normalized_names) + ) + if state is not None: + normalized_state = _normalize_jurisdiction_name(state) + subdivision_mask &= ( + jurisdiction_info["State"].map(_normalize_jurisdiction_name) + == normalized_state + ) + + return jurisdiction_info[subdivision_mask].reset_index(drop=True) + + def load_jurisdictions_from_fp(jurisdiction_fp): """Load jurisdiction metadata for entries listed in a CSV file @@ -395,3 +482,12 @@ def _format_jurisdiction_df_for_output(df): def _build_merge_col(row, merge_cols): """Build column to merge jurisdiction DataFrames on""" return " ".join(str(row[c]).casefold() for c in merge_cols) + + +def _normalize_jurisdiction_name(name): + """Normalize jurisdiction names for resilient comparisons""" + if name is None or pd.isna(name): + return "" + + normalized_name = unicodedata.normalize("NFKD", str(name).casefold()) + return "".join(char for char in normalized_name if char.isalnum()) diff --git a/compass/validation/graphs.py b/compass/validation/graphs.py index f66029d2..89e635e7 100644 --- a/compass/validation/graphs.py +++ b/compass/validation/graphs.py @@ -5,6 +5,9 @@ llm_response_starts_with_yes, llm_response_starts_with_no, ) +from compass.utilities.jurisdictions import ( + load_jurisdictions_from_subdivision_names, +) def setup_graph_correct_document_type(**kwargs): @@ -193,6 +196,10 @@ def setup_graph_correct_document_type(**kwargs): "(e.g., 'Appendix,' 'Form,' or 'Application Template') as indicators " "of an unfinished draft. Many finalized ordinances and regulations " "include such templates for public or administrative use.\n" + "* Do **not** treat blank fields, sequences of underscores, blank " + "lines, or other similar placeholders for **dates** or **signatures** " + "as the sole indicator of draft status. Other signals must be present " + "in order to treat the document as a draft.\n" "\nFocus instead on signs of incompleteness or active " "editing, such as (but not limited to):\n" '* explicit labels: "DRAFT", "DRAFT VERSION", "NOT FINAL", "FOR ' @@ -314,6 +321,7 @@ def setup_graph_correct_jurisdiction_type(jurisdiction, **kwargs): ), ) + jur_name = jurisdiction.full_name_the_prefixed names_we_want = _jurisdiction_names_to_extract(jurisdiction) G.add_edge("init", "has_name", condition=llm_response_starts_with_yes) @@ -355,15 +363,14 @@ def setup_graph_correct_jurisdiction_type(jurisdiction, **kwargs): prompt=( "Based on the legal text, is there clear and specific " "evidence that the ordinance applies specifically to " - f"**{jurisdiction.full_name_the_prefixed}**? This could " + f"**{jur_name}**? This could " f"include a direct mention of **{jurisdiction.state}**, a " "title, heading, or citation indicating it's an ordinance for " f"{jurisdiction.state} state, or other language that " f"reasonably ties the text to {jurisdiction.full_name} " "specifically. Generic references such as 'the state' or " "'State Zoning Administrator' are not sufficient on their own " - "unless clearly linked to " - f"{jurisdiction.full_name_the_prefixed}. " + f"unless clearly linked to {jur_name}. " "{YES_NO_PROMPT}" ), ) @@ -409,7 +416,7 @@ def setup_graph_correct_jurisdiction_type(jurisdiction, **kwargs): prompt=( "Based on the legal text, is there clear and specific " "evidence that the ordinance applies specifically to " - f"**{jurisdiction.full_name_the_prefixed}**? This could " + f"**{jur_name}**? This could " f"include a direct mention of **{jurisdiction.county}**, " "a title, heading, or citation indicating it's an " f"ordinance for {jurisdiction.county} " @@ -419,7 +426,7 @@ def setup_graph_correct_jurisdiction_type(jurisdiction, **kwargs): f"{jurisdiction.type.casefold()}' or " f"'{jurisdiction.type} Zoning Administrator' are not " "sufficient on their own unless clearly linked to " - f"{jurisdiction.full_name_the_prefixed}. " + f"{jur_name}. " "{YES_NO_PROMPT}" ), ) @@ -431,9 +438,6 @@ def setup_graph_correct_jurisdiction_type(jurisdiction, **kwargs): node_to_connect = "is_county" if jurisdiction.subdivision_name: - # TODO: check known jurisdictions to see if duplicate names - # exist in the same state. If not, don't include county name in - # phrase G.add_edge( node_to_connect, "is_subdivision", @@ -466,23 +470,30 @@ def setup_graph_correct_jurisdiction_type(jurisdiction, **kwargs): "has_subdivision_name", condition=llm_response_starts_with_yes, ) + + num_other_jurisdictions_with_same_name = len( + load_jurisdictions_from_subdivision_names( + jurisdiction.subdivision_name, state=jurisdiction.state + ) + ) + if num_other_jurisdictions_with_same_name > 1: + jur_name = jurisdiction.short_name_with_state_the_prefixed + G.add_node( "has_subdivision_name", prompt=( "Based on the legal text, is there clear and specific " "evidence that the ordinance applies specifically to " - f"**{jurisdiction.full_name_the_prefixed}**? This could " - "include a direct mention of " + f"**{jur_name}**? This could include a direct mention of " f"**{jurisdiction.subdivision_name}**, " "a title, heading, or citation indicating it's an ordinance " f"for {jurisdiction.full_subdivision_phrase_the_prefixed}, " "or other language that reasonably ties the text to " - f"{jurisdiction.full_name_the_prefixed} specifically. " - "Generic references such as 'the " + f"{jur_name} specifically. Generic references such as 'the " f"{jurisdiction.type.casefold()}' or " f"'{jurisdiction.type} Zoning Administrator' are not " "sufficient on their own unless clearly linked to " - f"{jurisdiction.full_name_the_prefixed}. " + f"{jur_name}. " "{YES_NO_PROMPT}" ), ) @@ -498,8 +509,7 @@ def setup_graph_correct_jurisdiction_type(jurisdiction, **kwargs): "'correct_jurisdiction' key should be a boolean that is set to " "`true` **only if** it is reasonable to conclude that the " "provisions within apply to the entire area (i.e. " - f"{jurisdiction.type.casefold()}-wide) governed by " - f"**{jurisdiction.full_name_the_prefixed}** " + f"{jurisdiction.type.casefold()}-wide) governed by **{jur_name}** " "(`false` otherwise). The value of the 'explanation' key should " "be a string containing a brief explanation for your choice. " ), @@ -551,7 +561,16 @@ def setup_graph_correct_jurisdiction_from_url(jurisdiction, **kwargs): node_to_connect = "init" keys_to_collect = {"correct_state": f"{jurisdiction.state} state"} - if jurisdiction.county: + should_check_county = bool(jurisdiction.county) + if should_check_county and jurisdiction.subdivision_name: + num_other_jurisdictions_with_same_name = len( + load_jurisdictions_from_subdivision_names( + jurisdiction.subdivision_name, state=jurisdiction.state + ) + ) + should_check_county &= num_other_jurisdictions_with_same_name > 1 + + if should_check_county: G.add_edge( node_to_connect, "mentions_county", diff --git a/compass/validation/location.py b/compass/validation/location.py index cf8b8ac4..8697c804 100644 --- a/compass/validation/location.py +++ b/compass/validation/location.py @@ -6,9 +6,10 @@ import asyncio import logging - +from urllib.parse import urlsplit from compass.llm.calling import BaseLLMCaller, ChatLLMCaller, LLMCaller +from compass.utilities.jurisdictions import jurisdiction_websites from compass.common import setup_async_decision_tree, run_async_tree from compass.validation.graphs import ( setup_graph_correct_jurisdiction_type, @@ -86,6 +87,15 @@ async def check(self, url): if not url: return False + if _url_matches_known_jurisdiction_website(url, self.jurisdiction): + logger.debug( + "Skipping URL jurisdiction LLM check for %s because its " + "domain matches the canonical website for %s", + url, + self.jurisdiction, + ) + return True + chat_llm_caller = ChatLLMCaller( llm_service=self.llm_service, system_message=self.SYSTEM_MESSAGE, @@ -459,3 +469,34 @@ def _weighted_vote(out, raw_pages, doc_source): weights = max(weights, 1) return total / weights, num_verdicts + + +def _url_matches_known_jurisdiction_website(url, jurisdiction): + """Return whether URL domain matches canonical website""" + known_website = _known_jurisdiction_website(jurisdiction) + if not known_website: + return False + + url_domain = _normalize_domain(url) + known_domain = _normalize_domain(known_website) + if not url_domain or not known_domain: + return False + + return url_domain == known_domain or url_domain.endswith( + f".{known_domain}" + ) + + +def _known_jurisdiction_website(jurisdiction): + """Return a canonical website URL for a jurisdiction if available""" + if jurisdiction.website_url: + return jurisdiction.website_url + return jurisdiction_websites().get(jurisdiction.code) + + +def _normalize_domain(url): + """Return a comparable domain string for a URL or empty string""" + domain = urlsplit(url).netloc.partition(":")[0].casefold().strip() + if domain.startswith("www."): + return domain[4:] + return domain diff --git a/tests/python/unit/utilities/test_utilities_jurisdictions.py b/tests/python/unit/utilities/test_utilities_jurisdictions.py index b30f3e8d..8581b6f0 100644 --- a/tests/python/unit/utilities/test_utilities_jurisdictions.py +++ b/tests/python/unit/utilities/test_utilities_jurisdictions.py @@ -9,6 +9,7 @@ from compass.utilities.jurisdictions import ( load_all_jurisdiction_info, load_jurisdictions_from_fp, + load_jurisdictions_from_subdivision_names, jurisdiction_websites, jurisdictions_from_df, Jurisdiction, @@ -53,6 +54,21 @@ def test_load_all_jurisdictions(): assert "Rhode Island" in set(jurisdiction_info["State"]) +def test_load_all_jurisdictions_returns_shallow_copy(): + """Test cached jurisdiction info is returned as a caller-safe copy""" + + jurisdiction_info = load_all_jurisdiction_info() + county_col = jurisdiction_info.columns.get_loc("County") + original_county = jurisdiction_info.iat[0, county_col] + + jurisdiction_info.iat[0, county_col] = "Modified County" + + fresh_jurisdiction_info = load_all_jurisdiction_info() + + assert fresh_jurisdiction_info is not jurisdiction_info + assert fresh_jurisdiction_info.iat[0, county_col] == original_county + + def test_jurisdiction_websites(): """Test the `jurisdiction_websites` function""" @@ -66,6 +82,82 @@ def test_jurisdiction_websites(): assert 49003 in websites # Box Elder, Utah +def test_load_jurisdictions_from_subdivision_names_safe_matching(): + """Test subdivision lookup with normalization across symbols/spaces""" + + jurisdiction_info = pd.DataFrame( + { + "County": ["Apache", "Weld", "Rio Arriba", None, None], + "State": [ + "Arizona", + "Colorado", + "New Mexico", + "Texas", + "Texas", + ], + "Subdivision": [ + "St. Johns", + "St Johns", + "St-Johns", + "Barton Springs/Edwards", + None, + ], + "Jurisdiction Type": [ + "city", + "town", + "village", + "Aquifer Conservation District", + "county", + ], + "FIPS": [4001, 8001, 35039, 2, 48453], + "Website": [ + "https://stjohnsaz.gov", + "https://stjohnsco.gov", + "https://stjohnsnm.gov", + "https://www.bseacd.org", + "https://traviscountytx.gov", + ], + } + ) + + jurisdictions = load_jurisdictions_from_subdivision_names( + [" st johns ", "barton_springs edwards"], + jurisdiction_info=jurisdiction_info, + ) + + assert set(jurisdictions["FIPS"]) == {4001, 8001, 35039, 2} + assert None not in set(jurisdictions["Subdivision"]) + + +def test_load_jurisdictions_from_subdivision_names_state_filter(): + """Test subdivision lookup can be narrowed to a normalized state""" + + jurisdiction_info = pd.DataFrame( + { + "County": ["Apache", "Weld", "Rio Arriba"], + "State": ["Arizona", "Colorado", "New Mexico"], + "Subdivision": ["St. Johns", "St Johns", "St-Johns"], + "Jurisdiction Type": ["city", "town", "village"], + "FIPS": [4001, 8001, 35039], + "Website": [ + "https://stjohnsaz.gov", + "https://stjohnsco.gov", + "https://stjohnsnm.gov", + ], + } + ) + + jurisdictions = load_jurisdictions_from_subdivision_names( + "st johns", + state="new-mexico", + jurisdiction_info=jurisdiction_info, + ) + + assert len(jurisdictions) == 1 + assert jurisdictions.iloc[0]["State"] == "New Mexico" + assert jurisdictions.iloc[0]["FIPS"] == 35039 + + def test_load_jurisdictions_from_fp(tmp_path): """Test `load_jurisdictions_from_fp` function""" @@ -398,6 +490,51 @@ def test_full_name_the_prefixed_property(): ) +def test_short_name_with_state_property(): + """Test ``Jurisdiction.short_name_with_state`` property""" + + state = Jurisdiction("state", state="Colorado") + assert state.short_name_with_state == "Colorado" + + county = Jurisdiction("county", state="Colorado", county="Jefferson") + assert county.short_name_with_state == "Jefferson County, Colorado" + + city = Jurisdiction( + "city", state="Colorado", county="Jefferson", subdivision_name="Golden" + ) + assert city.short_name_with_state == "City of Golden, Colorado" + + +def test_short_name_with_state_the_prefixed_property(): + """Test ``Jurisdiction.short_name_with_state_the_prefixed`` property""" + + state = Jurisdiction("state", state="Colorado") + assert state.short_name_with_state_the_prefixed == "Colorado" + + county = Jurisdiction("county", state="Colorado", county="Jefferson") + assert county.short_name_with_state_the_prefixed == ( + "Jefferson County, Colorado" + ) + + city = Jurisdiction( + "city", state="Colorado", county="Jefferson", subdivision_name="Golden" + ) + assert ( + city.short_name_with_state_the_prefixed + == "the City of Golden, Colorado" + ) + + parish = Jurisdiction( + "parish", + state="Louisiana", + county="Assumption", + subdivision_name="Napoleonville", + ) + assert parish.short_name_with_state_the_prefixed == ( + "Napoleonville Parish, Louisiana" + ) + + def test_full_subdivision_phrase_the_prefixed_property(): """Test ``Jurisdiction.full_subdivision_phrase_the_prefixed`` property""" diff --git a/tests/python/unit/validation/test_validation_graphs.py b/tests/python/unit/validation/test_validation_graphs.py index 9e99bb50..4a4bc7f7 100644 --- a/tests/python/unit/validation/test_validation_graphs.py +++ b/tests/python/unit/validation/test_validation_graphs.py @@ -214,6 +214,44 @@ def test_setup_graph_correct_jurisdiction_from_url_city(): ) graph = setup_graph_correct_jurisdiction_from_url(loc) + assert set(graph.nodes) == { + "init", + "mentions_city", + "final", + } + assert list(graph.edges) == [ + ("init", "mentions_city"), + ("mentions_city", "final"), + ] + + assert f"{loc.state} state" in graph.nodes["init"]["prompt"] + + assert ( + f"the {loc.full_subdivision_phrase}" + in graph.nodes["mentions_city"]["prompt"] + ) + + assert "correct_state" in graph.nodes["final"]["prompt"] + assert f"{loc.state} state" in graph.nodes["final"]["prompt"] + + assert "correct_county" not in graph.nodes["final"]["prompt"] + assert loc.full_county_phrase not in graph.nodes["final"]["prompt"] + + assert "correct_city" in graph.nodes["final"]["prompt"] + assert loc.full_subdivision_phrase in graph.nodes["final"]["prompt"] + + +def test_setup_graph_correct_duplicate_jurisdiction_from_url_city(): + """Test setting up URL jurisdiction validation graph for city""" + + loc = Jurisdiction( + "borough", + state="Pennsylvania", + county="Columbia", + subdivision_name="Ashland", # multiple of these in PA + ) + graph = setup_graph_correct_jurisdiction_from_url(loc) + assert set(graph.nodes) == { "init", "mentions_county", @@ -245,7 +283,7 @@ def test_setup_graph_correct_jurisdiction_from_url_city(): assert "correct_county" in graph.nodes["final"]["prompt"] assert loc.full_county_phrase in graph.nodes["final"]["prompt"] - assert "correct_city" in graph.nodes["final"]["prompt"] + assert "correct_borough" in graph.nodes["final"]["prompt"] assert loc.full_subdivision_phrase in graph.nodes["final"]["prompt"] diff --git a/tests/python/unit/validation/test_validation_location.py b/tests/python/unit/validation/test_validation_location.py index 5e7650e8..3328578b 100644 --- a/tests/python/unit/validation/test_validation_location.py +++ b/tests/python/unit/validation/test_validation_location.py @@ -1,5 +1,6 @@ """Test COMPASS Ordinance location validation tests""" +import asyncio import os from pathlib import Path @@ -9,6 +10,8 @@ from elm.utilities.parse import read_pdf_ocr from compass.utilities.jurisdictions import Jurisdiction +from compass.utilities.jurisdictions import jurisdictions_from_df +from compass.utilities.jurisdictions import load_all_jurisdiction_info from compass.validation.location import ( JurisdictionValidator, DTreeJurisdictionValidator, @@ -86,6 +89,103 @@ async def test_url_matches_county(oai_llm_service, loc, url, truth): assert out == truth +@pytest.mark.asyncio +async def test_url_matches_known_jurisdiction_website_skips_llm(monkeypatch): + """Test URL validation passes when canonical website domain matches""" + jurisdiction_info = load_all_jurisdiction_info() + jurisdiction = next( + jur + for jur in jurisdictions_from_df(jurisdiction_info) + if jur.website_url + ) + website_url = jurisdiction.website_url + jurisdiction = Jurisdiction( + jurisdiction.type, + state=jurisdiction.state, + county=jurisdiction.county, + subdivision_name=jurisdiction.subdivision_name, + code=jurisdiction.code, + ) + url = f"{website_url.rstrip('/')}/ordinances/test.pdf" + + async def _should_not_run(*args, **kwargs): + await asyncio.sleep(0) + raise AssertionError("LLM validation should have been skipped") + + monkeypatch.setattr( + "compass.validation.location.run_async_tree", + _should_not_run, + ) + + url_validator = DTreeURLJurisdictionValidator( + jurisdiction, llm_service=object() + ) + assert await url_validator.check(url) + + +@pytest.mark.asyncio +async def test_http_url_matches_known_jurisdiction_website_skips_llm( + monkeypatch, +): + """Test URL validation passes when canonical website domain matches""" + jurisdiction = Jurisdiction( + "county", + state="Indiana", + county="Decatur", + website_url="https://www.decaturcounty.in.gov", + ) + url = "https://www.decaturcounty.in.gov/ordinances/test.pdf" + + async def _should_not_run(*args, **kwargs): + await asyncio.sleep(0) + raise AssertionError("LLM validation should have been skipped") + + monkeypatch.setattr( + "compass.validation.location.run_async_tree", + _should_not_run, + ) + + url_validator = DTreeURLJurisdictionValidator( + jurisdiction, llm_service=object() + ) + assert await url_validator.check(url) + + +@pytest.mark.asyncio +async def test_url_mismatch_falls_back_to_llm(monkeypatch): + """Test URL validation still uses the LLM when domains differ""" + jurisdiction = Jurisdiction( + "county", + state="Indiana", + county="Decatur", + website_url="https://www.decaturcounty.in.gov", + ) + calls = {"count": 0} + + def _fake_tree(*args, **kwargs): + return object() + + async def _fake_run_async_tree(tree, response_as_json=True): + await asyncio.sleep(0) + calls["count"] += 1 + return {"correct_jurisdiction": True} + + monkeypatch.setattr( + "compass.validation.location.setup_async_decision_tree", + _fake_tree, + ) + monkeypatch.setattr( + "compass.validation.location.run_async_tree", + _fake_run_async_tree, + ) + + url_validator = DTreeURLJurisdictionValidator( + jurisdiction, llm_service=object() + ) + assert await url_validator.check("https://example.com/doc.pdf") + assert calls["count"] == 1 + + @flaky(max_runs=3, min_passes=1) @pytest.mark.skipif(SHOULD_SKIP, reason="requires Azure OpenAI key") @pytest.mark.asyncio