diff --git a/CHANGELOG.md b/CHANGELOG.md index 72868f7e..abff0188 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- **`currency` column on the XBRL facts DataFrame** — `xbrl().facts.to_dataframe()` now includes a `currency` column with each fact's ISO 4217 code (e.g. `USD`, `HKD`) resolved from its unit measure. Non-USD filers tag monetary facts with opaque unit ids such as `UNIT_STANDARD_HKD_MNUSOXGRF0O9R60JINVDUQ`, which were exposed verbatim in `unit_ref` and made currency-based filtering and display unreliable; the raw `unit_ref` is preserved, while `currency` gives a usable code. Per-share monetary units report their numerator currency, and non-monetary units (shares, pure, custom) resolve to `None` rather than a misleading value. ([#850](https://github.com/dgunning/edgartools/issues/850)) + ### Fixed - **TTM Q4 derivation no longer produces wrong/negative values for discrete-quarter reporters** — when a concept is reported as discrete quarters with no cumulative 9-month YTD fact (common for BDCs and investment companies), `TTMCalculator` derives Q4 as `FY - (Q1+Q2+Q3)`. It previously selected the three input quarters by their `fiscal_period` label, but the SEC tags comparative facts in re-filings with the *filing's* fiscal period, so the same calendar quarter could appear labeled Q1, Q2 and Q3 across successive 10-Qs — producing a wrong, often negative Q4 (e.g. GAIN `InvestmentCompanyDividendDistribution`: `57.2M - 3×28.8M = -29.2M`). Quarters are now selected by distinct calendar period (dedup by `period_end`, latest periodic filing wins), and derivation is skipped when a discrete Q4 is already reported. This affects `quarterize()`, TTM calculations, and quarterly statement views. 
([#848](https://github.com/dgunning/edgartools/issues/848)) diff --git a/edgar/xbrl/facts.py b/edgar/xbrl/facts.py index a344d81b..2d7f34d2 100644 --- a/edgar/xbrl/facts.py +++ b/edgar/xbrl/facts.py @@ -43,6 +43,36 @@ def _deduplicate_facts(df: pd.DataFrame) -> pd.DataFrame: return df +def _iso4217_code(measure: Optional[str]) -> Optional[str]: + """Return the ISO 4217 code of an ``iso4217:`` unit measure, else ``None``.""" + if measure and measure.startswith('iso4217:'): + return measure[len('iso4217:'):] + return None + + +def _unit_currency(unit_info: Optional[Dict[str, Any]]) -> Optional[str]: + """Resolve a parsed XBRL unit to its ISO 4217 currency code, or ``None``. + + Currency facts use a simple ``iso4217:`` measure (e.g. ``iso4217:HKD`` -> + ``HKD``). Per-share monetary facts use a ``divide`` unit whose numerator is + the currency (e.g. ``iso4217:USD`` per ``xbrli:shares``), so the numerator + currency is returned. Non-monetary units (shares, pure, ...) return ``None``. + The opaque ``unit_ref`` id itself (e.g. ``UNIT_STANDARD_HKD_...``) is never + parsed -- only the resolved measure is used (see issue #850). + """ + if not unit_info: + return None + unit_type = unit_info.get('type') + if unit_type == 'simple': + return _iso4217_code(unit_info.get('measure')) + if unit_type == 'divide': + for measure in unit_info.get('numerator', []): + code = _iso4217_code(measure) + if code: + return code + return None + + class FactQuery: """ A query builder for XBRL facts that enables filtering by various attributes. @@ -1005,6 +1035,10 @@ def get_facts(self) -> List[Dict[str, Any]]: # Build enriched facts from raw facts, contexts, and elements enriched_facts = [] + # Resolve each fact's opaque unit_ref to its ISO 4217 currency once + # (e.g. "UNIT_STANDARD_HKD_..." -> "HKD"); see issue #850. 
+ units = self.xbrl.units + for fact_key, fact in self.xbrl._facts.items(): # Create a dict with only necessary fields instead of full model_dump fact_dict = { @@ -1014,6 +1048,7 @@ def get_facts(self) -> List[Dict[str, Any]]: 'context_ref': fact.context_ref, 'value': fact.value, 'unit_ref': fact.unit_ref, + 'currency': _unit_currency(units.get(fact.unit_ref)) if fact.unit_ref else None, 'decimals': fact.decimals, 'numeric_value': fact.numeric_value } diff --git a/tests/issues/regression/test_issue_850.py b/tests/issues/regression/test_issue_850.py new file mode 100644 index 00000000..8a1c6575 --- /dev/null +++ b/tests/issues/regression/test_issue_850.py @@ -0,0 +1,87 @@ +""" +Regression test for Issue #850: Normalize currency unit identifiers to ISO 4217. + +Problem: ``xbrl().facts.to_dataframe()`` exposed the raw XBRL ``unit_ref`` id for +every fact. For non-USD filers the unit id is an opaque token such as +``UNIT_STANDARD_HKD_MNUSOXGRF0O9R60JINVDUQ`` instead of a usable currency, which +made currency-based filtering and display unreliable for foreign companies. + +Fix: the facts DataFrame now carries a ``currency`` column that resolves each +fact's unit to its ISO 4217 code (e.g. ``HKD``) via the parsed unit measure +(``iso4217:HKD``). The opaque ``unit_ref`` is preserved unchanged; non-monetary +units (shares, pure, custom) resolve to ``None`` rather than a misleading value. + +Reporter: warzoo +See: https://github.com/dgunning/edgartools/issues/850 +""" + +from pathlib import Path + +from edgar.xbrl import XBRL + +# A minimal XBRL instance whose currency unit uses the opaque id from the issue. 
# NOTE(review): the element markup of this fixture was not present in the
# reviewed text -- only the values (CIK, period dates, measure, fact value)
# survived. The tags, namespaces, context id, concept name and ``decimals``
# below are reconstructed around those values; confirm against the intended
# fixture before relying on them.
FOREIGN_INSTANCE = """<?xml version="1.0" encoding="UTF-8"?>
<xbrl xmlns="http://www.xbrl.org/2003/instance" xmlns:iso4217="http://www.xbrl.org/2003/iso4217" xmlns:us-gaap="http://fasb.org/us-gaap/2023">
  <context id="c1">
    <entity><identifier scheme="http://www.sec.gov/CIK">0001234567</identifier></entity>
    <period><startDate>2024-01-01</startDate><endDate>2024-12-31</endDate></period>
  </context>
  <unit id="UNIT_STANDARD_HKD_MNUSOXGRF0O9R60JINVDUQ">
    <measure>iso4217:HKD</measure>
  </unit>
  <us-gaap:Revenues contextRef="c1" unitRef="UNIT_STANDARD_HKD_MNUSOXGRF0O9R60JINVDUQ" decimals="0">1500000</us-gaap:Revenues>
</xbrl>
"""


def test_unit_currency_helper():
    """``_unit_currency`` extracts the ISO 4217 code, ignoring non-currency units."""
    from edgar.xbrl.facts import _unit_currency

    assert _unit_currency({"type": "simple", "measure": "iso4217:HKD"}) == "HKD"
    assert _unit_currency({"type": "simple", "measure": "iso4217:USD"}) == "USD"
    # Per-share monetary units (currency / shares) report the numerator currency.
    assert _unit_currency({"type": "divide", "numerator": ["iso4217:USD"], "denominator": ["xbrli:shares"]}) == "USD"
    # Non-monetary or unknown units resolve to None, not a misleading value.
    assert _unit_currency({"type": "simple", "measure": "shares"}) is None
    assert _unit_currency({"type": "simple", "measure": "xbrli:pure"}) is None
    assert _unit_currency(None) is None


def test_opaque_unit_ref_resolves_to_iso4217_currency(tmp_path):
    """The issue's repro: an opaque HKD unit id resolves to ``HKD`` in ``currency``."""
    instance_file = tmp_path / "foreign.xml"
    # Write with the encoding the XML declaration announces, independent of
    # the platform's default text encoding.
    instance_file.write_text(FOREIGN_INSTANCE, encoding="utf-8")

    xbrl = XBRL.from_files(instance_file=instance_file)
    df = xbrl.facts.to_dataframe()

    assert "currency" in df.columns
    revenue = df[df["concept"].str.contains("Revenue", case=False, na=False)]
    assert len(revenue) == 1
    # The opaque unit_ref is preserved unchanged ...
    assert revenue["unit_ref"].iloc[0] == "UNIT_STANDARD_HKD_MNUSOXGRF0O9R60JINVDUQ"
    # ... and the currency is now the usable ISO 4217 code.
    assert revenue["currency"].iloc[0] == "HKD"


def test_currency_column_ground_truth_aapl():
    """A real filing (AAPL 10-K) resolves monetary facts to their currency codes."""
    aapl = XBRL.from_directory(Path("tests/fixtures/xbrl/aapl/10k_2023"))
    df = aapl.facts.to_dataframe()

    assert "currency" in df.columns
    # USD-denominated facts resolve to "USD" (not the raw "usd" unit id) ...
    usd = df[df["currency"] == "USD"]
    assert len(usd) > 100
    assert (usd["unit_ref"] == "usd").any()
    # ... and per-share amounts (unit_ref "usdPerShare") also resolve to USD.
    assert (df.loc[df["unit_ref"] == "usdPerShare", "currency"] == "USD").all()
    # Silence check: share-count facts are not a currency, so currency is None.
    shares = df[df["unit_ref"] == "shares"]
    assert len(shares) > 0
    assert shares["currency"].isna().all()