Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- **`currency` column on the XBRL facts DataFrame** — `xbrl().facts.to_dataframe()` now includes a `currency` column with each fact's ISO 4217 code (e.g. `USD`, `HKD`) resolved from its unit measure. Non-USD filers can tag monetary facts with opaque unit ids such as `UNIT_STANDARD_HKD_MNUSOXGRF0O9R60JINVDUQ`; these were previously exposed only verbatim in `unit_ref`, which made currency-based filtering and display unreliable. The raw `unit_ref` is preserved unchanged, while `currency` gives a usable code. Per-share monetary units report their numerator currency, and non-monetary units (shares, pure, custom) resolve to `None` rather than a misleading value. ([#850](https://github.com/dgunning/edgartools/issues/850))

### Fixed

- **TTM Q4 derivation no longer produces wrong/negative values for discrete-quarter reporters** — when a concept is reported as discrete quarters with no cumulative 9-month YTD fact (common for BDCs and investment companies), `TTMCalculator` derives Q4 as `FY - (Q1+Q2+Q3)`. It previously selected the three input quarters by their `fiscal_period` label, but the SEC tags comparative facts in re-filings with the *filing's* fiscal period, so the same calendar quarter could appear labeled Q1, Q2 and Q3 across successive 10-Qs — producing a wrong, often negative Q4 (e.g. GAIN `InvestmentCompanyDividendDistribution`: `57.2M - 3×28.8M = -29.2M`). Quarters are now selected by distinct calendar period (dedup by `period_end`, latest periodic filing wins), and derivation is skipped when a discrete Q4 is already reported. This affects `quarterize()`, TTM calculations, and quarterly statement views. ([#848](https://github.com/dgunning/edgartools/issues/848))
Expand Down
35 changes: 35 additions & 0 deletions edgar/xbrl/facts.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,36 @@ def _deduplicate_facts(df: pd.DataFrame) -> pd.DataFrame:
return df


def _iso4217_code(measure: Optional[str]) -> Optional[str]:
"""Return the ISO 4217 code of an ``iso4217:`` unit measure, else ``None``."""
if measure and measure.startswith('iso4217:'):
return measure[len('iso4217:'):]
return None


def _unit_currency(unit_info: Optional[Dict[str, Any]]) -> Optional[str]:
"""Resolve a parsed XBRL unit to its ISO 4217 currency code, or ``None``.

Currency facts use a simple ``iso4217:`` measure (e.g. ``iso4217:HKD`` ->
``HKD``). Per-share monetary facts use a ``divide`` unit whose numerator is
the currency (e.g. ``iso4217:USD`` per ``xbrli:shares``), so the numerator
currency is returned. Non-monetary units (shares, pure, ...) return ``None``.
The opaque ``unit_ref`` id itself (e.g. ``UNIT_STANDARD_HKD_...``) is never
parsed -- only the resolved measure is used (see issue #850).
"""
if not unit_info:
return None
unit_type = unit_info.get('type')
if unit_type == 'simple':
return _iso4217_code(unit_info.get('measure'))
if unit_type == 'divide':
for measure in unit_info.get('numerator', []):
code = _iso4217_code(measure)
if code:
return code
return None


class FactQuery:
"""
A query builder for XBRL facts that enables filtering by various attributes.
Expand Down Expand Up @@ -1005,6 +1035,10 @@ def get_facts(self) -> List[Dict[str, Any]]:
# Build enriched facts from raw facts, contexts, and elements
enriched_facts = []

# Resolve each fact's opaque unit_ref to its ISO 4217 currency once
# (e.g. "UNIT_STANDARD_HKD_..." -> "HKD"); see issue #850.
units = self.xbrl.units

for fact_key, fact in self.xbrl._facts.items():
# Create a dict with only necessary fields instead of full model_dump
fact_dict = {
Expand All @@ -1014,6 +1048,7 @@ def get_facts(self) -> List[Dict[str, Any]]:
'context_ref': fact.context_ref,
'value': fact.value,
'unit_ref': fact.unit_ref,
'currency': _unit_currency(units.get(fact.unit_ref)) if fact.unit_ref else None,
'decimals': fact.decimals,
'numeric_value': fact.numeric_value
}
Expand Down
87 changes: 87 additions & 0 deletions tests/issues/regression/test_issue_850.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""
Regression test for Issue #850: Normalize currency unit identifiers to ISO 4217.

Problem: ``xbrl().facts.to_dataframe()`` exposed the raw XBRL ``unit_ref`` id for
every fact. For non-USD filers the unit id is an opaque token such as
``UNIT_STANDARD_HKD_MNUSOXGRF0O9R60JINVDUQ`` instead of a usable currency, which
made currency-based filtering and display unreliable for foreign companies.

Fix: the facts DataFrame now carries a ``currency`` column that resolves each
fact's unit to its ISO 4217 code (e.g. ``HKD``) via the parsed unit measure
(``iso4217:HKD``). The opaque ``unit_ref`` is preserved unchanged; non-monetary
units (shares, pure, custom) resolve to ``None`` rather than a misleading value.

Reporter: warzoo
See: https://github.com/dgunning/edgartools/issues/850
"""

from pathlib import Path

from edgar.xbrl import XBRL

# A minimal XBRL instance whose currency unit uses the opaque id from the issue.
# It holds one duration context (2024-01-01 to 2024-12-31), one unit whose id is
# the opaque token but whose measure is ``iso4217:HKD``, and a single
# ``us-gaap:Revenues`` fact that references that unit.
FOREIGN_INSTANCE = """<?xml version="1.0" encoding="UTF-8"?>
<xbrl xmlns="http://www.xbrl.org/2003/instance"
xmlns:iso4217="http://www.xbrl.org/2003/iso4217"
xmlns:xbrli="http://www.xbrl.org/2003/instance"
xmlns:us-gaap="http://fasb.org/us-gaap/2023">
<context id="c1">
<entity><identifier scheme="http://www.sec.gov/CIK">0001234567</identifier></entity>
<period><startDate>2024-01-01</startDate><endDate>2024-12-31</endDate></period>
</context>
<unit id="UNIT_STANDARD_HKD_MNUSOXGRF0O9R60JINVDUQ">
<measure>iso4217:HKD</measure>
</unit>
<us-gaap:Revenues contextRef="c1"
unitRef="UNIT_STANDARD_HKD_MNUSOXGRF0O9R60JINVDUQ" decimals="-3">1500000</us-gaap:Revenues>
</xbrl>
"""


def test_unit_currency_helper():
    """Check ``_unit_currency`` against parsed units and their expected codes."""
    from edgar.xbrl.facts import _unit_currency

    usd_per_share = {
        "type": "divide",
        "numerator": ["iso4217:USD"],
        "denominator": ["xbrli:shares"],
    }
    # Monetary units: simple currencies, plus per-share units (currency /
    # shares), which report the currency of their numerator.
    monetary_cases = [
        ({"type": "simple", "measure": "iso4217:HKD"}, "HKD"),
        ({"type": "simple", "measure": "iso4217:USD"}, "USD"),
        (usd_per_share, "USD"),
    ]
    for unit, expected in monetary_cases:
        assert _unit_currency(unit) == expected

    # Non-monetary or missing units must give None, never a misleading value.
    non_monetary_units = [
        {"type": "simple", "measure": "shares"},
        {"type": "simple", "measure": "xbrli:pure"},
        None,
    ]
    for unit in non_monetary_units:
        assert _unit_currency(unit) is None


def test_opaque_unit_ref_resolves_to_iso4217_currency(tmp_path):
    """The issue's repro: an opaque HKD unit id resolves to ``HKD`` in ``currency``.

    Writes ``FOREIGN_INSTANCE`` to a file under ``tmp_path``, parses it with
    ``XBRL.from_files`` and checks the single revenue row of the facts
    DataFrame.
    """
    instance_file = tmp_path / "foreign.xml"
    # The XML declaration states UTF-8, so write the file as UTF-8 explicitly
    # instead of relying on the platform's default text encoding.
    instance_file.write_text(FOREIGN_INSTANCE, encoding="utf-8")

    xbrl = XBRL.from_files(instance_file=instance_file)
    df = xbrl.facts.to_dataframe()

    assert "currency" in df.columns
    revenue = df[df["concept"].str.contains("Revenue", case=False, na=False)]
    assert len(revenue) == 1
    # The opaque unit_ref is preserved unchanged ...
    assert revenue["unit_ref"].iloc[0] == "UNIT_STANDARD_HKD_MNUSOXGRF0O9R60JINVDUQ"
    # ... and the currency is now the usable ISO 4217 code.
    assert revenue["currency"].iloc[0] == "HKD"


def test_currency_column_ground_truth_aapl():
    """A real filing (AAPL 10-K) resolves monetary facts to their currency codes.

    Loads the fixture directory ``tests/fixtures/xbrl/aapl/10k_2023`` (path is
    relative to the working directory) and checks the ``currency`` column for
    USD, per-share and share-count facts.
    """
    aapl = XBRL.from_directory(Path("tests/fixtures/xbrl/aapl/10k_2023"))
    df = aapl.facts.to_dataframe()

    assert "currency" in df.columns
    # USD-denominated facts resolve to "USD" (not the raw "usd" unit id) ...
    usd = df[df["currency"] == "USD"]
    assert len(usd) > 100
    assert (usd["unit_ref"] == "usd").any()
    # ... and per-share amounts (unit_ref "usdPerShare") also resolve to USD.
    # Require at least one such fact first: ``.all()`` on an empty selection is
    # True, which would let this check pass without testing anything.
    per_share = df.loc[df["unit_ref"] == "usdPerShare", "currency"]
    assert len(per_share) > 0
    assert (per_share == "USD").all()
    # Silence check: share-count facts are not a currency, so currency is None.
    shares = df[df["unit_ref"] == "shares"]
    assert len(shares) > 0
    assert shares["currency"].isna().all()