Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 140 additions & 10 deletions lib/gear/spatialhandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,7 @@ class CosMxHandler(SpatialHandler):
* `<dataset_id>_`'exprMat_file.csv'`: Counts matrix.
* `<dataset_id>_`'metadata_file.csv'`: Metadata file.
* `<dataset_id>_`'fov_positions_file.csv'`: Field of view file.
* (Optional) `<dataset_id>_`'tx_file.csv'`: Transcripts file
* 'CellComposite': Directory containing the images.
* 'CellLabels': Directory containing the labels.
"""
Expand All @@ -663,17 +664,17 @@ def has_images(self) -> bool:
@property
def coordinate_system(self) -> str:
    """Returns the coordinate system used by CosMx datasets."""
    # Single return: the duplicated pre-change return line was unreachable noise.
    return "global"  # may also be "spatial"

@property
def region_id(self) -> str:
    """Returns the region ID used for spot data."""
    # "cell_ID" is the per-cell identifier column of the CosMx metadata file.
    # The stale "instance_id" return that preceded it shadowed this value.
    return "cell_ID"

@property
def region_name(self) -> str:
    """Returns the name of the region used for spot data."""
    # The stale "locations" return that preceded this one shadowed the value.
    # NOTE(review): presumably matches the region key written by the CosMx reader - confirm.
    return "fov_labels"

@property
def platform(self) -> str:
Expand All @@ -687,11 +688,124 @@ def img_name(self) -> str | None:

def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
    """
    Reads and processes a CosMx spatial data tarball from the given filepath.

    Extracts the archive into a scratch directory under standardized names,
    loads it into a SpatialData object, annotates the genes with Ensembl IDs
    and stores the result on the handler.

    Args:
        filepath: Path to a ``.tar`` or ``.tar.gz`` archive.
        **kwargs:
            extract_dir: Base scratch directory (default ``/tmp/``); files are
                written to ``<extract_dir>/files``.
            organism_id: Organism ID used for the Ensembl ID mapping.
            dataset_id: Used to look up the organism ID from the dataset
                metadata when ``organism_id`` is not given.

    Returns:
        This handler, with ``sdata`` and ``originalFile`` set.

    Raises:
        Exception: If the file is not a tarball, a member cannot be extracted
            or would land outside the extraction directory, the organism ID
            cannot be determined, or ``clusters.csv`` is missing.
    """
    # Function-scope imports: only this method needs them.
    import gzip
    import shutil

    extract_dir = os.path.join(kwargs.get("extract_dir", '/tmp/'), 'files')

    if filepath.endswith(".tar.gz"):
        mode = "r:gz"  # Read as gzipped tar file
    elif filepath.endswith(".tar"):
        mode = "r"  # Read as plain tar file
    else:
        raise Exception("File must be a .tar or .tar.gz file.")

    _remove_dir(extract_dir)
    os.makedirs(extract_dir, exist_ok=True)
    extract_root = os.path.realpath(extract_dir)

    # Flat files the reader looks up as "<dataset_id>_<suffix>". The transcripts
    # file is included so that it is found when transcripts=True is passed below.
    flat_suffixes = ("exprMat_file.csv", "fov_positions_file.csv", "metadata_file.csv", "tx_file.csv")
    image_dirs = ("CellComposite", "CellLabels")
    transcripts_present = False

    with tarfile.open(filepath, mode) as tf:
        for entry in tf:
            parts = [p for p in entry.name.split("/") if p not in ("", ".")]
            if not parts:
                continue

            # Skip any BSD tar artifacts, like files that start with ._ or .DS_Store
            if any(p == ".DS_Store" or p.startswith("._") for p in parts):
                continue

            # Links and special files have no place in a dataset upload and
            # could point outside of the extraction directory.
            if not (entry.isfile() or entry.isdir()):
                continue

            name = "/".join(parts)
            is_gzipped = entry.isfile() and name.endswith(".gz")
            if is_gzipped:
                name = name[:-3]  # Name of the decompressed file

            name_parts = name.split("/")
            image_hit = next(
                (
                    (idx, dirname)
                    for idx, part in enumerate(name_parts)
                    for dirname in image_dirs
                    if part == dirname or part.endswith("_" + dirname)
                ),
                None,
            )
            if image_hit is not None:
                # Strip leading folders and any "<dataset_id>_" prefix from the
                # directory name only; file names inside are left untouched.
                idx, dirname = image_hit
                name = "/".join([dirname] + name_parts[idx + 1:])
            else:
                for suffix in flat_suffixes:
                    if name.endswith(suffix):
                        # Replace the dataset_id prefix (which may itself contain
                        # underscores) with "spatialdata" to standardize downstream usage
                        name = "spatialdata_" + suffix
                        if suffix == "tx_file.csv":
                            transcripts_present = True
                        break

            # Guard against path traversal ("../") in the uploaded archive
            target = os.path.realpath(os.path.join(extract_dir, name))
            if target != extract_root and not target.startswith(extract_root + os.sep):
                raise Exception(f"Refusing to extract '{entry.name}' outside of the extraction directory.")

            if entry.isdir():
                os.makedirs(target, exist_ok=True)
                continue

            member_io = tf.extractfile(entry)
            if member_io is None:
                raise Exception(f"Error occurred while extracting file: {entry.name}")

            os.makedirs(os.path.dirname(target), exist_ok=True)
            with member_io as raw, open(target, "wb") as out_f:
                # Gzipped members are decompressed while streaming to disk
                src = gzip.GzipFile(fileobj=raw) if is_gzipped else raw
                shutil.copyfileobj(src, out_f)

    # Try to get organism id directly or through dataset metadata
    organism_id = kwargs.get("organism_id", None)
    if organism_id is None and "dataset_id" in kwargs:
        from geardb import get_dataset_by_id
        dataset = get_dataset_by_id(kwargs.get("dataset_id"))  # assumes the metadata is already present
        if dataset:
            organism_id = dataset.organism_id
    if organism_id is None:
        raise Exception("Organism ID not found in dataset metadata or provided as an argument.")

    # In the metadata file, rename the "cell_id" column if it exists, as it is
    # redundant with the "cell_ID" column. The file carries the standardized
    # "spatialdata_" prefix at this point.
    metadata_csv_path = os.path.join(extract_dir, "spatialdata_metadata_file.csv")
    if os.path.exists(metadata_csv_path):
        metadata_df = pd.read_csv(metadata_csv_path)
        if "cell_id" in metadata_df.columns:
            metadata_df = metadata_df.rename(columns={"cell_id": "orig_cell_id"})
            metadata_df.to_csv(metadata_csv_path, index=False)

    # If clustering file does not exist, raise an exception
    # TODO: The clusters are not attached to the table yet. The barcode needs to be
    #       a combination of the following values <cell_ID>_<fov>. Once implemented,
    #       also validate the "Barcode" and "Cluster" columns and reject files in
    #       which all cluster values are missing.
    clustering_csv_path = os.path.join(extract_dir, "clusters.csv")
    if not os.path.exists(clustering_csv_path):
        raise Exception("clusters.csv file not found in tarball.")

    sdata = sdio.cosmx(
        extract_dir,
        dataset_id="spatialdata",  # Provide a name to standardize downstream usage
        transcripts=transcripts_present,
    )

    table_name = self.NORMALIZED_TABLE_NAME
    sdata.tables[table_name].var_names_make_unique()

    # currently gene symbols are the index, need to move them to a column
    sdata.tables[table_name].var["gene_symbol"] = sdata.tables[table_name].var.index

    # Add Ensembl IDs to the adata.var
    sdata.tables[table_name] = update_adata_with_ensembl_ids(sdata.tables[table_name], organism_id, "UNMAPPED_")

    # Rename the "CenterX_global_px" column to "spatial1" and the "CenterY_global_px" column to "spatial2" in the observation table
    sdata.tables[table_name].obs = sdata.tables[table_name].obs.rename(
        columns={"CenterX_global_px": "spatial1", "CenterY_global_px": "spatial2"}
    )

    self.sdata = sdata
    self.standardize_sdata()
    # Record the uploaded tarball itself (the parameter is no longer overwritten
    # by the extraction loop).
    self.originalFile = filepath
    return self

class CurioHandler(SpatialHandler):
"""
Expand Down Expand Up @@ -810,7 +924,10 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
var_features_moransi.to_csv(spatial_moransi_file, sep="\t", header=True, index=True, index_label=False)

# Now are ready to read in to a SpatialData object
sdata = sdio.curio(extract_dir)
try:
sdata = sdio.curio(extract_dir)
except Exception:
raise

# To get the adata equivalent, look at sdata.tables["table"]

Expand Down Expand Up @@ -1073,7 +1190,10 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
if "Barcode" not in first_line or "Cluster" not in first_line:
raise Exception("clusters.csv file does not have 'Barcode' and 'Cluster' columns in clusters.csv file in tarball.")

sdata = sdio.visium(path=extract_dir, dataset_id="spatialdata") # Provide a name to standarize downstream usage
try:
sdata = sdio.visium(path=extract_dir, dataset_id="spatialdata") # Provide a name to standarize downstream usage
except Exception:
raise

# add clustering information to the vis_sdata.table.obs dataframe
clustering = pd.read_csv(clustering_csv_path)
Expand Down Expand Up @@ -1151,6 +1271,10 @@ def img_name(self) -> str | None:
return "spatialdata_hires_image"

def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
"""
Reads and processes a Visium HD spatial data tarball from the given filepath.
Extracts required files, loads clustering and spatial data, updates gene IDs, and loads into a SpatialData object.
"""
extract_dir = kwargs.get("extract_dir", '/tmp/')
extract_dir = os.path.join(extract_dir, 'files')

Expand Down Expand Up @@ -1203,14 +1327,17 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
if not os.path.exists("{}/spatialdata_feature_slice.h5".format(binned_outputs_dir)):
os.symlink("{}/feature_slice.h5".format(absolute_path), "{}/spatialdata_feature_slice.h5".format(binned_outputs_dir))

sdata = sdio.visium_hd(binned_outputs_dir
try:
sdata = sdio.visium_hd(binned_outputs_dir
, dataset_id="spatialdata" # Provide a name to standarize downstream usage
, bin_size=8
, filtered_counts_file=True
, load_all_images=False # CytAssist image is not helpful for us.
, fullres_image_file=None
, bins_as_squares=True
)
except Exception:
raise

# add clustering information to the vis_sdata.table.obs dataframe
clustering = pd.read_csv(clustering_csv_path)
Expand Down Expand Up @@ -1340,7 +1467,8 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
if "Barcode" not in first_line or "Cluster" not in first_line:
raise Exception("clusters.csv file does not have 'Barcode' and 'Cluster' columns in clusters.csv file in tarball.")

sdata = sdio.xenium(extract_dir
try:
sdata = sdio.xenium(extract_dir
, cells_labels=False # Avoid adding polygons to SpatialData object (for now due to out-of-memory issues)
, nucleus_labels=False
, cell_boundaries=cell_boundaries_present
Expand All @@ -1349,6 +1477,8 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
, cells_as_circles=True # Table is associated with the cells instead of the nuclei (faster performance)
, morphology_mip=False # Using the morphology_focus image instead
)
except Exception:
raise

# In code, it seems that the Xenium reader is supposed to set the index to the "barcodes" column
# But this column is not found, so we need to manually replace with "cell_id"
Expand Down