From 5fcfc2d26feb14f77735ff807577b40153f9c97b Mon Sep 17 00:00:00 2001 From: Jessica Krick Date: Mon, 8 Dec 2025 21:02:46 -0500 Subject: [PATCH 1/5] totally floundering with TAP --- tutorials/cosmodc2/cosmoDC2_TAP_access.md | 183 +++++++++++++++++++++- 1 file changed, 175 insertions(+), 8 deletions(-) diff --git a/tutorials/cosmodc2/cosmoDC2_TAP_access.md b/tutorials/cosmodc2/cosmoDC2_TAP_access.md index a4fcd851..624c7260 100644 --- a/tutorials/cosmodc2/cosmoDC2_TAP_access.md +++ b/tutorials/cosmodc2/cosmoDC2_TAP_access.md @@ -5,7 +5,7 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.16.2 + jupytext_version: 1.18.1 kernelspec: display_name: Python 3 (ipykernel) language: python @@ -14,14 +14,19 @@ execution: timeout: 2600 --- - - # Querying CosmoDC2 Mock v1 catalogs This tutorial demonstrates how to access the CosmoDC2 Mock V1 catalogs. More information about these catalogs can be found here: https://irsa.ipac.caltech.edu/Missions/cosmodc2.html These catalogs can be accessed through IRSA's Virtual Ovservatory Table Access Protocol (TAP) service. See https://www.ivoa.net/documents/TAP/ for details on the protocol. This service can be accessed through Python using the PyVO library. +Tips: +- This catalog is spatially indexed so searching on position will be the fastest searches. +- If searching on position, make sure to search in the area that cosmoDC3 covers. + - CosmoDC2 covers an area of roughly: RA ≈ 0° to 60°, Dec ≈ –45° to 0°, but not exactly +- run queries in a way that can be cancelled. It is very easy to overload the TAP server + with multiple queries and not be able to cancel them if you just do a run_sync. + ```{code-cell} ipython3 # Uncomment the next line to install dependencies if needed. 
# !pip install numpy matplotlib pyvo @@ -64,6 +69,133 @@ If you are new to the DC2 catalog, we recommend that you start with ``cosmodc2mo tablename = 'cosmodc2mockv1_heavy' ``` +# testing + +```{code-cell} ipython3 +# Test the TAP service health +test_adql = f"SELECT TOP 1 redshift FROM {tablename}" +job = service.submit_job(test_adql) +job.run() +results = job.fetch_result() +print(len(results)) +``` + +```{code-cell} ipython3 +job.phase +``` + +```{code-cell} ipython3 +import time + +def wait_for_job(job, timeout=60, poll_interval=5.0): + """ + Wait until a TAP job reaches COMPLETED or ERROR. + If it is still EXECUTING after timeout seconds, cancel it. + + Parameters + ---------- + job : pyvo.dal.AsyncTAPJob + The TAP job object from service.submit_job(). + timeout : int or float + Max time (seconds) to wait before auto-cancelling. + poll_interval : float + Seconds between job.phase checks. + + Returns + ------- + bool + True if COMPLETED; False if ERROR, ABORTED, or timeout-cancelled. + """ + start = time.time() + + while True: + phase = job.phase + print("Phase:", phase) + + # Normal success + if phase == "COMPLETED": + return True + + # Hard failure + if phase in ("ERROR", "ABORTED", "UNKNOWN"): + return False + + # Timeout reached — cancel it + if time.time() - start > timeout: + print(f"Timeout reached ({timeout} s). 
Aborting TAP job...") + try: + job.abort() + except Exception as e: + print("Abort raised exception:", e) + return False + + time.sleep(poll_interval) +``` + +```{code-cell} ipython3 +test_small = "SELECT TOP 1 * FROM tap_schema.tables" + +t0 = time.time() +job = service.submit_job(test_small) +print("Submit time:", time.time() - t0) +``` + +```{code-cell} ipython3 +import time + +# ------- Test A: (baseline speed) ------- + +test_adql_A = f"SELECT TOP 1 redshift FROM {tablename}" +t0 = time.time() +jobA = service.submit_job(test_adql_A) +t_submit = time.time() +print("submit job took", t_submit - t0, "s") + +jobA.run() +if wait_for_job(jobA, timeout=60): + results = jobA.fetch_result() + print("Rows:", len(results)) +else: + print("Job did NOT finish — cancelled or failed.") + +tA = time.time() - t_submit + +print("No-WHERE TOP1 query time:", tA, "seconds") +print("Rows returned:", len(resA)) +``` + +```{code-cell} ipython3 +# ------- Test B: RA/Dec CONTAINS query (index test) ------- +test_adql_B = f""" +SELECT TOP 1 redshift +FROM {tablename} +WHERE CONTAINS( + POINT('ICRS', RAMean, DecMean), + CIRCLE('ICRS', 30.0, -36.0, 0.2) +) = 1 +""" + +jobB = service.submit_job(test_adql_B) + +t0 = time.time() +jobB.run() + +if wait_for_job(jobB, timeout=20): + results = jobB.fetch_result() + print("Rows:", len(results)) +else: + print("Job did NOT finish — cancelled or failed.") + +tB = time.time() - t0 + +print("Cone-search TOP1 query time:", tB, "seconds") +print("Rows returned:", len(resB)) +``` + +```{code-cell} ipython3 +print(jobB.phase) +``` + ## How many rows are in the chosen table? With TAP, you can query catalogs with constraints specified in IVOA Astronomical Data Query Language (ADQL; https://www.ivoa.net/documents/latest/ADQL.html), which is based on SQL. @@ -118,9 +250,9 @@ for col in columns: print(f'{f"{col.name}":30s} {col.description}') ``` -## Create a histogram of redshifts +## Spatial Search -Let's figure out what redshift range these galaxies cover. 
Since we found out above that it's a large catalog, we can start with a spatial search over a small area of 0.1 deg. The ADQL that is needed for the spatial constraint is: +Since we found out above that cosmoDC2 a large catalog, we can start with a spatial search over a small area of 0.1 deg. The ADQL that is needed for the spatial constraint is shown below. We then show how to make a redshift histogram of the sample generated. ```{code-cell} ipython3 adql = f"SELECT redshift FROM {tablename} WHERE CONTAINS(POINT('ICRS', RAMean, DecMean), CIRCLE('ICRS',54.218205903,-37.497959343,.1))=1" @@ -152,16 +284,50 @@ We can easily see form this plot that the simulated galaxies go out to z = 3. +++ -## Visualize galaxy colors at z ~ 0.5 +## Visualize galaxy colors at z ~ 2.0 Now let's visualize the galaxy main sequence at z = 2.0. First, we'll do a narrow redshift cut with no spatial constraint. Let's do it as an asynchronous search since this might take awhile, too. +```{code-cell} ipython3 +adql = f"SELECT TOP 5 redshift FROM {tablename}" +results = service.run_sync(adql) +``` + ```{code-cell} ipython3 service = vo.dal.TAPService("https://irsa.ipac.caltech.edu/TAP") -adql = f"SELECT Mag_true_r_sdss_z0, Mag_true_g_sdss_z0, redshift FROM {tablename} WHERE redshift > 0.5 and redshift < 0.54" -results = service.run_async(adql) +result = service.run_sync("SELECT TOP 1 * FROM tap_schema.tables") +``` + +```{code-cell} ipython3 +result = service.run_sync("SELECT TOP 5 redshift FROM cosmodc2mockv1_heavy") +``` + +```{code-cell} ipython3 +service = vo.dal.TAPService("https://irsa.ipac.caltech.edu/TAP") +jobs = service.get_jobs() +jobs +``` + +```{code-cell} ipython3 +service = vo.dal.TAPService("https://irsa.ipac.caltech.edu/TAP") + +#setup the query +adql = f""" +SELECT TOP 50000 + Mag_true_r_sdss_z0, + Mag_true_g_sdss_z0, + redshift +FROM {tablename} +WHERE CONTAINS( + POINT('ICRS', RAMean, DecMean), + CIRCLE('ICRS', 0.5, -43.0, 0.2) + ) = 1 + +""" +#run the query +results = 
service.run_sync(adql) ``` ```{code-cell} ipython3 @@ -185,6 +351,7 @@ plt.ylabel('SDSS rest-frame g-r color') # Show the plot. plt.show() ``` + *** ## About this notebook From dac5916fbc41c77ca2a03637b8ba6cc050c054de Mon Sep 17 00:00:00 2001 From: Jessica Krick Date: Tue, 9 Dec 2025 18:01:11 -0500 Subject: [PATCH 2/5] still TAP errors but text updated --- tutorials/cosmodc2/cosmoDC2_TAP_access.md | 228 ++++++++++++---------- 1 file changed, 120 insertions(+), 108 deletions(-) diff --git a/tutorials/cosmodc2/cosmoDC2_TAP_access.md b/tutorials/cosmodc2/cosmoDC2_TAP_access.md index 624c7260..8a8cf49b 100644 --- a/tutorials/cosmodc2/cosmoDC2_TAP_access.md +++ b/tutorials/cosmodc2/cosmoDC2_TAP_access.md @@ -14,18 +14,31 @@ execution: timeout: 2600 --- -# Querying CosmoDC2 Mock v1 catalogs +# Querying the CosmoDC2 Mock v1 Catalogs -This tutorial demonstrates how to access the CosmoDC2 Mock V1 catalogs. More information about these catalogs can be found here: https://irsa.ipac.caltech.edu/Missions/cosmodc2.html +This tutorial demonstrates how to access and query the **CosmoDC2 Mock v1** catalogs using IRSA’s Table Access Protocol (TAP) service. Background information on the catalogs is available on the [IRSA CosmoDC2 page](https://irsa.ipac.caltech.edu/Missions/cosmodc2.html). -These catalogs can be accessed through IRSA's Virtual Ovservatory Table Access Protocol (TAP) service. See https://www.ivoa.net/documents/TAP/ for details on the protocol. This service can be accessed through Python using the PyVO library. +The catalogs are served through IRSA’s Virtual Observatory–standard **TAP** interface (see the [IVOA TAP specification](https://www.ivoa.net/documents/TAP/)), which you can access programmatically in Python via the **PyVO** library. TAP queries are written in the **Astronomical Data Query Language (ADQL)** — a SQL-like language designed for astronomical catalogs (see the [ADQL specification](https://www.ivoa.net/documents/latest/ADQL.html)). 
-Tips: -- This catalog is spatially indexed so searching on position will be the fastest searches. -- If searching on position, make sure to search in the area that cosmoDC3 covers. - - CosmoDC2 covers an area of roughly: RA ≈ 0° to 60°, Dec ≈ –45° to 0°, but not exactly -- run queries in a way that can be cancelled. It is very easy to overload the TAP server - with multiple queries and not be able to cancel them if you just do a run_sync. +If you are new to PyVO’s query modes, the documentation provides a helpful comparison between **synchronous** and **asynchronous** execution: [PyVO: Synchronous vs. Asynchronous Queries](https://pyvo.readthedocs.io/en/latest/dal/index.html#synchronous-vs-asynchronous-query) + + +## Tips for Working with CosmoDC2 via TAP + +- **Use indexed columns for fast queries.** + CosmoDC2 is indexed on the following fields: + `ra`, `dec`, `redshift`, `mag*_lsst`, `halo_mass`, `stellar_mass` + Queries involving these columns generally return much faster. + +- **Ensure your positional queries fall within the survey footprint.** + CosmoDC2 roughly covers: + - **RA:** 0° → 60° + - **Dec:** –45° → 0° + (Coverage is not perfectly rectangular, so some edges may be sparse.) + +- **Avoid overloading the TAP service.** + Prefer **asynchronous** queries (`submit_job` / `run`) so they can be monitored or cancelled. + Using `run_sync()` makes it easy to fire off queries that cannot be interrupted and may continue running on the server long after your client session stops. ```{code-cell} ipython3 # Uncomment the next line to install dependencies if needed. @@ -34,6 +47,10 @@ Tips: ```{code-cell} ipython3 import pyvo as vo +import numpy as np +import matplotlib.mlab as mlab +import matplotlib.pyplot as plt +import time ``` ```{code-cell} ipython3 @@ -123,23 +140,12 @@ def wait_for_job(job, timeout=60, poll_interval=5.0): # Timeout reached — cancel it if time.time() - start > timeout: print(f"Timeout reached ({timeout} s). 
Aborting TAP job...") - try: - job.abort() - except Exception as e: - print("Abort raised exception:", e) + job.abort() return False time.sleep(poll_interval) ``` -```{code-cell} ipython3 -test_small = "SELECT TOP 1 * FROM tap_schema.tables" - -t0 = time.time() -job = service.submit_job(test_small) -print("Submit time:", time.time() - t0) -``` - ```{code-cell} ipython3 import time @@ -161,7 +167,15 @@ else: tA = time.time() - t_submit print("No-WHERE TOP1 query time:", tA, "seconds") -print("Rows returned:", len(resA)) +``` + +```{code-cell} ipython3 +test_adql_A = f"SELECT TOP 1 redshift FROM {tablename}" +results = service.run_async(test_adql_A) +``` + +```{code-cell} ipython3 +results ``` ```{code-cell} ipython3 @@ -170,8 +184,8 @@ test_adql_B = f""" SELECT TOP 1 redshift FROM {tablename} WHERE CONTAINS( - POINT('ICRS', RAMean, DecMean), - CIRCLE('ICRS', 30.0, -36.0, 0.2) + POINT('ICRS', ra, dec), + CIRCLE('ICRS', 54.2, -37.4, 0.05) ) = 1 """ @@ -180,7 +194,7 @@ jobB = service.submit_job(test_adql_B) t0 = time.time() jobB.run() -if wait_for_job(jobB, timeout=20): +if wait_for_job(jobB, timeout=150): results = jobB.fetch_result() print("Rows:", len(results)) else: @@ -189,22 +203,10 @@ else: tB = time.time() - t0 print("Cone-search TOP1 query time:", tB, "seconds") -print("Rows returned:", len(resB)) ``` ```{code-cell} ipython3 -print(jobB.phase) -``` - -## How many rows are in the chosen table? - -With TAP, you can query catalogs with constraints specified in IVOA Astronomical Data Query Language (ADQL; https://www.ivoa.net/documents/latest/ADQL.html), which is based on SQL. - -```{code-cell} ipython3 -# For example, this snippet of ADQL counts the number of elements in -# the redshift column of the table we chose. 
-adql = f"SELECT count(redshift) FROM {tablename}" -adql +results ``` In order to use TAP with this ADQL string using pyvo, you can do the following: @@ -250,114 +252,124 @@ for col in columns: print(f'{f"{col.name}":30s} {col.description}') ``` -## Spatial Search +## Get a list of galaxies within a small area -Since we found out above that cosmoDC2 a large catalog, we can start with a spatial search over a small area of 0.1 deg. The ADQL that is needed for the spatial constraint is shown below. We then show how to make a redshift histogram of the sample generated. +Since we know that cosmoDC2 is a large catalog, we can start with a spatial search over a small square area. The ADQL that is needed for the spatial constraint is shown below. We then show how to make a redshift histogram of the sample generated. ```{code-cell} ipython3 -adql = f"SELECT redshift FROM {tablename} WHERE CONTAINS(POINT('ICRS', RAMean, DecMean), CIRCLE('ICRS',54.218205903,-37.497959343,.1))=1" -adql +# Setup the query +adql = f""" +SELECT TOP 100 redshift +FROM {tablename} +WHERE CONTAINS( + POINT('ICRS', ra, dec), + CIRCLE('ICRS', 54.2, -37.4, 0.05) +) = 1 +""" +job = service.submit_job(adql) +job.run() +if wait_for_job(job, timeout=100): + spatial_results = job.fetch_result() + print("Rows:", len(results)) +else: + print("Job did NOT finish — cancelled or failed.") ``` -Now we can use the previously-defined service to execute the query with the spatial contraint. 
- ```{code-cell} ipython3 -cone_results = service.run_sync(adql) +spatial_results ``` ```{code-cell} ipython3 -# Plot a histogram -import numpy as np -import matplotlib.mlab as mlab -import matplotlib.pyplot as plt - -num_bins = 20 -# the histogram of the data -n, bins, patches = plt.hist(cone_results['redshift'], num_bins, - facecolor='blue', alpha = 0.5) -plt.xlabel('Redshift') -plt.ylabel('Number') -plt.title('Redshift Histogram CosmoDC2 Mock Catalog V1 abridged') +if spatial_results: + ]# Plot a histogram + num_bins = 20 + # the histogram of the data + n, bins, patches = plt.hist(spatial_results['redshift'], num_bins, + facecolor='blue', alpha = 0.5) + plt.xlabel('Redshift') + plt.ylabel('Number') + plt.title('Redshift Histogram CosmoDC2 Mock Catalog V1 abridged') ``` -We can easily see form this plot that the simulated galaxies go out to z = 3. +We can see form this plot that the simulated galaxies go out to z = 3. +++ -## Visualize galaxy colors at z ~ 2.0 - -Now let's visualize the galaxy main sequence at z = 2.0. First, we'll do a narrow redshift cut with no spatial constraint. - -Let's do it as an asynchronous search since this might take awhile, too. - -```{code-cell} ipython3 -adql = f"SELECT TOP 5 redshift FROM {tablename}" -results = service.run_sync(adql) -``` - -```{code-cell} ipython3 -service = vo.dal.TAPService("https://irsa.ipac.caltech.edu/TAP") -result = service.run_sync("SELECT TOP 1 * FROM tap_schema.tables") -``` +## Visualize galaxy colors: redshift search -```{code-cell} ipython3 -result = service.run_sync("SELECT TOP 5 redshift FROM cosmodc2mockv1_heavy") -``` - -```{code-cell} ipython3 -service = vo.dal.TAPService("https://irsa.ipac.caltech.edu/TAP") -jobs = service.get_jobs() -jobs -``` +First, we'll do a narrow redshift cut with no spatial constraint. Then, from that redshift sample we will visualize the galaxy main sequence at z = 2.0. 
```{code-cell} ipython3 -service = vo.dal.TAPService("https://irsa.ipac.caltech.edu/TAP") - -#setup the query +# Setup the query adql = f""" SELECT TOP 50000 - Mag_true_r_sdss_z0, - Mag_true_g_sdss_z0, + Mag_r_LSST, + Mag_g_LSST, redshift FROM {tablename} -WHERE CONTAINS( - POINT('ICRS', RAMean, DecMean), - CIRCLE('ICRS', 0.5, -43.0, 0.2) - ) = 1 +WHERE redshift > 1.95 and redshift < 2.05 """ -#run the query -results = service.run_sync(adql) +# Run the query +job = service.submit_job(adql) +job.run() + +#if the job does not finish in a reasonable amount of time, cancel it +if wait_for_job(job, timeout=1000): + redshift_results = job.fetch_result() + print("Rows:", len(results)) +else: + print("Job did NOT finish — cancelled or failed.") ``` ```{code-cell} ipython3 -len(results['mag_true_r_sdss_z0']) +redshift_results ``` ```{code-cell} ipython3 -# Since this results in almost 4 million galaxies, -# we will construct a 2D histogram rather than a scatter plot. -plt.hist2d(results['mag_true_r_sdss_z0'], results['mag_true_g_sdss_z0']-results['mag_true_r_sdss_z0'], - bins=200, cmap='plasma', cmax=500) +if redshift_results: + # Construct a 2D histogram of the galaxy colors + plt.hist2d(redshift_results['mag_r_lsst'], redshift_results['mag_g_lsst']-redshift_results['mag_r_lsst'], + bins=100, cmap='plasma', cmax=500) -# Plot a colorbar with label. -cb = plt.colorbar() -cb.set_label('Number') + # Plot a colorbar with label. + cb = plt.colorbar() + cb.set_label('Number') -# Add title and labels to plot. -plt.xlabel('SDSS Mag r') -plt.ylabel('SDSS rest-frame g-r color') + # Add title and labels to plot. + plt.xlabel('LSST Mag r') + plt.ylabel('LSST rest-frame g-r color') -# Show the plot. -plt.show() + # Show the plot. 
+ plt.show() ``` ++++ {"jp-MarkdownHeadingCollapsed": true} + +## Suggestions for further queries: +TAP queries are extremely powerful and provide flexible ways to explore large catalogs like CosmoDC2, including spatial searches, photometric selections, cross-matching, and more. However, many valid ADQL queries can take minutes or longer to complete due to the size of the catalog, so we avoid running those directly in this tutorial. Instead, the examples here have so far focused on fast, lightweight queries that illustrate the key concepts without long wait times. If you are interested in exploring further, here are some additional query ideas that are scientifically useful but may take longer to run depending on server conditions. + +### How many redshifts are in the chosen table? +`adql = f"SELECT count(redshift) FROM {tablename}" #answer: 597,488,849 redshifts` + +### Retrieve only a subset of columns (recommended for speed) +This use of "TOP 5000" just limits the number of rows returned. Remove it if you want all rows + +`adql = f"SELECT TOP 5000 ra, dec, redshift, stellar_mass FROM {tablename}"` + +### Cone search around a specific position +This search is slower than the spatial search above because it uses "contains" which does not take advantage of position indexing. + +`adql = f""" SELECT TOP 50000 redshift FROM {tablename} WHERE CONTAINS(POINT('ICRS', RAMean, DecMean), CIRCLE('ICRS',54.2, -37.5,.1))=1` + ++++ + *** ## About this notebook -**Author:** Vandana Desai (IRSA Science Lead) +**Author:** IRSA Data Science Team, including Vandana Desai, Jessica Krick, Troy Raen, Brigitta Sipőcz, Andreas Faisst, Jaladh Singhal -**Updated:** 2024-07-24 +**Updated:** December 2025 **Contact:** [the IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or reporting problems. 
From a8839a3da11bc8c6088efb7cdc944db23dbb66ad Mon Sep 17 00:00:00 2001 From: Jessica Krick Date: Fri, 12 Dec 2025 20:41:23 -0500 Subject: [PATCH 3/5] fully functional code, improved outline and text --- tutorials/cosmodc2/cosmoDC2_TAP_access.md | 268 +++++++--------------- 1 file changed, 81 insertions(+), 187 deletions(-) diff --git a/tutorials/cosmodc2/cosmoDC2_TAP_access.md b/tutorials/cosmodc2/cosmoDC2_TAP_access.md index 8a8cf49b..f6147c10 100644 --- a/tutorials/cosmodc2/cosmoDC2_TAP_access.md +++ b/tutorials/cosmodc2/cosmoDC2_TAP_access.md @@ -31,14 +31,14 @@ If you are new to PyVO’s query modes, the documentation provides a helpful com Queries involving these columns generally return much faster. - **Ensure your positional queries fall within the survey footprint.** - CosmoDC2 roughly covers: - - **RA:** 0° → 60° - - **Dec:** –45° → 0° - (Coverage is not perfectly rectangular, so some edges may be sparse.) + CosmoDC2 covers the area specified by the + following (R.A., decl.) coordinate pairs (J2000): + (71.46,−27.25), (52.25,−27.25), + (73.79,−44.33), (49.42,−44.33). - **Avoid overloading the TAP service.** - Prefer **asynchronous** queries (`submit_job` / `run`) so they can be monitored or cancelled. - Using `run_sync()` makes it easy to fire off queries that cannot be interrupted and may continue running on the server long after your client session stops. + Preferentially use **asynchronous** queries for long running queries to avoid timing out. The whole system will slow down if a lot of people are using it for large queries, or if you decide to kick off many large queries at the same time. + ```{code-cell} ipython3 # Uncomment the next line to install dependencies if needed. @@ -57,7 +57,7 @@ import time service = vo.dal.TAPService("https://irsa.ipac.caltech.edu/TAP") ``` -## List the available DC2 tables +## 1. 
List the available DC2 tables ```{code-cell} ipython3 tables = service.tables @@ -67,7 +67,7 @@ for tablename in tables.keys(): tables[tablename].describe() ``` -## Choose the DC2 catalog you want to work with. +## 2. Choose the DC2 catalog you want to work with. IRSA currently offers 3 versions of the DC2 catalog. @@ -86,143 +86,7 @@ If you are new to the DC2 catalog, we recommend that you start with ``cosmodc2mo tablename = 'cosmodc2mockv1_heavy' ``` -# testing - -```{code-cell} ipython3 -# Test the TAP service health -test_adql = f"SELECT TOP 1 redshift FROM {tablename}" -job = service.submit_job(test_adql) -job.run() -results = job.fetch_result() -print(len(results)) -``` - -```{code-cell} ipython3 -job.phase -``` - -```{code-cell} ipython3 -import time - -def wait_for_job(job, timeout=60, poll_interval=5.0): - """ - Wait until a TAP job reaches COMPLETED or ERROR. - If it is still EXECUTING after timeout seconds, cancel it. - - Parameters - ---------- - job : pyvo.dal.AsyncTAPJob - The TAP job object from service.submit_job(). - timeout : int or float - Max time (seconds) to wait before auto-cancelling. - poll_interval : float - Seconds between job.phase checks. - - Returns - ------- - bool - True if COMPLETED; False if ERROR, ABORTED, or timeout-cancelled. - """ - start = time.time() - - while True: - phase = job.phase - print("Phase:", phase) - - # Normal success - if phase == "COMPLETED": - return True - - # Hard failure - if phase in ("ERROR", "ABORTED", "UNKNOWN"): - return False - - # Timeout reached — cancel it - if time.time() - start > timeout: - print(f"Timeout reached ({timeout} s). 
Aborting TAP job...") - job.abort() - return False - - time.sleep(poll_interval) -``` - -```{code-cell} ipython3 -import time - -# ------- Test A: (baseline speed) ------- - -test_adql_A = f"SELECT TOP 1 redshift FROM {tablename}" -t0 = time.time() -jobA = service.submit_job(test_adql_A) -t_submit = time.time() -print("submit job took", t_submit - t0, "s") - -jobA.run() -if wait_for_job(jobA, timeout=60): - results = jobA.fetch_result() - print("Rows:", len(results)) -else: - print("Job did NOT finish — cancelled or failed.") - -tA = time.time() - t_submit - -print("No-WHERE TOP1 query time:", tA, "seconds") -``` - -```{code-cell} ipython3 -test_adql_A = f"SELECT TOP 1 redshift FROM {tablename}" -results = service.run_async(test_adql_A) -``` - -```{code-cell} ipython3 -results -``` - -```{code-cell} ipython3 -# ------- Test B: RA/Dec CONTAINS query (index test) ------- -test_adql_B = f""" -SELECT TOP 1 redshift -FROM {tablename} -WHERE CONTAINS( - POINT('ICRS', ra, dec), - CIRCLE('ICRS', 54.2, -37.4, 0.05) -) = 1 -""" - -jobB = service.submit_job(test_adql_B) - -t0 = time.time() -jobB.run() - -if wait_for_job(jobB, timeout=150): - results = jobB.fetch_result() - print("Rows:", len(results)) -else: - print("Job did NOT finish — cancelled or failed.") - -tB = time.time() - t0 - -print("Cone-search TOP1 query time:", tB, "seconds") -``` - -```{code-cell} ipython3 -results -``` - -In order to use TAP with this ADQL string using pyvo, you can do the following: - -```{code-cell} ipython3 -# Uncomment the next line to run the query. Beware that it can take awhile. -# service.run_async(adql) -``` - -The above query shows that there are 597,488,849 redshifts in this table. -Running ``count`` on an entire table is an expensive operation, therefore we ran it asynchronously to avoid any potential timeout issues. 
-To learn more about synchronous versus asynchronous PyVO queries please read the [relevant PyVO documentation](https://pyvo.readthedocs.io/en/latest/dal/index.html#synchronous-vs-asynchronous-query). - -+++ - -## What is the default maximum number of rows returned by the service? +## 3. What is the default maximum number of rows returned by the service? This service will return a maximum of 2 billion rows by default. @@ -236,7 +100,7 @@ This default maximum can be changed, and there is no hard upper limit to what it print(service.hardlimit) ``` -## List the columns in the chosen table +## 4. List the columns in the chosen table This table contains 301 columns. @@ -252,50 +116,49 @@ for col in columns: print(f'{f"{col.name}":30s} {col.description}') ``` -## Get a list of galaxies within a small area +## 5. Retrieve a list of galaxies within a small area Since we know that cosmoDC2 is a large catalog, we can start with a spatial search over a small circular area. The ADQL that is needed for the spatial constraint is shown below. We then show how to make a redshift histogram of the sample generated. ```{code-cell} ipython3 # Setup the query adql = f""" -SELECT TOP 100 redshift +SELECT redshift FROM {tablename} WHERE CONTAINS( POINT('ICRS', ra, dec), - CIRCLE('ICRS', 54.2, -37.4, 0.05) + CIRCLE('ICRS', 54.0, -37.0, 0.05) ) = 1 """ -job = service.submit_job(adql) -job.run() -if wait_for_job(job, timeout=100): - spatial_results = job.fetch_result() - print("Rows:", len(results)) -else: - print("Job did NOT finish — cancelled or failed.") + +cone_results = service.run_sync(adql) ``` ```{code-cell} ipython3 -spatial_results +#how many redshifts does this return? +print(len(cone_results)) ``` ```{code-cell} ipython3 -if spatial_results: - ]# Plot a histogram +# Now that we have a list of galaxy redshifts in that region, we can +# create a histogram of the redshifts to see what redshifts this survey includes. 
+ +if cone_results: + # Plot a histogram num_bins = 20 # the histogram of the data - n, bins, patches = plt.hist(spatial_results['redshift'], num_bins, + n, bins, patches = plt.hist(cone_results['redshift'], num_bins, facecolor='blue', alpha = 0.5) plt.xlabel('Redshift') plt.ylabel('Number') - plt.title('Redshift Histogram CosmoDC2 Mock Catalog V1 abridged') + plt.title(f'Redshift Histogram {tablename}') ``` We can see from this plot that the simulated galaxies go out to z = 3. +++ -## Visualize galaxy colors: redshift search +## 6. Visualize galaxy colors: redshift search First, we'll do a narrow redshift cut with no spatial constraint. Then, from that redshift sample we will visualize the galaxy main sequence at z = 2.0. @@ -303,23 +166,13 @@ First, we'll do a narrow redshift cut with no spatial constraint. Then, from th # Setup the query adql = f""" SELECT TOP 50000 - Mag_r_LSST, - Mag_g_LSST, + mag_r_lsst, + (mag_g_lsst - mag_r_lsst) AS color, redshift FROM {tablename} -WHERE redshift > 1.95 and redshift < 2.05 - +WHERE redshift BETWEEN 1.95 AND 2.05 """ -# Run the query -job = service.submit_job(adql) -job.run() - -#if the job does not finish in a reasonable amount of time, cancel it -if wait_for_job(job, timeout=1000): - redshift_results = job.fetch_result() - print("Rows:", len(results)) -else: - print("Job did NOT finish — cancelled or failed.") +redshift_results = service.run_sync(adql) ``` ```{code-cell} ipython3 @@ -329,7 +182,7 @@ redshift_results ```{code-cell} ipython3 if redshift_results: # Construct a 2D histogram of the galaxy colors - plt.hist2d(redshift_results['mag_r_lsst'], redshift_results['mag_g_lsst']-redshift_results['mag_r_lsst'], + plt.hist2d(redshift_results['mag_r_lsst'], redshift_results['color'], bins=100, cmap='plasma', cmax=500) # Plot a colorbar with label. @@ -346,26 +199,61 @@ if redshift_results: +++ {"jp-MarkdownHeadingCollapsed": true} -## Suggestions for further queries: +## 7. 
Suggestions for further queries: TAP queries are extremely powerful and provide flexible ways to explore large catalogs like CosmoDC2, including spatial searches, photometric selections, cross-matching, and more. However, many valid ADQL queries can take minutes or longer to complete due to the size of the catalog, so we avoid running those directly in this tutorial. Instead, the examples here have so far focused on fast, lightweight queries that illustrate the key concepts without long wait times. If you are interested in exploring further, here are some additional query ideas that are scientifically useful but may take longer to run depending on server conditions. -### How many redshifts are in the chosen table? -`adql = f"SELECT count(redshift) FROM {tablename}" #answer: 597,488,849 redshifts` +### Count the total number of redshifts in the chosen table +answer for this table= 597,488,849 redshifts +``` +adql = f"SELECT count(redshift) FROM {tablename}" +``` + +### Count galaxies in a sky region (cone search) +Useful for: estimating source density, validating spatial footprint, testing spatial completeness +``` +adql = f""" +SELECT COUNT(*) +FROM {tablename} +WHERE CONTAINS(POINT('ICRS', ra, dec), CIRCLE('ICRS', 54.2, -37.5, 0.2)) = 1 +""" +``` ### Retrieve only a subset of columns (recommended for speed) This use of "TOP 5000" just limits the number of rows returned. Remove it if you want all rows +```sql +adql = f""" +SELECT TOP 5000 + ra, + dec, + redshift, + stellar_mass +FROM {tablename}""" +``` -`adql = f"SELECT TOP 5000 ra, dec, redshift, stellar_mass FROM {tablename}"` - -### Cone search around a specific position -This search is slower than the spatial search above because it uses "contains" which does not take advantage of position indexing. 
+### Explore the stellar–halo mass relation +```sql +adql = f""" +SELECT TOP 500000 + stellar_mass, + halo_mass +FROM {tablename} +WHERE halo_mass > 1e11""" +``` -`adql = f""" SELECT TOP 50000 redshift FROM {tablename} WHERE CONTAINS(POINT('ICRS', RAMean, DecMean), CIRCLE('ICRS',54.2, -37.5,.1))=1` +### Find the brightest galaxies at high redshift +return the results in ascending (ASC) order by r band magnitude +```sql +adql = f""" +SELECT TOP 10000 + ra, dec, redshift, mag_r_lsst +FROM {tablename} +WHERE redshift > 2.5 +ORDER BY mag_r_lsst ASC +""" +``` +++ -*** - ## About this notebook **Author:** IRSA Data Science Team, including Vandana Desai, Jessica Krick, Troy Raen, Brigitta Sipőcz, Andreas Faisst, Jaladh Singhal @@ -373,3 +261,9 @@ This search is slower than the spatial search above because it uses "contains" w **Updated:** December 2025 **Contact:** [the IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or reporting problems. + +**Runtime:** As of the date above, this notebook takes about 10 seconds to run to completion on a machine with 8GB RAM and 2 CPU. Large variations in this runtime can be expected if the TAP server is busy with many queries at once. + +```{code-cell} ipython3 + +``` From 0e214271a9b4bd6c52e5f35587ee4231bf13bafc Mon Sep 17 00:00:00 2001 From: Jessica Krick Date: Tue, 16 Dec 2025 16:26:00 -0500 Subject: [PATCH 4/5] code review comments --- tutorials/cosmodc2/cosmoDC2_TAP_access.md | 51 +++++++++-------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/tutorials/cosmodc2/cosmoDC2_TAP_access.md b/tutorials/cosmodc2/cosmoDC2_TAP_access.md index f6147c10..7fc53266 100644 --- a/tutorials/cosmodc2/cosmoDC2_TAP_access.md +++ b/tutorials/cosmodc2/cosmoDC2_TAP_access.md @@ -18,7 +18,7 @@ execution: This tutorial demonstrates how to access and query the **CosmoDC2 Mock v1** catalogs using IRSA’s Table Access Protocol (TAP) service. 
Background information on the catalogs is available on the [IRSA CosmoDC2 page](https://irsa.ipac.caltech.edu/Missions/cosmodc2.html). -The catalogs are served through IRSA’s Virtual Observatory–standard **TAP** interface (see the [IVOA TAP specification](https://www.ivoa.net/documents/TAP/)), which you can access programmatically in Python via the **PyVO** library. TAP queries are written in the **Astronomical Data Query Language (ADQL)** — a SQL-like language designed for astronomical catalogs (see the [ADQL specification](https://www.ivoa.net/documents/latest/ADQL.html)). +The catalogs are served through IRSA’s Virtual Observatory–standard **TAP** [interface](https://irsa.ipac.caltech.edu/docs/program_interface/TAP.html), which you can access programmatically in Python via the **PyVO** library. TAP queries are written in the **Astronomical Data Query Language (ADQL)** — a SQL-like language designed for astronomical catalogs (see the [ADQL specification](https://www.ivoa.net/documents/latest/ADQL.html)). If you are new to PyVO’s query modes, the documentation provides a helpful comparison between **synchronous** and **asynchronous** execution: [PyVO: Synchronous vs. Asynchronous Queries](https://pyvo.readthedocs.io/en/latest/dal/index.html#synchronous-vs-asynchronous-query) @@ -39,7 +39,6 @@ If you are new to PyVO’s query modes, the documentation provides a helpful com - **Avoid overloading the TAP service.** Preferentially use **asynchronous** queries for long-running queries to avoid timing out. The whole system will slow down if a lot of people are using it for large queries, or if you decide to kick off many large queries at the same time. - ```{code-cell} ipython3 # Uncomment the next line to install dependencies if needed. 
# !pip install numpy matplotlib pyvo @@ -50,7 +49,6 @@ import pyvo as vo import numpy as np import matplotlib.mlab as mlab import matplotlib.pyplot as plt -import time ``` ```{code-cell} ipython3 @@ -143,15 +141,14 @@ print(len(cone_results)) # Now that we have a list of galaxy redshifts in that region, we can # create a histogram of the redshifts to see what redshifts this survey includes. -if cone_results: - # Plot a histogram - num_bins = 20 - # the histogram of the data - n, bins, patches = plt.hist(cone_results['redshift'], num_bins, - facecolor='blue', alpha = 0.5) - plt.xlabel('Redshift') - plt.ylabel('Number') - plt.title(f'Redshift Histogram {tablename}') +# Plot a histogram +num_bins = 20 +# the histogram of the data +n, bins, patches = plt.hist(cone_results['redshift'], num_bins, + facecolor='blue', alpha = 0.5) +plt.xlabel('Redshift') +plt.ylabel('Number') +plt.title(f'Redshift Histogram {tablename}') ``` We can see from this plot that the simulated galaxies go out to z = 3. @@ -180,25 +177,19 @@ redshift_results ``` ```{code-cell} ipython3 -if redshift_results: - # Construct a 2D histogram of the galaxy colors - plt.hist2d(redshift_results['mag_r_lsst'], redshift_results['color'], - bins=100, cmap='plasma', cmax=500) - - # Plot a colorbar with label. - cb = plt.colorbar() - cb.set_label('Number') +# Construct a 2D histogram of the galaxy colors +plt.hist2d(redshift_results['mag_r_lsst'], redshift_results['color'], + bins=100, cmap='plasma', cmax=500) - # Add title and labels to plot. - plt.xlabel('LSST Mag r') - plt.ylabel('LSST rest-frame g-r color') +# Plot a colorbar with label. +cb = plt.colorbar() +cb.set_label('Number') - # Show the plot. - plt.show() +# Add title and labels to plot. +plt.xlabel('LSST Mag r') +plt.ylabel('LSST rest-frame g-r color') ``` -+++ {"jp-MarkdownHeadingCollapsed": true} - ## 7. 
Suggestions for further queries: TAP queries are extremely powerful and provide flexible ways to explore large catalogs like CosmoDC2, including spatial searches, photometric selections, cross-matching, and more. However, many valid ADQL queries can take minutes or longer to complete due to the size of the catalog, so we avoid running those directly in this tutorial. Instead, the examples here have so far focused on fast, lightweight queries that illustrate the key concepts without long wait times. If you are interested in exploring further, here are some additional query ideas that are scientifically useful but may take longer to run depending on server conditions. @@ -262,8 +253,4 @@ ORDER BY mag_r_lsst ASC **Contact:** [the IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or reporting problems. -**Runtime:** As of the date above, this notebook takes about 10 seconds to run to completion on a machine with 8GB RAM and 2 CPU. Large variations in this runtime can be expected if the TAP server is busy with many queries at once. - -```{code-cell} ipython3 - -``` +**Runtime:** As of the date above, this notebook takes about 2 minutes to run to completion on a machine with 8GB RAM and 2 CPUs. Large variations in this runtime can be expected if the TAP server is busy with many queries at once. From c8599960300a93df7928861bedf75b93d6bcae63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Tue, 16 Dec 2025 15:14:35 -0800 Subject: [PATCH 5/5] Some whitespace and styling fixes --- tutorials/cosmodc2/cosmoDC2_TAP_access.md | 27 +++++++++++++++-------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tutorials/cosmodc2/cosmoDC2_TAP_access.md b/tutorials/cosmodc2/cosmoDC2_TAP_access.md index 7fc53266..ca6c37dd 100644 --- a/tutorials/cosmodc2/cosmoDC2_TAP_access.md +++ b/tutorials/cosmodc2/cosmoDC2_TAP_access.md @@ -191,17 +191,22 @@ plt.ylabel('LSST rest-frame g-r color') ``` ## 7. 
Suggestions for further queries: -TAP queries are extremely powerful and provide flexible ways to explore large catalogs like CosmoDC2, including spatial searches, photometric selections, cross-matching, and more. However, many valid ADQL queries can take minutes or longer to complete due to the size of the catalog, so we avoid running those directly in this tutorial. Instead, the examples here have so far focused on fast, lightweight queries that illustrate the key concepts without long wait times. If you are interested in exploring further, here are some additional query ideas that are scientifically useful but may take longer to run depending on server conditions. +TAP queries are extremely powerful and provide flexible ways to explore large catalogs like CosmoDC2, including spatial searches, photometric selections, cross-matching, and more. +However, many valid ADQL queries can take minutes or longer to complete due to the size of the catalog, so we avoid running those directly in this tutorial. +Instead, the examples here have so far focused on fast, lightweight queries that illustrate the key concepts without long wait times. +If you are interested in exploring further, here are some additional query ideas that are scientifically useful but may take longer to run depending on server conditions. ### Count the total number of redshifts in the chosen table -answer for this table= 597,488,849 redshifts -``` +The answer for the `'cosmodc2mockv1_heavy'` table is 597,488,849 redshifts. + +```sql adql = f"SELECT count(redshift) FROM {tablename}" ``` ### Count galaxies in a sky region (cone search) -Useful for: estimating source density, validating spatial footprint, testing spatial completeness -``` +Generally useful for: estimating source density, validating spatial footprint, testing spatial completeness. 
+ +```sql adql = f""" SELECT COUNT(*) FROM {tablename} @@ -209,8 +214,10 @@ WHERE CONTAINS(POINT('ICRS', ra, dec), CIRCLE('ICRS', 54.2, -37.5, 0.2)) = 1 """ ``` -### Retrieve only a subset of columns (recommended for speed) -This use of "TOP 5000" just limits the number of rows returned. Remove it if you want all rows +### Retrieve only a subset of columns (recommended for speed) and rows +This use of "TOP 5000" just limits the number of rows returned. +Remove it if you want all rows, but keep in mind such a query can take a much longer time. + ```sql adql = f""" SELECT TOP 5000 @@ -222,6 +229,7 @@ FROM {tablename}""" ``` ### Explore the stellar–halo mass relation + ```sql adql = f""" SELECT TOP 500000 @@ -232,7 +240,8 @@ WHERE halo_mass > 1e11""" ``` ### Find the brightest galaxies at high redshift -return the results in ascending (ASC) order by r band magnitude +Return the results in ascending (ASC) order of r-band magnitude, i.e., brightest first (smaller magnitudes are brighter). + ```sql adql = f""" SELECT TOP 10000 @@ -249,7 +258,7 @@ ORDER BY mag_r_lsst ASC **Author:** IRSA Data Science Team, including Vandana Desai, Jessica Krick, Troy Raen, Brigitta Sipőcz, Andreas Faisst, Jaladh Singhal -**Updated:** December 2025 +**Updated:** 2025-12-16 **Contact:** [the IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or reporting problems.