CDCgov · dthoward96 · Apr 7, 2025 · Apr 7, 2025 · Apr 7, 2025
diff --git a/README.Rmd b/README.Rmd
@@ -26,7 +26,7 @@ github_pages_url <- description$GITHUB_PAGES
 
 <p style="font-size: 16px;"><em>Public Database Submission Pipeline</em></p>
 
-**Beta Version**: v1.3.2. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome! 
+**Beta Version**: v1.3.3. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome! 
 
 **General Disclaimer**: This repository was created for use by CDC programs to collaborate on public health related projects in support of the [CDC mission](https://www.cdc.gov/about/organization/mission.htm).  GitHub is not hosted by the CDC, but is a third party website used by CDC and its partners to share information and collaborate on software. CDC use of GitHub does not imply an endorsement of any one particular service, product, or enterprise.
 

diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
 
 <!-- ![GitHub last commit](https://img.shields.io/github/last-commit/montilab/cadra) -->
 
-**Beta Version**: 1.3.2. This pipeline is currently in Beta testing, and
+**Beta Version**: 1.3.3. This pipeline is currently in Beta testing, and
 issues could appear during submission. Please use it at your own risk.
 Feedback and suggestions are welcome\!
 

diff --git a/argument_handler.py b/argument_handler.py
@@ -130,6 +130,13 @@ def args_parser():
 		description="Downloads the BioSample Package XML from NCBI and updates SeqSender's metadata schema options for the BioSample database."
 	)
 
+	# test_network_connection command
+	test_network_module = subparser_modules.add_parser(
+		"test_network_connection",
+		formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+		description="Runs a series of test network connections to NCBI and GISAID to troubleshoot submission issues."
+	)
+
 	# version command
 	version_module = subparser_modules.add_parser(
 		"version",

diff --git a/config/genbank/genbank_schema.py b/config/genbank/genbank_schema.py
@@ -49,7 +49,7 @@
 			description="Optional internal field for how the GenBank submission should be named when viewed from the NCBI submission portal, . If not provided, when performing submissions <--submission_name> with the suffix \"-GB\" will be used instead.",
 			title="genbank submission portal title",
 		),
-		"sra-comment": Column(
+		"gb-comment": Column(
 			dtype="object",
 			checks=[
 				Check(lambda s: s.nunique() == 1),

diff --git a/docs/app.json b/docs/app.json
diff --git a/ncbi_handler.py b/ncbi_handler.py
@@ -20,6 +20,7 @@
 
 # Local imports
 import tools
+import setup
 
 # Process NCBI Report file
 def get_ncbi_report(database: str, submission_name: str, submission_dir: str, config_dict: Dict[str, Any], submission_type: str) -> Optional[str]:
@@ -63,8 +64,16 @@ def create_submit_ready_file(ftp, submission_dir: str):
 	return ftp
 
 def ncbi_login(config_dict: Dict[str, Any]):
-	ftp = ftplib.FTP(NCBI_FTP_HOST)
-	ftp.login(user=config_dict["Username"], passwd=config_dict["Password"])
+	"""Open an FTP session to NCBI_FTP_HOST with the config "Username"/"Password" and return it; exit with status 1 on any failure."""
+	try:
+		ftp = ftplib.FTP(NCBI_FTP_HOST)
+		ftp.login(user=config_dict["Username"], passwd=config_dict["Password"])
+	except ftplib.error_perm as err:
+		sys.exit(f"Error: login error. Possible incorrect credentials for NCBI FTP site in config file. \nException: {err}")  # stop here instead of returning an unauthenticated session
+	except Exception as err:
+		print("Error unable to connect to FTP site. Running network test...", file=sys.stderr)
+		setup.test_internet_connection(databases=["NCBI"])
+		sys.exit(f"Exception: {err}")  # a string argument is printed to stderr and the exit status is 1
 	return ftp
 
 def ftp_upload_file(ftp, upload_file: str, upload_name: Optional[str] = None):

diff --git a/seqsender.py b/seqsender.py
@@ -177,6 +177,8 @@ def main():
 	elif command == "update_biosample":
 		print("Updating BioSample requirements.", file=sys.stdout)
 		setup.download_biosample_xml_list()
+	elif command == "test_network_connection":
+		setup.test_internet_connection(databases=["GENERAL","NCBI","GISAID"])
 	else:
 		# If no command display help
 		parser.print_help()

diff --git a/settings.py b/settings.py
@@ -12,7 +12,7 @@
 PROG_DIR: str = os.path.dirname(os.path.abspath(__file__))
 
 # SeqSender version
-VERSION: str = "1.3.2 (Beta)"
+VERSION: str = "1.3.3 (Beta)"
 
 # Organism options with unique submission options
 ORGANISM_CHOICES: List[str] = ["FLU", "COV", "POX", "ARBO", "RSV", "OTHER"]

diff --git a/setup.py b/setup.py
@@ -4,13 +4,15 @@
 import sys
 from zipfile import ZipFile
 import ftplib
+import io
 import os
 import json
 import subprocess
+import socket
 import pandas as pd
 import shutil
 import platform
-from urllib.request import urlopen
+import urllib.request
 import gzip
 import stat
 import requests
@@ -23,6 +25,7 @@
 # Local imports
 sys.path.insert(0, str(pathlib.Path(__file__).parent))
 import tools
+from settings import NCBI_FTP_HOST
 
 # Get program directory
 PROG_DIR: str = os.path.dirname(os.path.abspath(__file__))
@@ -95,6 +98,14 @@
 			title=\"biosample submission portal description\",
 		)"""
 
+# Connectivity probes keyed by a unique display label; "database" selects which probes run (GENERAL probes always run).
+TEST_CONNECTIONS = {"HTTP": {"website":"http://www.google.com", "database": "GENERAL", "error_msg": "Possible internet connectivity issues; unable to connect to 'http://www.google.com'."},
+"HTTPS": {"website": "https://www.google.com", "database": "GENERAL", "error_msg": "Possible internet connectivity issues; unable to connect to 'https://www.google.com'."},
+"NCBI": {"website": "https://www.ncbi.nlm.nih.gov", "database": "NCBI", "error_msg": "Unable to connect to 'https://www.ncbi.nlm.nih.gov'; ensure NCBI services are running and you are able to connect to them before proceeding."},
+"NCBI API": {"website": "https://submit.ncbi.nlm.nih.gov", "database": "NCBI", "error_msg": "Unable to connect to 'https://submit.ncbi.nlm.nih.gov'; ensure NCBI services are running and you are able to connect to them before proceeding."},
+"GISAID EpiCoV": {"website": "https://www.epicov.org/epi3/start", "database": "GISAID", "error_msg": "Unable to connect to 'https://www.epicov.org/epi3'; ensure GISAID services are running and you are able to connect to them before proceeding."},
+"GISAID": {"website": "https://gisaid.org/", "database": "GISAID", "error_msg": "Unable to connect to 'https://gisaid.org/'; ensure GISAID services are running and you are able to connect to them before proceeding."}}
+
 # Create example data for testing
 def create_test_data(organism: str, database: List[str], submission_dir: str) -> None:
 	if organism not in ["FLU", "COV"]:
@@ -154,7 +165,7 @@ def download_table2asn(table2asn_dir: str) -> None:
 	# Determine which platform to download table2asn
 	if platform.system() == "Windows":
 		zip_url = "https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/table2asn/win64.table2asn.zip"
-		with urlopen(zip_url) as zip_response:
+		with urllib.request.urlopen(zip_url) as zip_response:
 			with ZipFile(BytesIO(zip_response.read())) as zip_file:
 				zip_file.extractall(table2asn_dir)
 		return
@@ -168,7 +179,7 @@ def download_table2asn(table2asn_dir: str) -> None:
 	# Extract table2asn to tmp folder
 	try:
 		with open(table2asn_dir, "wb") as file:
-			with urlopen(zip_url) as zip_response:
+			with urllib.request.urlopen(zip_url) as zip_response:
 				file.write(gzip.decompress(zip_response.read()))
 		st = os.stat(table2asn_dir)
 		os.chmod(table2asn_dir, st.st_mode | stat.S_IXOTH | stat.S_IRWXU)
@@ -309,3 +320,42 @@ def biosample_package_to_pandera_schema(xml_file: str, name: str) -> None:
 		indentation = indentation[:-1]
 		file.write(indentation + ")")
 	os.remove(xml_file)
+
+def test_internet_connection(databases: List[str]) -> None:
+	"""Probe every GENERAL endpoint plus those of the requested databases, then NCBI FTP DNS and port 21 if "NCBI" is listed; print failures to stderr."""
+	error_list = []
+	print("Checking network settings...", file=sys.stdout)
+	for test, info in TEST_CONNECTIONS.items():
+		if info["database"] == "GENERAL" or info["database"] in databases:
+			print(f"Checking {test} connection...", file=sys.stdout)
+			try:
+				response = requests.get(info['website'], timeout=10).status_code  # bounded wait, same limit as the FTP probe below
+			except Exception as e:
+				error_list.append(f"{test} connectivity test failed for '{info['website']}'. Check possible firewall issues. \nException:{e}")
+				continue  # no status code was obtained, so there is nothing further to check for this endpoint
+			if response in (200, 204, 301, 302):
+				print(f"{test} '{info['website']}' connectivity test ok.", file=sys.stdout)
+			else:
+				error_list.append(f"{info['error_msg']} Error code received:'{response}'")
+	if "NCBI" in databases:
+		print("Checking DNS resolution for FTP site...", file=sys.stdout)
+		try:
+			ip_address = socket.gethostbyname(NCBI_FTP_HOST)
+		except Exception as e:
+			ip_address = None  # marks "lookup raised": skip the port probe and do not report a second DNS error
+			error_list.append(f"Unable to reach '{NCBI_FTP_HOST}'; possible DNS error. \nException:{e}")
+		if ip_address:
+			print(f"DNS resolution test ok. Able to reach ('{NCBI_FTP_HOST} -> {ip_address})", file=sys.stdout)
+			print("Checking port status...", file=sys.stdout)
+			try:
+				with ftplib.FTP() as ftp:  # the context manager sends QUIT and closes the socket on exit
+					ftp.connect(NCBI_FTP_HOST, 21, timeout=10)
+					print(f"{NCBI_FTP_HOST} open on port 21.", file=sys.stdout)
+			except Exception as e:
+				error_list.append(f"Port 21 not open for {NCBI_FTP_HOST}. Check possible firewall/server issues. \nException:{e}")
+		elif ip_address is not None:
+			error_list.append(f"Unable to resolve address for '{NCBI_FTP_HOST}'; check DNS server settings for possible issues.")
+	for error_string in error_list:
+		print(error_string, file=sys.stderr)
+	if not error_list:
+		print("No network connection issues detected.", file=sys.stdout)
diff --git a/shiny/app.py b/shiny/app.py
@@ -20,7 +20,7 @@
 header = (
     ui.card_header(
         ui.HTML(
-            """<p><strong>Beta Version</strong>: 1.3.2. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome!</p>"""
+            """<p><strong>Beta Version</strong>: 1.3.3. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome!</p>"""
         )
     ),
 )
@@ -583,6 +583,8 @@
         shiny_tools.command_accordion_panel("test_data", description=" command is used generate test data for seqsender, to be used for testing the prep and submit commands."),
         # Update biosample command
         shiny_tools.command_accordion_panel("update_biosample", description=" command is used to update biosample schema options based on available BioSample Packages."),
+        # test_network_connection command
+        shiny_tools.command_accordion_panel("test_network_connection", description=" command is used to run a series of test network connections to NCBI and GISAID to troubleshoot submission issues."),
         # version command
         shiny_tools.command_accordion_panel("version", description=" command prints the current seqsender version."),
     ),
@@ -609,6 +611,8 @@
         ui.nav_panel("Output Files", output_body),
         ui.nav_panel("Commands", commands_body),
         # ui.nav_panel("FAQ", faq_body),
+        ui.nav_spacer(),
+        ui.nav_control(ui.a("GitHub", href="https://github.com/CDCgov/seqsender/", target="_blank")),
         selected="SeqSender",
         header=header,
         footer=footer,
@@ -668,6 +672,19 @@ def read_biosample_file():
     @reactive.file_reader(dir / "templates/")
     def read_genbank_file():
         df = pd.read_csv(dir / "templates/config.genbank.genbank.schema_template.csv", index_col = "column_name")
+        if input.GenBank_schemas() == "FLU":  # FLU reads the flu-specific src template; every other choice reads the generic one
+            src_df = pd.read_csv(dir / "templates/config.genbank.genbank.flu.src.schema_template.csv", index_col = "column_name")
+        else:
+            src_df = pd.read_csv(dir / "templates/config.genbank.genbank.src.schema_template.csv", index_col = "column_name")
+        cmt_df = pd.DataFrame({  # cmt-* rows are declared inline instead of being read from a template CSV
+            "column_name": ["cmt-StructuredCommentPrefix", "cmt-StructuredCommentSuffix", "cmt-Assembly Method"],
+            "required_column": ["Required", "Required", "Required"],
+            "description": ["Structured comment keyword. ONLY REQUIRED IF INCLUDING COMMENT FILE. For FLU use 'FluData', HIV use 'HIV-DataBaseData', and for COV and other organisms use 'Assembly-Data'.",
+            "Structured comment keyword. ONLY REQUIRED IF INCLUDING COMMENT FILE. For FLU use 'FluData', HIV use 'HIV-DataBaseData', and for COV and other organisms use 'Assembly-Data'.",
+            "ONLY REQUIRED IF INCLUDING COMMENT FILE. Process used to assemble genome."]
+        })
+        cmt_df = cmt_df.set_index("column_name")  # same index as the CSV-backed frames (index_col = "column_name")
+        df = pd.concat([df, src_df, cmt_df])  # base template rows first, then src rows, then cmt rows
         df = df.fillna("")
         df = df.transpose()
         return df

diff --git a/shiny/templates/config.genbank.genbank.schema_template.csv b/shiny/templates/config.genbank.genbank.schema_template.csv
@@ -3,4 +3,4 @@ sequence_name,Required,Sequence identifier used in fasta file. This is used to c
 gb-sample_name,Required,Identifier name used for GenBank. Max length is 50 characters.
 gb-fasta_definition_line_modifiers,Optional,"NCBI fasta definition line modifiers can be added here. As many modifiers as you like can be added, but each must bounded by a set of brackets. Some of the available keywords are listed at ""https://www.ncbi.nlm.nih.gov/genbank/mods_fastadefline/""."
 gb-title,Optional,"Optional internal field for how the GenBank submission should be named when viewed from the NCBI submission portal, . If not provided, when performing submissions <--submission_name> with the suffix ""-GB"" will be used instead."
-sra-comment,Optional,Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.
+gb-comment,Optional,Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.