Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/lint_component.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ jobs:
- name: Convert component meta.yml to markdown
run: |
poetry run nf-neuro-convert-${{ inputs.type }} \
--enhance-keywords \
${{ inputs.type }}s/nf-neuro/${{ inputs.component }} \
${{ github.sha }} \
$(echo "${{ inputs.component }}" | sed 's/\//_/g').md
12 changes: 12 additions & 0 deletions docs/astro/convert_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
li,
link,
)
from docs.astro.keywords import DEFAULT_MODEL, extract_keywords


def _create_parser():
Expand All @@ -23,6 +24,14 @@ def _create_parser():
p.add_argument('module_path', help='Path to the module')
p.add_argument('current_commit_sha', help='Current commit sha')
p.add_argument('output', help='Name of the output markdown file')
p.add_argument(
'--enhance-keywords', action='store_true', default=False,
help='Use an LLM via Ollama to extract additional SEO keywords'
)
p.add_argument(
'--llm-model', default=DEFAULT_MODEL, metavar='MODEL',
help=f'Ollama model used for keyword extraction (default: {DEFAULT_MODEL})'
)

return p

Expand Down Expand Up @@ -51,6 +60,9 @@ def main():
data["currentcommit"] = args.current_commit_sha
data["currentdate"] = datetime.datetime.now().strftime("%Y-%m-%d")

if args.enhance_keywords:
data["keywords"] = extract_keywords(data, model=args.llm_model)

template = env.get_template('module.md.jinja2')
output_path = Path(args.output)
output_path.write_text(template.render(**data))
Expand Down
12 changes: 12 additions & 0 deletions docs/astro/convert_subworkflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
link,
sanitize_outside_codeblocks,
)
from docs.astro.keywords import DEFAULT_MODEL, extract_keywords


DOC_URL_BASE = "https://nf-neuro.github.io"
Expand Down Expand Up @@ -71,6 +72,14 @@ def _create_parser():
p.add_argument('subworkflow_path', help='Name of the subworkflow')
p.add_argument('current_commit_sha', help='Current commit sha')
p.add_argument('output', help='Name of the output markdown file')
p.add_argument(
'--enhance-keywords', action='store_true', default=False,
help='Use an LLM via Ollama to extract additional SEO keywords'
)
p.add_argument(
'--llm-model', default=DEFAULT_MODEL, metavar='MODEL',
help=f'Ollama model used for keyword extraction (default: {DEFAULT_MODEL})'
)

return p

Expand Down Expand Up @@ -99,6 +108,9 @@ def main():
data["currentcommit"] = args.current_commit_sha
data["currentdate"] = datetime.datetime.now().strftime("%Y-%m-%d")

if args.enhance_keywords:
data["keywords"] = extract_keywords(data, model=args.llm_model)

template = env.get_template('subworkflow.md.jinja2')
output_path = Path(args.output)
output_path.write_text(template.render(**data))
Expand Down
122 changes: 122 additions & 0 deletions docs/astro/keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""
LLM-powered keyword extraction for nf-neuro documentation.

Uses an LLM served by Ollama (``qwen3`` by default) to generate additional
relevant keywords from meta.yml data, improving discoverability on the website
and in search engines.
"""

import json
import logging

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Ollama model name used when a caller does not pass ``model`` explicitly.
DEFAULT_MODEL = "qwen3"


def _build_prompt(data):
"""Build a keyword-extraction prompt from meta.yml data."""
name = data.get("name", "")
description = data.get("description", "")
existing_keywords = data.get("keywords", [])
tools = data.get("tools", [])

tool_names = []
tool_descriptions = []
for tool in tools:
for tool_name, tool_meta in tool.items():
tool_names.append(tool_name)
if isinstance(tool_meta, dict) and "description" in tool_meta:
tool_descriptions.append(
f"{tool_name}: {tool_meta['description'].strip()}"
)

prompt = (
"You are a scientific SEO expert specialising in neuroimaging and "
"bioinformatics software.\n\n"
"Given the following information about a Nextflow module for neuroimaging "
"data processing, extract a list of relevant keywords for SEO and search "
"discoverability. Focus on technical terms, neuroimaging concepts, "
"computational methods, data types, and scientific domains relevant to "
"the module.\n\n"
f"Module name: {name}\n"
f"Description: {description}\n"
f"Existing keywords: {', '.join(existing_keywords)}\n"
f"Tools used: {', '.join(tool_names)}\n"
f"Tool descriptions: {'; '.join(tool_descriptions)}\n\n"
"Return ONLY a JSON array of 5 to 15 additional keyword strings that are "
"NOT already present in the existing keywords list. Keywords should be "
"specific, relevant, and useful for search engines. Do not include "
"explanations or any other text outside the JSON array.\n\n"
'Example format: ["keyword1", "keyword2", "keyword3"]'
)
return prompt


def _parse_keyword_array(content):
    """Return the first JSON array found in *content*, or None if there is none.

    Every ``[`` is tried as a potential array start with the real JSON
    decoder, so brackets inside string values are handled correctly and a
    stray ``[`` in surrounding prose does not abort the search. An array
    containing at least one string is preferred; otherwise the first array
    that decoded at all is returned.
    """
    decoder = json.JSONDecoder()
    first_decoded = None
    start = content.find("[")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(content, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, list):
            if any(isinstance(item, str) for item in candidate):
                return candidate
            if first_decoded is None:
                first_decoded = candidate
        start = content.find("[", start + 1)
    return first_decoded


def _merge_keywords(existing_keywords, new_keywords):
    """Append the unseen string entries of *new_keywords* to the existing ones.

    Comparison is case-insensitive and also removes duplicates within
    *new_keywords* itself; original order is preserved.
    """
    seen = {k.casefold() for k in existing_keywords if isinstance(k, str)}
    merged = list(existing_keywords)
    for keyword in new_keywords:
        if not isinstance(keyword, str):
            continue
        keyword = keyword.strip()
        folded = keyword.casefold()
        if keyword and folded not in seen:
            seen.add(folded)
            merged.append(keyword)
    return merged


def extract_keywords(data, model=DEFAULT_MODEL):
    """Extract additional keywords from meta.yml data using an LLM via Ollama.

    Calls the specified Ollama model to generate SEO-relevant keywords that
    complement the existing ones defined in the meta.yml file. Falls back to
    the original keyword list gracefully when Ollama is unavailable or the
    model call fails.

    Parameters
    ----------
    data : dict
        Parsed meta.yml data.
    model : str, optional
        Ollama model name to use for keyword extraction (default: ``qwen3``).

    Returns
    -------
    list[str]
        Augmented list of keywords combining the original entries with any
        additional ones produced by the LLM, deduplicated (case-insensitive)
        and in order. On any failure the original keyword list is returned
        unchanged.
    """
    existing_keywords = data.get("keywords", []) or []

    # Keyword enhancement is strictly best-effort: a missing ``ollama``
    # package, an unreachable server or a malformed reply must never break
    # documentation generation, hence the deliberately broad handler below.
    try:
        # Imported lazily so the module stays importable without Ollama.
        import ollama

        prompt = _build_prompt(data)
        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            options={"temperature": 0.2},
        )
        content = (response.message.content or "").strip()

        new_keywords = _parse_keyword_array(content)
        if new_keywords is not None:
            return _merge_keywords(existing_keywords, new_keywords)

        log.warning(
            "LLM response did not contain a parseable JSON keyword array; "
            "using original keywords."
        )
    except Exception as exc:
        log.warning("LLM keyword extraction failed (%s); using original keywords.", exc)

    return existing_keywords
2 changes: 1 addition & 1 deletion modules/nf-neuro/bundle/bundleparc/meta.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: bundle_bundleparc
description: process bundleparc
description: Extract label maps of bundles using the bundleparc machine learning model.
keywords:
- Tractography
- Bundleparc
Expand Down
Loading
Loading