From ffe63306c4ae3855463173896f36f09e908f5d8b Mon Sep 17 00:00:00 2001
From: KennyG <kennyg@kennyg.com>
Date: Fri, 13 Mar 2026 12:14:09 -0400
Subject: [PATCH] Add GroupAutoScraper plugin: Automatically re-scrape groups
 with Adult Empire URLs to update tags and studio information. Includes main
 script, configuration files, and README documentation.

---
 plugins/GroupAutoScraper/GroupAutoScraper.yml |  14 +
 plugins/GroupAutoScraper/README.md            |  61 +++
 plugins/GroupAutoScraper/autoScraper.py       | 387 ++++++++++++++++++
 plugins/GroupAutoScraper/manifest             |  13 +
 4 files changed, 475 insertions(+)
 create mode 100644 plugins/GroupAutoScraper/GroupAutoScraper.yml
 create mode 100644 plugins/GroupAutoScraper/README.md
 create mode 100644 plugins/GroupAutoScraper/autoScraper.py
 create mode 100644 plugins/GroupAutoScraper/manifest

diff --git a/plugins/GroupAutoScraper/GroupAutoScraper.yml b/plugins/GroupAutoScraper/GroupAutoScraper.yml
new file mode 100644
index 00000000..cc9fdde9
--- /dev/null
+++ b/plugins/GroupAutoScraper/GroupAutoScraper.yml
@@ -0,0 +1,14 @@
+name: GroupAutoScraper
+description: Automatically re-scrape groups that have an Adult Empire URL to pick up tags and studio for the group.
+url: https://github.com/stashapp/CommunityScripts
+version: 1.1.0
+exec:
+  - python
+  - "{pluginDir}/autoScraper.py"
+interface: raw
+hooks:
+  - name: hook_group_auto_scraper
+    description: Re-scrape group on create when it has a URL.
+    triggeredBy:
+      - Group.Create.Post
+
diff --git a/plugins/GroupAutoScraper/README.md b/plugins/GroupAutoScraper/README.md
new file mode 100644
index 00000000..f0962136
--- /dev/null
+++ b/plugins/GroupAutoScraper/README.md
@@ -0,0 +1,61 @@
+# GroupAutoScraper
+
+Automatically re-scrape groups that have a supported URL and merge the scraped data back into the group.
+
+## What it does
+
+- **Trigger**
+  - Listens to the **`Group.Create.Post`** hook only.
+- **URL filter**
+  - If the group has no URLs, the plugin exits quietly (no changes).
+  - If the first URL does **not** contain `adultdvdempire.com/`, the plugin logs:
+    - `AutoGroup only uses AdultDVDEmpire URLS. Exiting.`
+    and exits without making any changes.
+- **Scrape + merge**
+  - When the first URL *does* contain `adultdvdempire.com/`:
+    - Calls `scrapeGroupURL(url)` for that URL.
+    - Merges scraped data into the group and performs a `GroupUpdate`:
+      - Uses scraped values when present, otherwise keeps existing values.
+      - Uses `scraped.studio.stored_id` as `studio_id` only when it is not `null`.
+      - Builds `tag_ids` from:
+        - existing group tag IDs, plus
+        - scraped tag entries where `stored_id` is not `null`,
+        - then de-duplicates.
+      - Only sends `front_image` / `back_image` when present in the scrape result so existing images are not overwritten with `null`.
+- **Summary logging**
+  - On a successful update, the plugin logs a concise summary, e.g.:
+    - `Group 9681 'Women Seeking Women Vol. 101' updated. Added 4 tag(s), set studio.`
+  - If a studio name is scraped but cannot be resolved (no `stored_id`), the message instead reads:
+    - `Group 9681 'Some Title' updated. Added 3 tag(s), could not set studio 'Some Studio', not found in studios.`
+
+Groups without any URL, or whose first URL is not an AdultDVDEmpire URL, are ignored without error.
+
+## Installation
+
+1. Copy this folder to your Stash plugins directory, typically:
+
+   - `plugins/CJ_CommunityScripts/plugins/GroupAutoScraper/`
+
+2. Ensure the following files exist in this directory:
+
+   - `manifest`
+   - `GroupAutoScraper.yml`
+   - `autoScraper.py`
+   - `README.md`
+
+3. In Stash, open **Settings → Plugins** and reload or restart Stash so the plugin is detected.
+
+You should then see **GroupAutoScraper** listed with a hook that triggers on `Group.Create.Post`.
+
+## Configuration
+
+This plugin intentionally uses the **server connection information provided by Stash**:
+
+- GraphQL URL, scheme, host and port come from the plugin input.
+- Authentication uses the Stash session cookie provided in `server_connection`.
+
+As a result:
+
+- **No API keys or URLs need to be hard-coded or edited in the script.**
+- The plugin should work across environments as long as it is installed in the correct plugins directory.
+
diff --git a/plugins/GroupAutoScraper/autoScraper.py b/plugins/GroupAutoScraper/autoScraper.py
new file mode 100644
index 00000000..5dae956d
--- /dev/null
+++ b/plugins/GroupAutoScraper/autoScraper.py
@@ -0,0 +1,387 @@
+#!/usr/bin/env python3
+"""
+autoScraper.py
+
+External raw plugin for Stash that:
+- Is triggered by the Group.Create.Post hook (see GroupAutoScraper.yml).
+- If the group's first URL is an AdultDVDEmpire URL, calls scrapeGroupURL on it.
+- Merges scraped data back into the group via GroupUpdate:
+  * Uses scraped values when present, otherwise keeps existing ones.
+  * For studio/tags, only uses scraped entries where stored_id is not null.
+  * Tag ids from scraped data are merged with existing tag ids (unique).
+
+This script is designed to be run by Stash as a raw external plugin and
+expects its input JSON on stdin (the standard Stash plugin FRAGMENT format).
+
+Requires:
+  - Python 3.7+
+  - requests and stashapp-tools (pip install requests stashapp-tools)
+"""
+
+import sys
+import json
+import time
+from typing import Any, Dict, List, Optional
+
+import requests
+import stashapi.log as log
+from stashapi.stashapp import StashInterface
+
+
+START_TIME = time.time()
+
+
+def exit_plugin(msg: Optional[str] = None, err: Optional[str] = None) -> None:
+    """Log the run time, print the plugin result as JSON and exit (status 1 if err is set)."""
+    elapsed = round(time.time() - START_TIME, 5)
+    log.debug(f"Execution time: {elapsed}s")
+    output = "plugin ended" if msg is None and err is None else msg
+    print(json.dumps({"output": output, "error": err}))
+    sys.exit(1 if err is not None else 0)
+
+
+def load_fragment() -> Dict[str, Any]:
+    """Return the plugin input parsed from the JSON on stdin; exit the plugin if that fails."""
+    try:
+        fragment = json.loads(sys.stdin.read())
+    except Exception as read_error:
+        log.error(f"Failed to read/parse plugin input: {read_error}")
+        exit_plugin(err="invalid plugin input")
+    return fragment
+
+
+def build_graphql_client(server: Dict[str, Any]) -> Dict[str, Any]:
+    """Return the GraphQL endpoint URL, request headers and cookies for this Stash server."""
+    scheme = server.get("Scheme", "http")
+    port = str(server.get("Port", "9999"))
+    host = server.get("Host", "localhost")
+    # 0.0.0.0 is swapped for localhost before the URL is built.
+    if host == "0.0.0.0":
+        host = "localhost"
+
+    session_value = (server.get("SessionCookie") or {}).get("Value")
+    return {
+        "url": f"{scheme}://{host}:{port}/graphql",
+        "headers": {
+            "Accept-Encoding": "gzip, deflate, br",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+            "Connection": "keep-alive",
+            "DNT": "1",
+        },
+        # Forward the session cookie only when Stash supplied a value for it.
+        "cookies": {"session": session_value} if session_value else {},
+    }
+
+
+def graphql_request(
+    client: Dict[str, Any], query: str, variables: Dict[str, Any]
+) -> Dict[str, Any]:
+    """POST a GraphQL operation and return its "data" dict; log and exit the plugin on any failure."""
+    try:
+        resp = requests.post(
+            client["url"],
+            json={"query": query, "variables": variables},
+            headers=client["headers"],
+            cookies=client["cookies"],
+            timeout=20,
+        )
+    except Exception as exc:
+        log.error(f"Error calling GraphQL: {exc}")
+        exit_plugin(err="graphql request failed")
+    if resp.status_code != 200:
+        log.error(f"GraphQL HTTP {resp.status_code}: {resp.content!r}")
+        exit_plugin(err="graphql http error")
+    try:
+        data = resp.json()
+    except ValueError as exc:
+        log.error(f"GraphQL response is not valid JSON: {exc}")
+        exit_plugin(err="graphql invalid response")
+    if data.get("errors"):
+        log.error(f"GraphQL errors: {data['errors']}")
+        exit_plugin(err="graphql errors")
+    return data.get("data") or {}  # "data" can be present but null; callers expect a dict
+
+
+def seconds_from_duration(duration: Optional[str]) -> Optional[int]:
+    """Convert a clock-style duration string into a number of seconds.
+
+    Accepts "H:M:S", "M:S" or plain "S", where every field consists of
+    digits only, e.g. "3:16:00" -> 11760 and "16:00" -> 960.
+
+    Returns None when duration is empty/None, has more than three fields,
+    or has a field that is empty or not made up of digits.
+    """
+    if not duration:
+        return None
+    fields = duration.split(":")
+    if len(fields) > 3 or not all(f.isdigit() for f in fields):
+        return None
+    try:
+        numbers = [int(f) for f in fields]
+    except ValueError:
+        # isdigit() accepts some characters that int() rejects (e.g. "²").
+        return None
+
+    # Fold left to right: each field is worth 60x the one after it.
+    total = 0
+    for number in numbers:
+        total = total * 60 + number
+    return total
+
+
+def coalesce(new_val: Any, old_val: Any) -> Any:
+    """Prefer new_val; fall back to old_val only when new_val is None."""
+    return old_val if new_val is None else new_val
+
+
+def build_group_update_input(
+    group_id: int,
+    existing: Dict[str, Any],
+    scraped: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Build the GroupUpdateInput payload, merging scraped data with existing.
+
+    Args:
+        group_id: ID of the group to update; sent as a string.
+        existing: The group as returned by the findGroup query.
+        scraped: The result of the scrapeGroupURL query.
+
+    Returns:
+        The dict to pass as the `input` variable of the groupUpdate mutation.
+
+    Merge rules:
+      * name, date, director, synopsis: scraped value unless it is None,
+        otherwise the existing value.
+      * aliases: as above; a scraped list is joined with ", " (empty -> None).
+      * duration: scraped "H:M:S"-style string converted to seconds, else the
+        existing value; omitted when neither is available.
+      * urls: scraped list when non-empty, else the existing list.
+      * studio_id: scraped studio stored_id, else the existing studio id.
+      * tag_ids: existing ids followed by scraped stored_ids, de-duplicated.
+      * front_image / back_image: only included when the scrape returned one.
+    """
+    input_obj: Dict[str, Any] = {"id": str(group_id)}
+
+    # Basic scalar fields
+    for field in ("name", "date", "director", "synopsis"):
+        input_obj[field] = coalesce(scraped.get(field), existing.get(field))
+
+    # aliases: scraped may be list or string; convert list -> comma separated string.
+    # A list with no usable entries becomes None (not "") so that it falls back to
+    # the existing aliases instead of blanking them.
+    scraped_aliases = scraped.get("aliases")
+    if isinstance(scraped_aliases, list):
+        scraped_aliases = ", ".join(a for a in scraped_aliases if a) or None
+    input_obj["aliases"] = coalesce(scraped_aliases, existing.get("aliases") or "")
+
+    # duration: convert scraped duration string to seconds; keep existing if scrape missing
+    duration = coalesce(
+        seconds_from_duration(scraped.get("duration")), existing.get("duration")
+    )
+    if duration is not None:
+        input_obj["duration"] = duration
+
+    # URLs: prefer scraped urls when non-empty
+    urls = scraped.get("urls") or existing.get("urls") or []
+    if urls:
+        input_obj["urls"] = urls
+
+    # Studio: use scraped.studio.stored_id when present, else existing studio.id
+    studio_id = coalesce(
+        (scraped.get("studio") or {}).get("stored_id"),
+        (existing.get("studio") or {}).get("id"),
+    )
+    if studio_id is not None:
+        input_obj["studio_id"] = str(studio_id)
+
+    # Tags: union of existing tag ids and scraped tags with stored_id, filtering nulls.
+    # dict.fromkeys() de-duplicates while keeping first-seen order.
+    existing_tag_ids: List[str] = [
+        str(t["id"]) for t in existing.get("tags") or [] if t.get("id") is not None
+    ]
+    scraped_tag_ids: List[str] = [
+        str(t["stored_id"])
+        for t in scraped.get("tags") or []
+        if t.get("stored_id") is not None
+    ]
+    merged_ids = list(dict.fromkeys(existing_tag_ids + scraped_tag_ids))
+    if merged_ids:
+        input_obj["tag_ids"] = merged_ids
+
+    # Images: only send when we actually have scraped data URIs; otherwise omit so we
+    # don't overwrite existing images with null.
+    for image_key in ("front_image", "back_image"):
+        if scraped.get(image_key):
+            input_obj[image_key] = scraped[image_key]
+
+    return input_obj
+
+
+def main() -> None:
+    """Plugin entry point: re-scrape a newly created group and merge the result.
+
+    Reads the hook context from stdin, fetches the group, scrapes its first URL
+    (AdultDVDEmpire only) and applies the merged data via groupUpdate.
+    """
+    fragment = load_fragment()
+    server = fragment.get("server_connection") or {}
+    client = build_graphql_client(server)
+    # Create StashInterface instance for consistency with other plugins,
+    # even though this plugin currently uses direct GraphQL requests.
+    _stash = StashInterface(server)
+
+    args = fragment.get("args") or {}
+
+    # When triggered by a hook, we get hookContext with type/id
+    hook_ctx = args.get("hookContext") or {}
+    hook_type = hook_ctx.get("type")
+    hook_id = hook_ctx.get("id")
+
+    if not hook_type or not hook_id:
+        # Not a hook invocation - nothing to do.
+        exit_plugin("No hook context; skipping.")
+
+    # Only act on group creation. The plugin issues a groupUpdate itself, so also
+    # handling Group.Update.Post could make it re-trigger on its own update.
+    if hook_type != "Group.Create.Post":
+        exit_plugin(f"Ignoring hook type {hook_type}")
+
+    try:
+        group_id = int(hook_id)
+    except (TypeError, ValueError):
+        log.error(f"Invalid group id in hookContext: {hook_id!r}")
+        exit_plugin(err="invalid group id")
+
+    log.debug(f"Running GroupAutoScraper for group id {group_id} ({hook_type})")
+
+    # 1. Fetch existing group
+    find_group_query = """
+    query FindGroup($id: ID!) {
+      findGroup(id: $id) {
+        id
+        name
+        aliases
+        duration
+        date
+        director
+        urls
+        synopsis
+        front_image_path
+        back_image_path
+        studio {
+          id
+        }
+        tags {
+          id
+        }
+        containing_groups {
+          group {
+            id
+          }
+          description
+        }
+      }
+    }
+    """
+
+    data = graphql_request(client, find_group_query, {"id": str(group_id)})
+    group = data.get("findGroup")
+    if not group:
+        log.error(f"No group found with id {group_id}")
+        exit_plugin(err="group not found")
+
+    urls = group.get("urls") or []
+    if not urls:
+        # Nothing to scrape, but not an error
+        log.info(f"Group {group_id} has no URLs; nothing to do.")
+        exit_plugin("group has no URLs; skipped")
+
+    target_url = urls[0]
+
+    # Only handle AdultDVD Empire URLs (the match ignores letter case)
+    if "adultdvdempire.com/" not in target_url.lower():
+        log.info("AutoGroup only uses AdultDVDEmpire URLS. Exiting.")
+        exit_plugin("non-AdultDVDEmpire URL; skipped")
+
+    # 2. Scrape group URL
+    scrape_query = """
+    query ScrapeGroupURL($url: String!) {
+      scrapeGroupURL(url: $url) {
+        name
+        aliases
+        duration
+        date
+        rating
+        director
+        urls
+        synopsis
+        front_image
+        back_image
+        studio {
+          stored_id
+          name
+          urls
+        }
+        tags {
+          stored_id
+          name
+          remote_site_id
+        }
+      }
+    }
+    """
+
+    scrape_data = graphql_request(client, scrape_query, {"url": target_url})
+    scraped = scrape_data.get("scrapeGroupURL")
+    if not scraped:
+        log.error(f"ScrapeGroupURL returned no data for URL {target_url}")
+        exit_plugin(err="scrapeGroupURL returned no data")
+
+    # 3. Build GroupUpdate input
+    # Compute tag additions and studio status for logging.
+    existing_tag_ids = {str(t["id"]) for t in group.get("tags") or [] if t.get("id") is not None}
+    # Sets, so a tag the scraper lists twice is only counted once.
+    scraped_tag_ids = {
+        str(t.get("stored_id"))
+        for t in scraped.get("tags") or []
+        if t.get("stored_id") is not None
+    }
+    tags_added_count = len(scraped_tag_ids - existing_tag_ids)
+
+    scraped_studio = scraped.get("studio") or {}
+    scraped_studio_name = scraped_studio.get("name")
+    scraped_studio_id = scraped_studio.get("stored_id")
+    if scraped_studio_id is not None:
+        studio_msg = "set studio"
+    elif scraped_studio_name:
+        studio_msg = f"could not set studio '{scraped_studio_name}', not found in studios"
+    else:
+        studio_msg = "no studio in scrape"
+
+    update_input = build_group_update_input(group_id, group, scraped)
+
+    # 4. Perform GroupUpdate
+    update_query = """
+    mutation GroupUpdate($input: GroupUpdateInput!) {
+      groupUpdate(input: $input) {
+        id
+        name
+      }
+    }
+    """
+
+    result = graphql_request(client, update_query, {"input": update_input})
+    updated = result.get("groupUpdate")
+    if not updated:
+        log.error("GroupUpdate did not return a group")
+        exit_plugin(err="groupUpdate failed")
+
+    label = f"{updated.get('id')} '{updated.get('name')}'"
+    log.info(f"Group {label} updated. Added {tags_added_count} tag(s), {studio_msg}.")
+    exit_plugin(msg=f"Updated group {label} from {target_url}")
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/plugins/GroupAutoScraper/manifest b/plugins/GroupAutoScraper/manifest
new file mode 100644
index 00000000..bc568842
--- /dev/null
+++ b/plugins/GroupAutoScraper/manifest
@@ -0,0 +1,13 @@
+id: GroupAutoScraper
+name: GroupAutoScraper
+metadata:
+  description: Automatically re-scrape groups that have an Adult Empire URL to pick up tags and studio for the group.
+version: 1.1.0
+date: "2026-03-13 00:00:00"
+requires: []
+source_repository: https://stashapp.github.io/CommunityScripts/stable/index.yml
+files:
+  - GroupAutoScraper.yml
+  - autoScraper.py
+  - README.md
+