-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathllm_service.py
More file actions
1542 lines (1390 loc) · 94.1 KB
/
llm_service.py
File metadata and controls
1542 lines (1390 loc) · 94.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
LLM Service for intelligent query understanding
- Converts natural language queries to structured search criteria
- Provides semantic understanding of user intent
- Supports multiple LLM backends
"""
import json
import re
import asyncio
import os
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
import httpx
# Optional Gemini import.
# google-generativeai is a soft dependency: after this block GEMINI_AVAILABLE
# says whether `genai.GenerativeModel` can be used, and `genai` is either the
# imported module or None.
try:
    import google.generativeai as genai

    # Best-effort version report; purely diagnostic and never fatal.
    version_info = None
    try:
        if hasattr(genai, '__version__'):
            version_info = genai.__version__
        elif hasattr(genai, 'version'):
            version_info = getattr(genai.version, '__version__', 'unknown')
        else:
            try:
                # importlib.metadata is stdlib (Python >= 3.8) and replaces the
                # deprecated pkg_resources lookup.
                from importlib.metadata import version as _dist_version
                version_info = _dist_version('google-generativeai')
            except Exception:
                # Narrower than a bare except: KeyboardInterrupt/SystemExit still propagate.
                version_info = 'unknown'
        if version_info:
            print(f"🧠 google-generativeai version: {version_info}")
    except Exception as e:
        print(f"🧠 Could not determine google-generativeai version: {e}")

    # Check if GenerativeModel is available (version check)
    if hasattr(genai, 'GenerativeModel'):
        GEMINI_AVAILABLE = True
        print(f"✅ google-generativeai has GenerativeModel attribute")
    else:
        print(f"❌ google-generativeai installed but GenerativeModel not available.")
        print(f" This usually means the package version is too old (< 0.2.0)")
        available_attrs = [attr for attr in dir(genai) if not attr.startswith('_')]
        print(f" Available attributes (first 15): {available_attrs[:15]}")
        print(f" Solution:")
        print(f" 1. Upgrade pip: pip install --upgrade pip")
        print(f" 2. Upgrade package: pip install --upgrade --force-reinstall google-generativeai")
        print(f" 3. If that fails, check Python version (need >= 3.8): python --version")
        print(f" 4. Try: pip install --upgrade pip setuptools wheel")
        # Report the version actually detected instead of a hard-coded one.
        print(f" Detected version {version_info or 'unknown'} is too old - need >= 0.3.0 for GenerativeModel")
        GEMINI_AVAILABLE = False
        genai = None
except ImportError as e:
    print(f"⚠️ google-generativeai not installed: {e}")
    print(f" Install with: pip install google-generativeai")
    GEMINI_AVAILABLE = False
    genai = None
# Shared prompt fragment interpolated into the "IMPORTANT:" section of both the
# OpenAI and the Gemini system prompts in this file. It asks the LLM itself to
# resolve species synonyms and to normalize time-of-day wording to canonical
# filter values (reduces the need for code-side injections).
LLM_SPECIES_AND_NORMALIZATION_RULES = """
SPECIES SYNONYMS – use the EXACT value from available_filters:
- "rabbit", "rabbits", "cottontail", "cottontails", "white cottontail", "white cottontails" → use the species in available_filters that contains "cottontail" (e.g. eastern_cottontail). Pick that exact filter value; do not use "rabbit" or "white cottontail" as the species value.
- "crow"/"crows" → use "american_crow" if it appears in available_filters.
- NEVER put action words in species. Words like "eating", "feeding", "walking", "standing", "running" are ACTIONS only → put them in "action", never in "species". Example: "rabbit eating" → species: [eastern_cottontail], action: [foraging]; NOT species: [eating].
TIME NORMALIZATION – use canonical values that match available_filters:
- "night time", "nighttime", "at night", "during night", "Nighttime" → time: ["night"] (use "night" if present in available times, otherwise the value that means night).
- "day time", "daytime", "during day" → time: ["day"]
- "dawn", "sunrise" → time: ["dawn"]
- "dusk", "sunset" → time: ["dusk"]
- "evening", "twilight", "late afternoon" → time: ["evening"] (use "evening" if present in available times).
"""
@dataclass
class QueryUnderstanding:
    """Structured understanding of a natural language query.

    Attributes:
        intent: What the user wants to find.
        entities: Key entities mentioned.
        filters: Structured filters.
        confidence: Confidence in the understanding.
        reasoning: Why this understanding was chosen.
        description_query: Phrase in the style of image descriptions; used to
            match/rank by the description field. Defaults to None.
    """

    intent: str
    entities: List[str]
    filters: Dict[str, Any]
    confidence: float
    reasoning: str
    description_query: Optional[str] = None
class LLMService:
"""Service for LLM-powered query understanding - supports Azure OpenAI, OpenAI, and Gemini"""
def __init__(
    self,
    api_key: Optional[str] = None,
    model: str = "gpt-5-mini-2",
    provider: str = "auto",
    azure_endpoint: Optional[str] = None,
    azure_api_key: Optional[str] = None,
    azure_deployment: Optional[str] = None,
    azure_api_version: Optional[str] = None,
):
    """Configure the OpenAI/Azure and Gemini backends and choose the active provider.

    Args:
        api_key: OpenAI API key. Also used as the Azure key when no Azure key
            is given by argument or AZURE_OPENAI_API_KEY.
        model: OpenAI model name; also the fallback Azure deployment name.
        provider: "openai", "gemini", or "auto". With "auto", Gemini is chosen
            when available, otherwise OpenAI, otherwise the provider is None.
        azure_endpoint: Azure OpenAI endpoint (falls back to AZURE_OPENAI_ENDPOINT).
        azure_api_key: Azure OpenAI key (falls back to AZURE_OPENAI_API_KEY, then ``api_key``).
        azure_deployment: Azure deployment (falls back to AZURE_OPENAI_DEPLOYMENT, then ``model``).
        azure_api_version: Azure API version (falls back to AZURE_OPENAI_API_VERSION,
            then "2024-08-01-preview").

    The Gemini key and model are read from GOOGLE_API_KEY and GEMINI_MODEL.
    """
    self.api_key = api_key
    self.model = model
    self.provider = provider  # "openai", "gemini", or "auto"

    # Initialize OpenAI/Azure settings
    self.openai_api_key = api_key
    self.openai_model = model
    self.openai_available = False
    self.is_azure = False
    self.is_project_key = False

    # Azure OpenAI (from args or env)
    self.azure_endpoint = (azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT", "")).strip().rstrip("/")
    self.azure_api_key = (azure_api_key or os.getenv("AZURE_OPENAI_API_KEY", "")).strip()
    if not self.azure_api_key and api_key:
        self.azure_api_key = api_key  # fallback to OPENAI_API_KEY for Azure key
    self.azure_deployment = (azure_deployment or os.getenv("AZURE_OPENAI_DEPLOYMENT") or model or "gpt-5-mini-2").strip()
    self.azure_api_version = (azure_api_version or os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview")).strip()

    # Prefer Azure when endpoint and key are set
    if self.azure_endpoint and self.azure_api_key:
        self.is_azure = True
        self.base_url = f"{self.azure_endpoint}/openai/deployments/{self.azure_deployment}/chat/completions?api-version={self.azure_api_version}"
        self.openai_available = True
        print(f"🧠 Azure OpenAI: endpoint={self.azure_endpoint[:50]}..., deployment={self.azure_deployment}")
    elif api_key and api_key.startswith("sk-proj-"):
        self.base_url = "https://api.openai.com/v1/chat/completions"
        self.is_project_key = True
        self.openai_available = True
        print(f"🧠 OpenAI: Project API key detected (length: {len(api_key)})")
    elif api_key and api_key.startswith("sk-"):
        self.base_url = "https://api.openai.com/v1/chat/completions"
        self.openai_available = True
        print(f"🧠 OpenAI: Personal API key detected (length: {len(api_key)})")
    else:
        self.base_url = "https://api.openai.com/v1/chat/completions"
        print(f"🧠 OpenAI: No API key provided (set OPENAI_API_KEY or AZURE_OPENAI_ENDPOINT+AZURE_OPENAI_API_KEY)")

    # Initialize Gemini settings
    self.gemini_api_key = os.getenv("GOOGLE_API_KEY")
    self.gemini_model = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
    self.gemini_available = False
    self.gemini_model_obj = None

    # Debug: Print environment variable status
    print(f"🧠 Gemini initialization check:")
    print(f" GOOGLE_API_KEY from env: {'SET' if self.gemini_api_key else 'NOT SET'}")
    if self.gemini_api_key:
        # Only the length is reported; the key's leading characters are
        # deliberately NOT printed so the secret never reaches the logs.
        print(f" GOOGLE_API_KEY length: {len(self.gemini_api_key)}")

    # Check Gemini availability
    if self.gemini_api_key:
        # Try SDK first if available
        if GEMINI_AVAILABLE and hasattr(genai, 'GenerativeModel'):
            try:
                print(f"🧠 Gemini: Attempting to configure SDK with API key (length: {len(self.gemini_api_key)})")
                genai.configure(api_key=self.gemini_api_key)
                print(f"🧠 Gemini: Creating model object: {self.gemini_model}")
                self.gemini_model_obj = genai.GenerativeModel(self.gemini_model)
                self.gemini_available = True
                print(f"✅ Gemini: SDK initialized successfully, using {self.gemini_model}")
            except Exception as e:
                print(f"⚠️ Gemini SDK initialization failed: {e}")
                print(f" Will use REST API instead (works with Python 3.8)")
                self.gemini_available = True  # Still available via REST API
                self.gemini_model_obj = None
        else:
            # SDK not available, but we can use REST API
            print(f"🧠 Gemini: SDK not available (GenerativeModel missing)")
            print(f" Will use REST API instead (works with Python 3.8)")
            self.gemini_available = True  # Available via REST API
            self.gemini_model_obj = None
    elif GEMINI_AVAILABLE:
        print(f"🧠 Gemini: SDK available but no GOOGLE_API_KEY set")
    else:
        print(f"🧠 Gemini: SDK not installed, but REST API can be used if GOOGLE_API_KEY is set")

    # Determine which provider to use
    if provider == "auto":
        # Prefer Gemini if both are available (since OpenAI often has quota issues)
        if self.gemini_available:
            self.provider = "gemini"
            if self.openai_available:
                print(f"🧠 Auto-selected: Gemini (both available, preferring Gemini)")
            else:
                # Do not claim "both available" when OpenAI is not configured.
                print(f"🧠 Auto-selected: Gemini (OpenAI not available)")
        elif self.openai_available:
            self.provider = "openai"
            print(f"🧠 Auto-selected: OpenAI (Gemini not available)")
        else:
            self.provider = None
    else:
        self.provider = provider

    if self.provider:
        print(f"🧠 Using provider: {self.provider}")
        print(f" OpenAI available: {self.openai_available}")
        print(f" Gemini available: {self.gemini_available}")
    else:
        print(f"⚠️ No LLM provider available - will use metadata-based fallback")

    def _string_list(description: str) -> Dict[str, Any]:
        # One "array of strings" schema entry; key order is kept because the
        # schema is serialized with json.dumps into the LLM prompts.
        return {"type": "array", "items": {"type": "string"}, "description": description}

    # Define the search schema for the LLM
    self.search_schema = {
        "type": "object",
        "properties": {
            "intent": {
                "type": "string",
                "description": "What the user is looking for",
            },
            "entities": _string_list("Key entities mentioned in the query"),
            "filters": {
                "type": "object",
                "properties": {
                    "category": _string_list("Categories to search in (e.g., wildlife, plants, pests)"),
                    "species": _string_list("Specific species mentioned"),
                    "time": _string_list("Time of day (dawn, day, dusk, night)"),
                    "season": _string_list("Season (spring, summer, fall, winter)"),
                    "action": _string_list("Actions or behaviors (walking, eating, sleeping)"),
                    "scene": _string_list("Environment or scene (forest, field, water)"),
                    "weather": _string_list("Weather conditions"),
                    "plant_state": _string_list("Plant/fruit state (ripe, unripe, blooming, fruiting, mixed)"),
                },
            },
            "description_query": {
                "type": "string",
                "description": "A short phrase (5-20 words) describing the desired image in the style of image descriptions, e.g. 'close-up of ripe raspberries on a bush with green leaves'. Used to match and rank results by the description field.",
            },
            "confidence": {
                "type": "number",
                "description": "Confidence in the understanding (0-1)",
            },
            "reasoning": {
                "type": "string",
                "description": "Brief explanation of the understanding",
            },
        },
        "required": ["intent", "entities", "filters", "confidence", "reasoning"],
    }
async def understand_query(self, query: str, available_filters: Dict[str, List[str]] = None) -> QueryUnderstanding:
    """Turn a natural language query into a QueryUnderstanding.

    Attempts run in this order: OpenAI (when it is the selected provider),
    then Gemini, then the metadata-based fallback. A failing LLM attempt is
    logged and the next option is tried; LLM errors are not raised to the caller.
    """
    print(f"🧠 understand_query called:")
    print(f" Query: '{query}'")
    print(f" Provider: {self.provider}")
    print(f" OpenAI available: {self.openai_available}")
    print(f" Gemini available: {self.gemini_available}")
    print(f" Gemini model object: {self.gemini_model_obj is not None}")

    # OpenAI goes first only when explicitly selected, or when the provider is
    # still "auto" and Gemini (the preferred backend) is unavailable.
    auto_without_gemini = self.provider == "auto" and self.openai_available and not self.gemini_available
    openai_should_try = (self.provider == "openai") or auto_without_gemini
    if openai_should_try:
        try:
            print(f"🧠 Attempting OpenAI understanding...")
            openai_result = await self._openai_understanding(query, available_filters)
            print(f"✅ OpenAI understanding successful")
            return openai_result
        except Exception as openai_exc:
            lowered = str(openai_exc).lower()
            # Quota/billing failures get their own message; either way Gemini is next.
            looks_like_quota = any(marker in lowered for marker in ("quota", "429", "insufficient"))
            if looks_like_quota:
                print(f"⚠️ OpenAI quota exceeded - trying Gemini...")
            else:
                print(f"❌ OpenAI error: {openai_exc} - trying Gemini...")

    # Gemini is attempted when it is the provider, when "auto" finds it
    # available, or when OpenAI was attempted above and did not return.
    gemini_should_try = (
        (self.provider == "gemini")
        or (self.provider == "auto" and self.gemini_available)
        or (openai_should_try and self.gemini_available)
    )
    if not (gemini_should_try and self.gemini_available):
        print(f"⚠️ Gemini not being tried:")
        print(f" - gemini_should_try: {gemini_should_try}")
        print(f" - gemini_available: {self.gemini_available}")
        print(f" - provider: {self.provider}")
        print(f" - GEMINI_AVAILABLE: {GEMINI_AVAILABLE}")
        print(f" - gemini_api_key set: {bool(self.gemini_api_key)}")
        print(f" - gemini_model_obj initialized: {self.gemini_model_obj is not None}")
    else:
        try:
            print(f"🧠 Attempting Gemini understanding...")
            print(f" Provider: {self.provider}")
            print(f" Gemini model: {self.gemini_model}")
            print(f" Gemini API key present: {bool(self.gemini_api_key)}")
            print(f" Gemini model object: {self.gemini_model_obj is not None}")
            # With no model object, _gemini_understanding switches to the REST API itself.
            gemini_result = await self._gemini_understanding(query, available_filters)
            print(f"✅ Gemini understanding successful (confidence: {gemini_result.confidence})")
            return gemini_result
        except Exception as gemini_exc:
            print(f"❌ Gemini error: {gemini_exc} - falling back to metadata-based...")
            import traceback
            traceback.print_exc()

    # Last resort: metadata-based understanding (LLM errored or returned invalid response)
    print("⚠️ Using metadata-based query understanding (LLM unavailable or returned invalid response)")
    filter_names = list(available_filters.keys()) if available_filters else 'none'
    print(f" Available filters: {filter_names}")
    if available_filters and "species" in available_filters:
        print(f" Available species: {available_filters['species'][:10]}...")  # Show first 10
    return self._metadata_based_understanding(query, available_filters)
async def _openai_understanding(self, query: str, available_filters: Optional[Dict[str, List[str]]] = None) -> QueryUnderstanding:
    """Use the OpenAI (or Azure OpenAI) chat-completions API for query understanding.

    Args:
        query: The user's natural language query.
        available_filters: Filter name -> allowed values, embedded in the prompt;
            when empty or None the prompt says "All filters available".

    Returns:
        The QueryUnderstanding built from the model's JSON reply.

    Raises:
        ValueError: On a non-200 response, or when the reply cannot be parsed
            into a non-empty QueryUnderstanding.
        Exception: Any transport error from the HTTP request is logged and re-raised.
    """
    system_prompt = f"""You are an expert at understanding natural language queries about agricultural and wildlife datasets.
Your task is to convert user queries into structured search criteria.
CRITICAL FIRST STEP: Categorize the query immediately as one of:
- "pest" (insects, diseases, harmful organisms)
- "animal" or "wildlife" (mammals, birds, livestock)
- "plant" (crops, fruits, vegetables, vegetation)
This categorization helps narrow the search space and improves performance. Always set the "category" filter first.
Available filter options:
{json.dumps(available_filters, indent=2) if available_filters else "All filters available"}
Return a JSON response matching this schema:
{json.dumps(self.search_schema, indent=2)}
Examples:
- "bobcat" → {{"intent": "find bobcat images", "entities": ["bobcat"], "filters": {{"species": ["bobcat"]}}, "confidence": 0.9, "reasoning": "Direct species mention"}}
- "bobcat at night" → {{"intent": "find bobcat images at night", "entities": ["bobcat", "night"], "filters": {{"species": ["bobcat"], "time": ["night"]}}, "confidence": 0.95, "reasoning": "Species and time specification"}}
- "coyote looking at the camera" → {{"intent": "find coyote images looking at camera", "entities": ["coyote", "looking at camera"], "filters": {{"species": ["coyote"], "action": ["alert"]}}, "confidence": 0.9, "reasoning": "Species and action specification - looking at camera maps to alert"}}
- "crows" → {{"intent": "find crow images", "entities": ["crow"], "filters": {{"species": ["american_crow"]}}, "confidence": 0.9, "reasoning": "Crow query maps to american_crow species"}}
- "rabbit eating" → {{"intent": "find rabbit images eating", "entities": ["rabbit", "eating"], "filters": {{"species": ["eastern_cottontail"], "action": ["foraging"]}}, "confidence": 0.95, "reasoning": "Rabbit/cottontail synonym: use eastern_cottontail from filters. 'Eating' is action only → foraging"}}
- "white cottontail" → {{"intent": "find white cottontail images", "entities": ["white cottontail"], "filters": {{"species": ["eastern_cottontail"]}}, "confidence": 0.95, "reasoning": "White cottontail maps to eastern_cottontail (same as rabbit) from available filters"}}
- "horse at night" or "horse at night time" → {{"intent": "find horse images at night", "entities": ["horse", "night"], "filters": {{"species": ["horse"], "time": ["night"]}}, "confidence": 0.95, "reasoning": "Species and time; use canonical time 'night' not 'Nighttime'"}}
- "animals in summer forest" → {{"intent": "find wildlife in summer forest environment", "entities": ["animals", "summer", "forest"], "filters": {{"category": ["wildlife"], "season": ["summer"], "scene": ["forest"]}}, "confidence": 0.8, "reasoning": "Combined environmental and seasonal criteria"}}
- "goats in the field" → {{"intent": "find goat images in field environment", "entities": ["goat", "field"], "filters": {{"species": ["goat"], "scene": ["field"]}}, "confidence": 0.9, "reasoning": "Species and scene specification - 'in the field' indicates field scene"}}
- "predators hunting at dawn" → {{"intent": "find hunting predators at dawn", "entities": ["predators", "hunting", "dawn"], "filters": {{"action": ["hunting"], "time": ["dawn"]}}, "confidence": 0.85, "reasoning": "Behavior and time specification"}}
- "raspberry ripe" → {{"intent": "find ripe raspberry images", "entities": ["raspberry", "ripe"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "description_query": "close-up of ripe raspberries on a bush with green leaves", "confidence": 0.95, "reasoning": "Species and ripeness specification - 'ripe' maps to plant_state, not action"}}
- "raspberry red" → {{"intent": "find red/ripe raspberry images", "entities": ["raspberry", "red"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "description_query": "close-up of red ripe raspberries and green foliage", "confidence": 0.9, "reasoning": "Species and color specification - 'red' for berries indicates ripe, maps to plant_state"}}
DESCRIPTION_QUERY (important for ranking):
- Always provide "description_query" when the user describes what they want to see. It should be a short phrase (5-20 words) in the style of image descriptions in our dataset.
- Style: start with "close-up of" or "image of", include the subject (e.g. raspberry, bobcat), key visual traits (ripe, red, green leaves, at night), and setting if relevant.
- Examples: "close-up of ripe raspberries on a bush with green leaves", "bobcat at night in forest", "coyote looking at camera in field".
- This phrase is matched against each image's description field to rank results so the best-matching images appear first.
IMPORTANT:
{LLM_SPECIES_AND_NORMALIZATION_RULES}
- Extract species names EXACTLY as they appear in available filters (e.g., "bobcat", "coyote", "crow", "american_crow", "strawberry", "raspberry", "eastern_cottontail")
- "crow" or "crows" should map to "american_crow" if that's the available species name
- ONLY extract species that are in the available_filters list - if a species is not in available_filters, DO NOT extract it as a species filter
- If the query mentions a species that is NOT in available_filters, return an empty species filter and explain in reasoning that the species is not available
- Map action keywords to canonical action names:
* "feeding", "eating" → "foraging"
* "sleeping", "resting" → "sleeping" or "resting"
* "looking at camera", "looking at the camera", "staring at camera", "facing camera", "looking toward camera" → "alert"
* "walking", "moving" → "walking" or "moving"
- Extract scene keywords from phrases like "in the field" → scene: ["field"], "in forest" → scene: ["forest"], "in garden" → scene: ["garden"]
- Scene keywords: field, forest, water, mountain, garden, farm, meadow, indoor, outdoor
- For PLANT/FRUIT queries (raspberry, strawberry, etc.), map ripeness/color descriptors to plant_state, NOT action:
* "ripe", "red", "mature" → plant_state: ["ripe"] (for fruits/berries)
* "unripe", "green", "immature" → plant_state: ["unripe"] (for fruits/berries)
* "raspberry ripe" → species: ["raspberry"], plant_state: ["ripe"] (NOT action!)
* "raspberry red" → species: ["raspberry"], plant_state: ["ripe"] (red = ripe for berries)
- Only use filters that are explicitly mentioned in the query
- Do NOT infer or add filters that are not mentioned
- Species names must match exactly (case-insensitive) with available filter values
Query: "{query}"
"""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query}
    ]

    # Prepare headers: Azure uses api-key, OpenAI uses Authorization Bearer
    headers = {"Content-Type": "application/json"}
    if self.is_azure:
        headers["api-key"] = self.azure_api_key
        print(f"🧠 Using Azure OpenAI (api-key) authentication")
    elif self.is_project_key:
        headers["Authorization"] = f"Bearer {self.api_key}"
        print(f"🧠 Using project API key authentication")
    else:
        headers["Authorization"] = f"Bearer {self.api_key}"
        print(f"🧠 Using personal API key authentication")

    # Azure uses deployment in URL; OpenAI expects "model" in body. Both accept "messages", etc.
    body = {
        "messages": messages,
        "temperature": 0.1,
        "max_tokens": 500
    }
    if not self.is_azure:
        body["model"] = self.model

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                self.base_url,
                headers=headers,
                json=body,
                timeout=30.0
            )
            print(f"🧠 OpenAI API response status: {response.status_code}")
            if response.status_code != 200:
                print(f"❌ OpenAI API error: {response.status_code}")
                print(f"🧠 Error response: {response.text}")
                raise ValueError(f"OpenAI API error: {response.status_code} - {response.text}")

            result = response.json()
            content = result["choices"][0]["message"]["content"]
            try:
                # Same tolerance as the Gemini path: drop Markdown code fences
                # and any text surrounding the outermost JSON object, so a
                # fenced reply does not fail. Plain JSON is unaffected.
                json_text = content.strip()
                if json_text.startswith("```json"):
                    json_text = json_text[7:]
                if json_text.startswith("```"):
                    json_text = json_text[3:]
                if json_text.endswith("```"):
                    json_text = json_text[:-3]
                json_text = json_text.strip()
                json_start = json_text.find('{')
                json_end = json_text.rfind('}') + 1
                if json_start >= 0 and json_end > json_start:
                    json_text = json_text[json_start:json_end]

                # Parse the JSON response
                parsed = json.loads(json_text)
                parsed.setdefault("description_query", None)
                understanding = QueryUnderstanding(**parsed)
                # Validate that we got real LLM understanding (not empty)
                if not understanding.filters and not understanding.entities:
                    raise ValueError("LLM returned empty understanding")
                return understanding
            except Exception as e:
                print(f"❌ Failed to parse LLM response: {e}")
                print(f"🧠 Raw response content: {content}")
                # Chain the cause so the original parse error stays in the traceback.
                raise ValueError(f"Failed to parse LLM response: {e}") from e
    except Exception as e:
        print(f"❌ OpenAI API request failed: {e}")
        raise
async def _gemini_understanding(self, query: str, available_filters: Optional[Dict[str, List[str]]] = None) -> QueryUnderstanding:
    """Use Google Gemini (SDK, else REST) for query understanding.

    Delegates to ``_gemini_rest_api_understanding`` when the SDK's
    GenerativeModel is unavailable or the model object cannot be
    (re)initialized; otherwise calls the SDK in a worker thread.

    Args:
        query: The user's natural language query.
        available_filters: Filter name -> allowed values, embedded in the prompt;
            when empty or None the prompt says "All filters available".

    Returns:
        The QueryUnderstanding built from the model's JSON reply.

    Raises:
        ValueError: If GOOGLE_API_KEY is not set, the reply is not valid JSON,
            or the parsed understanding has neither filters nor entities.
        Exception: Any other SDK error is logged and re-raised.
    """
    if not self.gemini_api_key:
        raise ValueError("GOOGLE_API_KEY not set")

    # Try using the REST API directly if GenerativeModel is not available
    if not GEMINI_AVAILABLE or not hasattr(genai, 'GenerativeModel'):
        # Fallback to REST API
        return await self._gemini_rest_api_understanding(query, available_filters)

    if not self.gemini_model_obj:
        # Try to reinitialize if model object is None
        print(f"⚠️ Gemini model object is None, attempting to reinitialize...")
        print(f" API key present: {bool(self.gemini_api_key)}")
        print(f" Model name: {self.gemini_model}")
        try:
            if not self.gemini_api_key:
                raise ValueError("GOOGLE_API_KEY not set - cannot reinitialize")
            genai.configure(api_key=self.gemini_api_key)
            self.gemini_model_obj = genai.GenerativeModel(self.gemini_model)
            self.gemini_available = True
            print(f"✅ Gemini model object reinitialized successfully")
        except Exception as e:
            print(f"❌ Failed to reinitialize Gemini model: {e}")
            print(f" Falling back to REST API...")
            # Fallback to REST API
            return await self._gemini_rest_api_understanding(query, available_filters)

    system_prompt = f"""You are an expert at understanding natural language queries about agricultural and wildlife datasets.
Your task is to convert user queries into structured search criteria.
Available filter options:
{json.dumps(available_filters, indent=2) if available_filters else "All filters available"}
Return a JSON response matching this schema:
{json.dumps(self.search_schema, indent=2)}
Examples:
- "bobcat" → {{"intent": "find bobcat images", "entities": ["bobcat"], "filters": {{"species": ["bobcat"]}}, "confidence": 0.9, "reasoning": "Direct species mention"}}
- "bobcat at night" → {{"intent": "find bobcat images at night", "entities": ["bobcat", "night"], "filters": {{"species": ["bobcat"], "time": ["night"]}}, "confidence": 0.95, "reasoning": "Species and time specification"}}
- "coyote looking at the camera" → {{"intent": "find coyote images looking at camera", "entities": ["coyote", "looking at camera"], "filters": {{"species": ["coyote"], "action": ["alert"]}}, "confidence": 0.9, "reasoning": "Species and action specification - looking at camera maps to alert"}}
- "crows" → {{"intent": "find crow images", "entities": ["crow"], "filters": {{"species": ["american_crow"]}}, "confidence": 0.9, "reasoning": "Crow query maps to american_crow species"}}
- "rabbit eating" → {{"intent": "find rabbit images eating", "entities": ["rabbit", "eating"], "filters": {{"species": ["eastern_cottontail"], "action": ["foraging"]}}, "confidence": 0.95, "reasoning": "Rabbit/cottontail synonym: use eastern_cottontail from filters. 'Eating' is action only → foraging"}}
- "white cottontail" → {{"intent": "find white cottontail images", "entities": ["white cottontail"], "filters": {{"species": ["eastern_cottontail"]}}, "confidence": 0.95, "reasoning": "White cottontail maps to eastern_cottontail (same as rabbit) from available filters"}}
- "horse at night" or "horse at night time" → {{"intent": "find horse images at night", "entities": ["horse", "night"], "filters": {{"species": ["horse"], "time": ["night"]}}, "confidence": 0.95, "reasoning": "Species and time; use canonical time 'night' not 'Nighttime'"}}
- "animals in summer forest" → {{"intent": "find wildlife in summer forest environment", "entities": ["animals", "summer", "forest"], "filters": {{"category": ["wildlife"], "season": ["summer"], "scene": ["forest"]}}, "confidence": 0.8, "reasoning": "Combined environmental and seasonal criteria"}}
- "goats in the field" → {{"intent": "find goat images in field environment", "entities": ["goat", "field"], "filters": {{"species": ["goat"], "scene": ["field"]}}, "confidence": 0.9, "reasoning": "Species and scene specification - 'in the field' indicates field scene"}}
- "predators hunting at dawn" → {{"intent": "find hunting predators at dawn", "entities": ["predators", "hunting", "dawn"], "filters": {{"action": ["hunting"], "time": ["dawn"]}}, "confidence": 0.85, "reasoning": "Behavior and time specification"}}
- "raspberry ripe" → {{"intent": "find ripe raspberry images", "entities": ["raspberry", "ripe"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "confidence": 0.95, "reasoning": "Species and ripeness specification - 'ripe' maps to plant_state, not action"}}
- "raspberry red" → {{"intent": "find red/ripe raspberry images", "entities": ["raspberry", "red"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "confidence": 0.9, "reasoning": "Species and color specification - 'red' for berries indicates ripe, maps to plant_state"}}
- "raspberries that can be eaten" → {{"intent": "find edible/ripe raspberry images", "entities": ["raspberry", "edible"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "confidence": 0.95, "reasoning": "Species and edibility specification - 'can be eaten' for fruits/berries means ripe, maps to plant_state"}}
- "edible raspberries" → {{"intent": "find edible/ripe raspberry images", "entities": ["raspberry", "edible"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "confidence": 0.95, "reasoning": "Species and edibility specification - 'edible' for fruits/berries means ripe, maps to plant_state"}}
- "red raspberry" → {{"intent": "find red/ripe raspberry images", "entities": ["raspberry", "red"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "description_query": "close-up of red ripe raspberries and green foliage", "confidence": 0.9, "reasoning": "Species and color specification - 'red' for berries indicates ripe, maps to plant_state"}}
DESCRIPTION_QUERY: Always provide "description_query" when the user describes what they want to see: a short phrase (5-20 words) in the style of image descriptions (e.g. "close-up of ripe raspberries on a bush with green leaves", "bobcat at night in forest"). Used to match and rank results by the image description field.
IMPORTANT:
{LLM_SPECIES_AND_NORMALIZATION_RULES}
- Extract species names EXACTLY as they appear in available filters (e.g., "bobcat", "coyote", "crow", "american_crow", "strawberry", "raspberry", "eastern_cottontail")
- "crow" or "crows" should map to "american_crow" if that's the available species name
- ONLY extract species that are in the available_filters list - if a species is not in available_filters, DO NOT extract it as a species filter
- If the query mentions a species that is NOT in available_filters, return an empty species filter and explain in reasoning that the species is not available
- For PLANT/FRUIT queries (raspberry, strawberry, etc.), map ripeness/color/edibility descriptors to plant_state, NOT action:
* "ripe", "red", "mature" → plant_state: ["ripe"] (for fruits/berries)
* "unripe", "green", "immature" → plant_state: ["unripe"] (for fruits/berries)
* "edible", "can be eaten", "ready to eat", "ready for eating" → plant_state: ["ripe"] (for fruits/berries - edible means ripe)
* "raspberry ripe" → species: ["raspberry"], plant_state: ["ripe"] (NOT action!)
* "raspberry red" → species: ["raspberry"], plant_state: ["ripe"] (red = ripe for berries)
* "raspberries that can be eaten" → species: ["raspberry"], plant_state: ["ripe"] (edible = ripe for fruits)
* "edible raspberries" → species: ["raspberry"], plant_state: ["ripe"] (edible = ripe for fruits)
- For edibility queries about fruits/berries, you MUST infer plant_state: ["ripe"] - this is a semantic mapping, not arbitrary inference
- Only use filters that are explicitly mentioned in the query OR semantically implied (edibility → ripeness for fruits)
- Species names must match exactly (case-insensitive) with available filter values
- Return ONLY valid JSON, no markdown formatting or code blocks
Query: "{query}"
"""
    full_prompt = f"{system_prompt}\n\nUser query: {query}\n\nReturn the JSON response:"

    try:
        # The SDK call is synchronous, so run it in the default executor
        # to avoid blocking the event loop.
        def call_gemini():
            return self.gemini_model_obj.generate_content(
                full_prompt,
                generation_config={
                    "temperature": 0.1,
                    "max_output_tokens": 500,
                }
            )

        # get_running_loop() is the supported way to reach the loop from inside a coroutine.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, call_gemini)
        content = response.text.strip()

        # Remove markdown code blocks if present
        if content.startswith("```json"):
            content = content[7:]
        if content.startswith("```"):
            content = content[3:]
        if content.endswith("```"):
            content = content[:-3]
        content = content.strip()

        # Keep only the outermost JSON object if the reply contains extra text
        json_start = content.find('{')
        json_end = content.rfind('}') + 1
        if json_start >= 0 and json_end > json_start:
            content = content[json_start:json_end]

        # Parse the JSON response
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError as json_err:
            # Common repair: remove trailing commas before closing braces/brackets
            # (uses the module-level `re` import).
            content_fixed = re.sub(r',(\s*[}\]])', r'\1', content)
            try:
                parsed = json.loads(content_fixed)
            except json.JSONDecodeError:
                # Last resort: show more context for debugging, then surface
                # the ORIGINAL decode error to the handler below.
                print(f"🧠 Failed to parse JSON. Content (first 500 chars): {content[:500]}")
                raise json_err

        parsed.setdefault("description_query", None)
        understanding = QueryUnderstanding(**parsed)
        # Validate that we got real LLM understanding
        if not understanding.filters and not understanding.entities:
            raise ValueError("Gemini returned empty understanding")
        print(f"🧠 Gemini understanding successful (confidence: {understanding.confidence})")
        return understanding
    except json.JSONDecodeError as e:
        print(f"❌ Failed to parse Gemini response: {e}")
        print(f"🧠 Raw response content: {content[:200]}...")
        # Chain the cause so the decode error stays in the traceback.
        raise ValueError(f"Failed to parse Gemini response: {e}") from e
    except Exception as e:
        print(f"❌ Gemini API request failed: {e}")
        raise
async def _gemini_rest_api_understanding(self, query: str, available_filters: Dict[str, List[str]] = None) -> QueryUnderstanding:
    """Ask Gemini over its REST endpoint to turn *query* into structured search criteria.

    Talks to the HTTP API directly instead of going through the SDK
    (works with Python 3.8 and old SDK versions).

    Args:
        query: Natural-language search text from the user.
        available_filters: Filter values shown to the model; when falsy the
            prompt states "All filters available" instead.

    Returns:
        The ``QueryUnderstanding`` built from the JSON object in the reply,
        with ``description_query`` defaulted to ``None`` when absent.

    Raises:
        ValueError: no API key is configured, the endpoint answers with a
            non-200 status, the reply carries no candidates/content/text, the
            text cannot be parsed as a JSON object, or the resulting
            understanding has neither filters nor entities.
        Exception: anything else raised while sending the request or building
            the result is logged and re-raised unchanged.
    """
    import re

    if not self.gemini_api_key:
        raise ValueError("Gemini API key is not configured")

    system_prompt = f"""You are an expert at understanding natural language queries about agricultural and wildlife datasets.
Your task is to convert user queries into structured search criteria.
Available filter options:
{json.dumps(available_filters, indent=2) if available_filters else "All filters available"}
Return a JSON response matching this schema:
{json.dumps(self.search_schema, indent=2)}
Examples:
- "bobcat" → {{"intent": "find bobcat images", "entities": ["bobcat"], "filters": {{"species": ["bobcat"]}}, "confidence": 0.9, "reasoning": "Direct species mention"}}
- "bobcat at night" → {{"intent": "find bobcat images at night", "entities": ["bobcat", "night"], "filters": {{"species": ["bobcat"], "time": ["night"]}}, "confidence": 0.95, "reasoning": "Species and time specification"}}
- "pigs feeding" → {{"intent": "find pig images feeding", "entities": ["pig", "feeding"], "filters": {{"species": ["pig"], "action": ["foraging"]}}, "confidence": 0.9, "reasoning": "Species and action specification - feeding maps to foraging"}}
- "coyote looking at the camera" → {{"intent": "find coyote images looking at camera", "entities": ["coyote", "looking at camera"], "filters": {{"species": ["coyote"], "action": ["alert"]}}, "confidence": 0.9, "reasoning": "Species and action specification - looking at camera maps to alert"}}
- "raspberry ripe" → {{"intent": "find ripe raspberry images", "entities": ["raspberry", "ripe"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "confidence": 0.95, "reasoning": "Species and ripeness specification - 'ripe' maps to plant_state, not action"}}
- "raspberry red" → {{"intent": "find red/ripe raspberry images", "entities": ["raspberry", "red"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "confidence": 0.9, "reasoning": "Species and color specification - 'red' for berries indicates ripe, maps to plant_state"}}
- "raspberries that can be eaten" → {{"intent": "find edible/ripe raspberry images", "entities": ["raspberry", "edible"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "confidence": 0.95, "reasoning": "Species and edibility specification - 'can be eaten' for fruits/berries means ripe, maps to plant_state"}}
- "edible raspberries" → {{"intent": "find edible/ripe raspberry images", "entities": ["raspberry", "edible"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "confidence": 0.95, "reasoning": "Species and edibility specification - 'edible' for fruits/berries means ripe, maps to plant_state"}}
- "red raspberry" → {{"intent": "find red/ripe raspberry images", "entities": ["raspberry", "red"], "filters": {{"species": ["raspberry"], "plant_state": ["ripe"]}}, "description_query": "close-up of red ripe raspberries and green foliage", "confidence": 0.9, "reasoning": "Species and color specification - 'red' for berries indicates ripe, maps to plant_state"}}
DESCRIPTION_QUERY: Always provide "description_query" when the user describes what they want to see: a short phrase (5-20 words) in the style of image descriptions (e.g. "close-up of ripe raspberries on a bush with green leaves", "bobcat at night in forest"). Used to match and rank results by the image description field.
IMPORTANT:
{LLM_SPECIES_AND_NORMALIZATION_RULES}
- Extract species names EXACTLY as they appear in available filters (e.g., "bobcat", "coyote", "crow", "american_crow", "strawberry", "raspberry", "eastern_cottontail")
- "crow" or "crows" should map to "american_crow" if that's the available species name
- ONLY extract species that are in the available_filters list - if a species is not in available_filters, DO NOT extract it as a species filter
- If the query mentions a species that is NOT in available_filters, return an empty species filter and explain in reasoning that the species is not available
- Map action keywords to canonical action names:
* "feeding", "eating" → "foraging"
* "sleeping", "resting" → "sleeping" or "resting"
* "looking at camera", "looking at the camera", "staring at camera", "facing camera", "looking toward camera" → "alert"
* "walking", "moving" → "walking" or "moving"
- Extract scene keywords from phrases like "in the field" → scene: ["field"], "in forest" → scene: ["forest"], "in garden" → scene: ["garden"]
- Scene keywords: field, forest, water, mountain, garden, farm, meadow, indoor, outdoor
- For PLANT/FRUIT queries (raspberry, strawberry, etc.), map ripeness/color/edibility descriptors to plant_state, NOT action:
* "ripe", "red", "mature" → plant_state: ["ripe"] (for fruits/berries)
* "unripe", "green", "immature" → plant_state: ["unripe"] (for fruits/berries)
* "edible", "can be eaten", "ready to eat", "ready for eating" → plant_state: ["ripe"] (for fruits/berries - edible means ripe)
* "raspberry ripe" → species: ["raspberry"], plant_state: ["ripe"] (NOT action!)
* "raspberry red" → species: ["raspberry"], plant_state: ["ripe"] (red = ripe for berries)
* "raspberries that can be eaten" → species: ["raspberry"], plant_state: ["ripe"] (edible = ripe for fruits)
* "edible raspberries" → species: ["raspberry"], plant_state: ["ripe"] (edible = ripe for fruits)
- For edibility queries about fruits/berries, you MUST infer plant_state: ["ripe"] - this is a semantic mapping, not arbitrary inference
- Only use filters that are explicitly mentioned in the query OR semantically implied (edibility → ripeness for fruits)
- Species names must match exactly (case-insensitive) with available filter values
- Return ONLY valid JSON, no markdown formatting or code blocks
Query: "{query}"
"""
    full_prompt = f"{system_prompt}\n\nUser query: {query}\n\nReturn the JSON response:"

    # Gemini REST API (v1beta) generateContent endpoint
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.gemini_model}:generateContent"
    # The key travels in a header rather than a "?key=" query parameter so it
    # never becomes part of the request URL (URLs are what HTTP client and
    # proxy logs typically record).
    headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": self.gemini_api_key,
    }
    payload = {
        "contents": [{
            "parts": [{
                "text": full_prompt
            }]
        }],
        "generationConfig": {
            "temperature": 0.1,
            "maxOutputTokens": 500,
        }
    }

    # Matches a comma that directly precedes a closing brace/bracket.
    trailing_comma = re.compile(r',(\s*[}\]])')

    def _close_truncated_json(fragment):
        """Return *fragment* with any still-open string, array or object closed."""
        pending = []  # closers owed for every '{' / '[' not yet matched
        in_string = False
        escaped = False
        for ch in fragment:
            if in_string:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == "{":
                pending.append("}")
            elif ch == "[":
                pending.append("]")
            elif ch in "}]" and pending:
                pending.pop()
        if in_string:
            if escaped:
                fragment = fragment[:-1]  # a lone trailing backslash would escape our closing quote
            fragment += '"'
        else:
            fragment = fragment.rstrip().rstrip(",")
        return fragment + "".join(reversed(pending))

    content = None  # stays None when the failure happens before any text is extracted
    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(url, headers=headers, json=payload, timeout=30.0)

        if response.status_code != 200:
            error_text = response.text
            print(f"❌ Gemini REST API error: {response.status_code}")
            print(f" Response: {error_text}")
            raise ValueError(f"Gemini API error: {response.status_code} - {error_text}")

        result = response.json()

        # Extract text from response
        candidates = result.get("candidates") or []
        if not candidates:
            raise ValueError("No candidates in Gemini response")
        candidate_content = candidates[0].get("content") or {}
        if "parts" not in candidate_content:
            raise ValueError("No content in Gemini response candidate")
        # A reply may be split over several parts; reading only the first one
        # would hand the JSON parser an incomplete document.
        text_chunks = [
            part["text"]
            for part in candidate_content["parts"] or []
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        ]
        if not text_chunks:
            raise ValueError("No text in Gemini response")
        content = "".join(text_chunks).strip()

        # Remove markdown code blocks if present
        if content.startswith("```json"):
            content = content[7:]
        if content.startswith("```"):
            content = content[3:]
        if content.endswith("```"):
            content = content[:-3]
        content = content.strip()

        # Try to extract JSON if response contains extra text:
        # look for JSON object boundaries.
        json_start = content.find('{')
        json_end = content.rfind('}') + 1
        # Keep the text from the first '{' to the very end as well: clipping at
        # the last '}' throws away the unfinished tail of a truncated reply,
        # which the final repair attempt below needs.
        unclipped = content[json_start:] if json_start >= 0 else content
        if json_start >= 0 and json_end > json_start:
            content = content[json_start:json_end]

        # Parse the JSON response
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError as json_err:
            # Try to fix common JSON issues, cheapest first.
            content_fixed = trailing_comma.sub(r'\1', content)
            parsed = None
            try:
                parsed = json.loads(content_fixed)
            except json.JSONDecodeError:
                # Try repairing unterminated string (Gemini sometimes truncates)
                if "Unterminated" in str(json_err) and "string" in str(json_err):
                    last_quote = content_fixed.rfind('"')
                    if last_quote > 0:
                        repair = content_fixed[:last_quote + 1] + '"}'
                        try:
                            parsed = json.loads(repair)
                        except json.JSONDecodeError:
                            pass
            if parsed is None:
                # Last attempt: close whatever is still open in the unclipped
                # text. This covers replies cut off after a nested object,
                # where the clipped text no longer ends inside a string.
                try:
                    parsed = json.loads(
                        trailing_comma.sub(r'\1', _close_truncated_json(unclipped))
                    )
                except json.JSONDecodeError:
                    pass
            if parsed is None:
                print(f"🧠 Failed to parse JSON. Content (first 500 chars): {content[:500]}")
                raise json_err

        if not isinstance(parsed, dict):
            raise ValueError("Gemini response JSON is not an object")

        parsed.setdefault("description_query", None)
        understanding = QueryUnderstanding(**parsed)

        # Validate that we got real LLM understanding
        if not understanding.filters and not understanding.entities:
            raise ValueError("Gemini returned empty understanding")

        print(f"🧠 Gemini REST API understanding successful (confidence: {understanding.confidence})")
        return understanding
    except json.JSONDecodeError as e:
        print(f"❌ Failed to parse Gemini REST API response: {e}")
        print(f"🧠 Raw response content: {content[:200] if content is not None else 'N/A'}...")
        raise ValueError(f"Failed to parse Gemini response: {e}") from e
    except Exception as e:
        print(f"❌ Gemini REST API request failed: {e}")
        import traceback
        traceback.print_exc()
        raise
def _metadata_based_understanding(self, query: str, available_filters: Dict[str, List[str]] = None) -> QueryUnderstanding:
"""Metadata-based query understanding using actual MCP metadata values"""
import re
query_lower = query.lower()
entities = []
filters = {
"category": [],
"species": [],
"time": [],
"season": [],
"action": [],
"scene": [],
"weather": [],
"plant_state": []
}
if not available_filters:
available_filters = {}
# Match query against actual available filter values from MCP metadata
# This ensures we only match what actually exists in the data
# Match species from available filters (MCP uses "species" key)
# Species are extracted from both "species" field and "collection" field
species_to_check = []
if "species" in available_filters:
species_to_check.extend(available_filters["species"])
print(f" 🔍 Available species from filters: {available_filters['species'][:10]}...") # Show first 10
# Also check if collections are available (they often contain species names)
if "collections" in available_filters:
print(f" 🔍 Available collections: {available_filters['collections'][:10]}...") # Show first 10
for coll in available_filters["collections"]:
# Extract base species name from collection (e.g., "bobcat_001" -> "bobcat", "Red_fox" -> "red")
coll_base = coll.split("_")[0].split("-")[0].strip().lower()
# Normalize: remove underscores, convert to lowercase
coll_normalized = coll_base.replace("_", "").replace("-", "").lower()
if coll_normalized not in [s.lower().replace("_", "").replace("-", "") for s in species_to_check]:
species_to_check.append(coll_normalized)
print(f" 🔍 Total species to check: {len(species_to_check)}")
print(f" 🔍 Query: '{query_lower}'")
# Common species names to check even if not in available_filters
# This helps when the species exists in data but wasn't extracted to filters
# Include both singular and plural forms, and variations
# Rabbit, cottontail, and white cottontail all map to eastern_cottontail (never white_cottontail as filter value)
# Pest type words (beetle, butterfly, wasp, etc.) match via common_names so "show me beetles" finds pest images
common_species = [
"bobcat", "bobcats",
"coyote", "coyotes",
"deer",
"fox", "foxes", "red_fox", "redfox", "red_foxes",
"crow", "crows", "american_crow", "american_crows",
"strawberry", "strawberries",
"raspberry", "raspberries",
"chicken", "chickens",
"goat", "goats",
"carrot", "carrots",
"pig", "pigs",
"rabbit", "rabbits", "cottontail", "cottontails", "eastern_cottontail",
"white_cottontail", "white_cottontails", # query trigger only; resolved to eastern_cottontail
"opossum", "opossums", "oppossum", "oppossums", "virginia_opossum",
# Pest types (matched via common_names in MCP data)
"beetle", "beetles", "butterfly", "butterflies", "moth", "moths",
"wasp", "wasps", "bee", "bees", "ant", "ants", "fly", "flies",
"grasshopper", "grasshoppers", "dragonfly", "dragonflies",
"spider", "spiders", "stink bug", "stink bugs", "true bug", "bugs", "insect", "insects",
]
# Add common species to check if they're in the query
# Handle pluralization and variations
query_normalized = query_lower.replace("_", "").replace("-", "")
for common in common_species:
common_normalized = common.replace("_", "").replace("-", "").lower()
# Check if the normalized common species is in the normalized query
# Also check if query contains the base word (e.g., "strawberries" contains "strawberry")
base_word = common_normalized.rstrip('s') # Remove trailing 's' for plural matching
if (common_normalized in query_normalized or
(len(base_word) >= 4 and base_word in query_normalized) or
query_normalized in common_normalized):
# Check if it's not already in species_to_check
already_in_list = any(
s.replace("_", "").replace("-", "").lower() == common_normalized
for s in species_to_check
)
if not already_in_list:
species_to_check.append(common)
print(f" 🔍 Added common species to check: {common} (matched from query)")
# Match species in query - try multiple matching strategies
# Important: We need to match the query to actual filter values in available_filters
species_matched = False
query_normalized = query_lower.replace("_", "").replace("-", "").replace(" ", "")
query_word_count = len(query_lower.split())
# EXACT MATCH FIRST: For single-word queries, if query (or singular form) is exactly a species/dataset name, use it.
# This guarantees "carrots" -> "carrot" (crop) and never "carrot seed moth" (pest), even with 3500+ species in the list.
if "species" in available_filters and query_word_count <= 1:
query_stem = query_lower.strip()
candidates = [query_stem]
if query_stem.endswith("s") and len(query_stem) > 1:
candidates.append(query_stem[:-1]) # carrots -> carrot
for c in candidates:
for s in available_filters["species"]:
if s.lower().strip() == c:
entities.append(s)
filters["species"].append(s)
print(f" ✅ Matched species (exact): '{s}' from query '{query_lower}'")
species_matched = True
break
if species_matched:
break
# Words that are actions, not species - never match these as species (e.g. "eating" in "rabbit eating")
action_words = {
"eating", "feeding", "foraging", "standing", "walking", "running", "sitting", "sleeping",
"resting", "moving", "alert", "hunting", "perching", "flying", "blooming", "fruiting",
"growing", "mature", "stretching", "reaching", "consuming", "lowering", "facing", "engaging",
}
# First, try to match against actual available filter values (unless we already exact-matched)
# Single-word query: prefer SHORT/one-word species. Multi-word: prefer more words/longest first.
if "species" in available_filters and not species_matched:
def sort_key(s):
word_count = s.count('_') + s.count('-') + s.count(' ') + 1
if query_word_count <= 1:
# Prefer fewer words, then shorter: "carrot" before "carrot seed moth"
return (word_count, len(s), s.lower())
# Prefer more words, then longest: "red_leaf" before "red"
return (-word_count, -len(s), s.lower())
sorted_species = sorted(available_filters["species"], key=sort_key)
for filter_species in sorted_species:
filter_species_lower = filter_species.lower().strip()
# Skip filter values that are action verbs (e.g. "eating" in "rabbit eating")
if filter_species_lower in action_words:
continue
filter_species_normalized = filter_species_lower.replace("_", "").replace("-", "").replace(" ", "")
# Replace underscores/hyphens with spaces for word boundary matching
species_words = filter_species_lower.replace("_", " ").replace("-", " ")
# Strategy 1: Handle plurals FIRST with word boundaries (e.g., "raspberries" -> "raspberry")
# This needs to come first so plurals are normalized before simple word matches
base_filter = filter_species_normalized.rstrip('s')
if len(base_filter) >= 4: # Only check plurals for words with 4+ chars (avoid "red" -> "re")
base_species_words = species_words.rstrip('s').rstrip(' ')
if base_species_words and len(base_species_words) >= 4:
# Match plural forms: handle both regular (s) and irregular (ies, es) plurals
# Pattern: match "raspberry", "raspberrys", "raspberries", "raspberryes"
# Use word boundary at start, allow 's', 'ies', 'es' at end, then word boundary
escaped_base = re.escape(base_species_words)
# Match: word boundary + base + (s|ies|es) + word boundary
plural_patterns = [
r'\b' + escaped_base + r'(?:ies|es|s)\b', # Plural forms
r'\b' + escaped_base + r'\b', # Singular form
]
for pattern in plural_patterns:
if re.search(pattern, query_lower):
# Store the canonical (singular) form from the filter
entities.append(filter_species)
filters["species"].append(filter_species)
print(f" ✅ Matched species from available filters (plural): {filter_species} from query '{query_lower}' (pattern: {pattern})")
species_matched = True
break
if species_matched:
break
# Strategy 2: Exact word boundary matching (handles "raspberry" in "red raspberries", "red leaf" → "red_leaf")
# But skip very short words (3 chars or less) if there are longer potential matches
# This prevents "red" from matching in "red raspberries" before "raspberry" is checked
if len(filter_species_normalized) >= 4 or len(sorted_species) == 1:
pattern = r'\b' + re.escape(species_words) + r'\b'
match = re.search(pattern, query_lower)
if match:
# Prefer exact matches: if query matches species_words exactly, use it
# This ensures "red leaf" matches "red_leaf" rather than "red_leaf_lettuce"
matched_text = match.group()
# Check if this is an exact match (all words in species_words are in query)
species_word_list = species_words.split()
if len(species_word_list) == 1 or all(word in query_lower for word in species_word_list):
entities.append(filter_species)
filters["species"].append(filter_species)
print(f" ✅ Matched species from available filters (word boundary): {filter_species} from query '{query_lower}'")
species_matched = True
break
# Strategy 3: For very short species names (3 chars), only match if it's the ONLY word in query
# or if it's a compound like "red_fox" or "red_leaf"
if len(filter_species_normalized) <= 3:
# Only match short names if they're compound (contain underscore) or if query is just that word
if "_" in filter_species or filter_species_lower == query_lower.strip():
pattern = r'\b' + re.escape(species_words) + r'\b'
if re.search(pattern, query_lower):
entities.append(filter_species)
filters["species"].append(filter_species)
print(f" ✅ Matched species from available filters (short/compound): {filter_species} from query '{query_lower}'")
species_matched = True
break
# Strategy 3: Normalized matching (fallback for compound names like "red_fox")
# Only use if the species is longer than 4 chars to avoid matching "red" in "red raspberries"
# When query is single-word, do NOT match when query is only a substring of a multi-word species
# (e.g. "carrots" -> "carrot" crop, not "carrot seed moth" pest)
if len(filter_species_normalized) >= 5:
filter_word_count = filter_species.count('_') + filter_species.count('-') + filter_species.count(' ') + 1
query_is_single = query_word_count <= 1
if filter_species_normalized in query_normalized:
pass # query contains full species, ok
elif query_normalized in filter_species_normalized:
if query_is_single and filter_word_count > 1:
continue # skip: single-word query must not match multi-word species as substring
else:
continue
entities.append(filter_species)
filters["species"].append(filter_species)
print(f" ✅ Matched species from available filters (normalized): {filter_species} from query")
species_matched = True
break
# If no match in available filters, try common species list