-
Notifications
You must be signed in to change notification settings - Fork 3.1k
Add graphrag-storage. #2127
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: v3/main
Are you sure you want to change the base?
Add graphrag-storage. #2127
Changes from all commits
0e19173
7e41278
4b7e5bc
9b05924
bb7e367
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| # GraphRAG Storage | ||
|
|
||
| ## Basic | ||
|
|
||
| ```python | ||
| import asyncio | ||
| from graphrag_storage import StorageConfig, create_storage | ||
| from graphrag_storage.file_storage import FileStorage | ||
|
|
||
| async def run(): | ||
| storage = create_storage( | ||
| StorageConfig( | ||
| type="FileStorage", # or FileStorage.__name__ | ||
| base_dir="output" | ||
| ) | ||
| ) | ||
|
|
||
| await storage.set("my_key", "value") | ||
| print(await storage.get("my_key")) | ||
|
|
||
| if __name__ == "__main__": | ||
| asyncio.run(run()) | ||
| ``` | ||
|
|
||
| ## Custom Storage | ||
|
|
||
| ```python | ||
| import asyncio | ||
| from typing import Any | ||
| from graphrag_storage import Storage, StorageConfig, create_storage, register_storage | ||
|
|
||
| class MyStorage(Storage): | ||
| def __init__(self, some_setting: str, **kwargs: Any): | ||
| # Validate settings and initialize | ||
| ... | ||
|
|
||
| #Implement rest of interface | ||
| ... | ||
|
|
||
| register_storage("MyStorage", MyStorage) | ||
|
|
||
| async def run(): | ||
| storage = create_storage( | ||
| StorageConfig( | ||
| type="MyStorage" | ||
| some_setting="My Setting" | ||
| ) | ||
| ) | ||
| # Or use the factory directly to instantiate with a dict instead of using | ||
| # StorageConfig + create_factory | ||
| # from graphrag_storage.storage_factory import storage_factory | ||
| # storage = storage_factory.create(strategy="MyStorage", init_args={"some_setting": "My Setting"}) | ||
|
|
||
| await storage.set("my_key", "value") | ||
| print(await storage.get("my_key")) | ||
|
|
||
| if __name__ == "__main__": | ||
| asyncio.run(run()) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| # Copyright (c) 2024 Microsoft Corporation. | ||
| # Licensed under the MIT License | ||
|
|
||
| """The GraphRAG Storage package.""" | ||
|
|
||
| from graphrag_storage.storage import Storage | ||
| from graphrag_storage.storage_config import StorageConfig | ||
| from graphrag_storage.storage_factory import create_storage, register_storage | ||
|
|
||
| __all__ = [ | ||
| "Storage", | ||
| "StorageConfig", | ||
| "create_storage", | ||
| "register_storage", | ||
| ] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,7 @@ | ||
| # Copyright (c) 2024 Microsoft Corporation. | ||
| # Licensed under the MIT License | ||
|
|
||
| """Azure Blob Storage implementation of PipelineStorage.""" | ||
| """Azure Blob Storage implementation of Storage.""" | ||
|
|
||
| import logging | ||
| import re | ||
|
|
@@ -12,53 +12,65 @@ | |
| from azure.identity import DefaultAzureCredential | ||
| from azure.storage.blob import BlobServiceClient | ||
|
|
||
| from graphrag.storage.pipeline_storage import ( | ||
| PipelineStorage, | ||
| from graphrag_storage.storage import ( | ||
| Storage, | ||
| get_timestamp_formatted_with_local_tz, | ||
| ) | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| class BlobPipelineStorage(PipelineStorage): | ||
| class AzureBlobStorage(Storage): | ||
| """The Blob-Storage implementation.""" | ||
|
|
||
| _connection_string: str | None | ||
| _container_name: str | ||
| _base_dir: str | None | ||
| _encoding: str | ||
| _storage_account_blob_url: str | None | ||
| _blob_service_client: BlobServiceClient | ||
| _storage_account_name: str | None | ||
|
|
||
| def __init__(self, **kwargs: Any) -> None: | ||
| def __init__( | ||
| self, | ||
| base_dir: str | None = None, | ||
| connection_string: str | None = None, | ||
| storage_account_blob_url: str | None = None, | ||
| container_name: str | None = None, | ||
| encoding: str = "utf-8", | ||
| **kwargs: Any, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Presumably the kwargs in these constructors are just to soak additional unknown args from the factory? My preference is definitely to avoid kwargs.get and instead explicitly name it – would it therefore make sense register lambdas with the factory that pluck exact args and perform this soaking function? It's a bit more code, but could set a reasonable pattern of explicit mapping from the factory init call to instances? |
||
| ) -> None: | ||
| """Create a new BlobStorage instance.""" | ||
| connection_string = kwargs.get("connection_string") | ||
| storage_account_blob_url = kwargs.get("storage_account_blob_url") | ||
| base_dir = kwargs.get("base_dir") | ||
| container_name = kwargs["container_name"] | ||
| if container_name is None: | ||
| msg = "No container name provided for blob storage." | ||
| raise ValueError(msg) | ||
| if connection_string is None and storage_account_blob_url is None: | ||
| msg = "No storage account blob url provided for blob storage." | ||
| msg = "AzureBlobStorage requires either a connection_string or storage_account_blob_url to be specified." | ||
| logger.error(msg) | ||
| raise ValueError(msg) | ||
|
|
||
| if connection_string is not None and storage_account_blob_url is not None: | ||
| msg = "AzureBlobStorage requires only one of connection_string or storage_account_blob_url to be specified, not both." | ||
| logger.error(msg) | ||
| raise ValueError(msg) | ||
|
|
||
| if container_name is None: | ||
| msg = "AzureBlobStorage requires a container_name to be specified." | ||
| logger.error(msg) | ||
| raise ValueError(msg) | ||
|
|
||
| _validate_blob_container_name(container_name) | ||
|
|
||
| logger.info( | ||
| "Creating blob storage at [%s] and base_dir [%s]", container_name, base_dir | ||
| ) | ||
| if connection_string: | ||
| self._blob_service_client = BlobServiceClient.from_connection_string( | ||
| connection_string | ||
| ) | ||
| else: | ||
| if storage_account_blob_url is None: | ||
| msg = "Either connection_string or storage_account_blob_url must be provided." | ||
| raise ValueError(msg) | ||
|
|
||
| elif storage_account_blob_url: | ||
| self._blob_service_client = BlobServiceClient( | ||
| account_url=storage_account_blob_url, | ||
| credential=DefaultAzureCredential(), | ||
| ) | ||
| self._encoding = kwargs.get("encoding", "utf-8") | ||
| self._encoding = encoding | ||
| self._container_name = container_name | ||
| self._connection_string = connection_string | ||
| self._base_dir = base_dir | ||
|
|
@@ -208,12 +220,12 @@ async def delete(self, key: str) -> None: | |
| async def clear(self) -> None: | ||
| """Clear the cache.""" | ||
|
|
||
| def child(self, name: str | None) -> "PipelineStorage": | ||
| def child(self, name: str | None) -> "Storage": | ||
| """Create a child storage instance.""" | ||
| if name is None: | ||
| return self | ||
| path = str(Path(self._base_dir) / name) if self._base_dir else name | ||
| return BlobPipelineStorage( | ||
| return AzureBlobStorage( | ||
| connection_string=self._connection_string, | ||
| container_name=self._container_name, | ||
| encoding=self._encoding, | ||
|
|
@@ -245,7 +257,7 @@ async def get_creation_date(self, key: str) -> str: | |
| return "" | ||
|
|
||
|
|
||
| def validate_blob_container_name(container_name: str): | ||
| def _validate_blob_container_name(container_name: str) -> None: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could all of these checks be collapsed into a single regex? |
||
| """ | ||
| Check if the provided blob container name is valid based on Azure rules. | ||
|
|
||
|
|
@@ -267,32 +279,25 @@ def validate_blob_container_name(container_name: str): | |
| """ | ||
| # Check the length of the name | ||
| if len(container_name) < 3 or len(container_name) > 63: | ||
| return ValueError( | ||
| f"Container name must be between 3 and 63 characters in length. Name provided was {len(container_name)} characters long." | ||
| ) | ||
| msg = f"Container name must be between 3 and 63 characters in length. Name provided was {len(container_name)} characters long." | ||
| raise ValueError(msg) | ||
|
|
||
| # Check if the name starts with a letter or number | ||
| if not container_name[0].isalnum(): | ||
| return ValueError( | ||
| f"Container name must start with a letter or number. Starting character was {container_name[0]}." | ||
| ) | ||
| msg = f"Container name must start with a letter or number. Starting character was {container_name[0]}." | ||
| raise ValueError(msg) | ||
|
|
||
| # Check for valid characters (letters, numbers, hyphen) and lowercase letters | ||
| if not re.match(r"^[a-z0-9-]+$", container_name): | ||
| return ValueError( | ||
| f"Container name must only contain:\n- lowercase letters\n- numbers\n- or hyphens\nName provided was {container_name}." | ||
| ) | ||
| msg = f"Container name must only contain:\n- lowercase letters\n- numbers\n- or hyphens\nName provided was {container_name}." | ||
| raise ValueError(msg) | ||
|
|
||
| # Check for consecutive hyphens | ||
| if "--" in container_name: | ||
| return ValueError( | ||
| f"Container name cannot contain consecutive hyphens. Name provided was {container_name}." | ||
| ) | ||
| msg = f"Container name cannot contain consecutive hyphens. Name provided was {container_name}." | ||
| raise ValueError(msg) | ||
|
|
||
| # Check for hyphens at the end of the name | ||
| if container_name[-1] == "-": | ||
| return ValueError( | ||
| f"Container name cannot end with a hyphen. Name provided was {container_name}." | ||
| ) | ||
|
|
||
| return True | ||
| msg = f"Container name cannot end with a hyphen. Name provided was {container_name}." | ||
| raise ValueError(msg) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm iffy on this change. It breaks with current convention, and feels "off" for how you would put values in yaml. I understand the intent from factory registration perspective of course. Maybe worth tagging a couple of other folks to see if there is consensus one way or the other