diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..de4b714 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +__pycache__ +venv +package +build diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 0000000..4a7e014 --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,56 @@ +name: Run Pytest + +on: + pull_request: + branches: + - master + +jobs: + test: + runs-on: ubuntu-latest + env: + ENV: dev + LOGLEVEL: DEBUG + WANDB_DISABLED: true + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + OPENAI_ORGANIZATION: ${{ secrets.OPENAI_ORGANIZATION }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + OPENAI_API_TYPE: open_ai + OPENAI_API_BASE_URL: https://api.openai.com/v1 + OPENAI_API_VERSION: ${{ secrets.OPENAI_API_VERSION }} + HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }} + PALM_KEY: ${{ secrets.PALM_KEY }} + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: "3.x" + + - name: Install Docker Compose + run: sudo apt-get install docker-compose + + - name: Start services + run: docker-compose up -d + + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install coverage + pip install pytest + + - name: Run tests with coverage + run: | + coverage run -m pytest -vv --log-cli-level=ERROR ./tests/ + + - name: Generate coverage report + run: coverage report + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v2 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 42b4c10..910089a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,6 +5,19 @@ repos: - id: black args: [--line-length=120] + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.1.12 + hooks: + - id: insert-license + name: 
"Insert license header in C++ source files" + args: + [ + --license-filepath=assets/header.txt, + "--comment-style=#", + --detect-license-in-X-top-lines=2, + ] + types_or: [python, makefile, dockerfile] + - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.1.0 hooks: diff --git a/.pypirc b/.pypirc new file mode 100644 index 0000000..9bf6fd7 --- /dev/null +++ b/.pypirc @@ -0,0 +1,14 @@ +[distutils] +index-servers = + pypi + local + +[pypi] + username = __token__ + password = pypi-AgEIcHlwaS5vcmcCJDBjMjQ0MWMyLWZlNjYtNGJkYS1iMTQyLTUwYTVhODM1NTkyZAACKlszLCIwOGU4ZGFjYS1jZTJlLTQzNGYtYTFkMi03ZGRlNDBmZmJmZTgiXQAABiDllKewzbF_OAnOrY1yuMdEG6yTLvrIVJrma5SNz0cgRA + + +[local] +repository: https://pypi.setu.co/ +username: infra +password: c2RramN2Ymd3ZHljdmFzcXEK diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..6b66220 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,38 @@ +FROM nvidia/cuda:12.2.0-devel-ubuntu20.04 AS base + +WORKDIR /app + +ENV DEBIAN_FRONTEND=noninteractive +RUN useradd --create-home genius + +RUN apt-get update \ + && apt-get install -y software-properties-common build-essential curl wget vim git libpq-dev pkg-config \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update \ + && apt-get install -y python3.10 python3.10-dev python3.10-distutils \ + && apt-get clean +RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py \ + && python3.10 get-pip.py + +RUN apt-get update && apt-get install -y git && apt-get clean + +RUN pip install --no-cache-dir torch +RUN pip install --no-cache-dir jupyterlab +RUN pip install --no-cache-dir transformers +RUN pip install --no-cache-dir datasets +RUN pip install --no-cache-dir diffusers +RUN pip install --no-cache-dir --upgrade geniusrise + +ENV AWS_DEFAULT_REGION=ap-south-1 +ENV AWS_SECRET_ACCESS_KEY= +ENV AWS_ACCESS_KEY_ID= +ENV HUGGINGFACE_ACCESS_TOKEN= +ENV GENIUS=/home/genius/.local/bin/genius + +COPY --chown=genius:genius . 
/app/ + +RUN pip3.10 install --no-cache-dir --use-deprecated=legacy-resolver -r requirements.txt +RUN pip install --no-cache-dir numpy==1.26.3 +USER genius + +CMD ["genius", "--help"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6ca24c7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,51 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
+ +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and +You must cause any modified files to carry prominent notices stating that You changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative 
Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS diff --git a/README.md b/README.md index d6ae9e7..4a04d30 100644 --- a/README.md +++ b/README.md @@ -1,78 +1,52 @@ -![banner](./assets/banner.jpg) - - - - -# Huggingface Bolts - -This is a collection of generic streaming and (micro) batch bolts interfacing -with the huggingface ecosystem. - -**Table of Contents** - -- [Huggingface Bolts](#huggingface-bolts) - - [Usage](#usage) - - [Usage](#usage-1) - - [Text Classification](#text-classification) - - - -Includes: - -| No. | Name | Description | Input Type | Output Type | -| --- | ----------------------------------------------------- | ---------------------------------------------- | ---------- | ----------- | -| 1 | [Text Classification](#text-classification) | Fine-tuning for text classification tasks | Batch | Batch | -| 2 | [Instruction Tuning](#instruction-tuning) | Fine-tuning for instruction tuning tasks | Batch | Batch | -| 3 | [Commonsense Reasoning](#commonsense-reasoning) | Fine-tuning for commonsense reasoning tasks | Batch | Batch | -| 4 | [Language Modeling](#language-modeling) | Fine-tuning for language modeling tasks | Batch | Batch | -| 5 | [Named Entity Recognition](#named-entity-recognition) | Fine-tuning for named entity recognition tasks | Batch | Batch | -| 6 | [Question Answering](#question-answering) | Fine-tuning for question answering tasks | Batch | Batch | -| 7 | [Sentiment Analysis](#sentiment-analysis) | Fine-tuning for sentiment analysis tasks | Batch | Batch | -| 8 | [Summarization](#summarization) | Fine-tuning for summarization tasks | Batch | Batch | -| 9 | [Translation](#translation) | Fine-tuning for translation tasks | Batch | Batch | - -## Usage - -To test, first bring up all related services via the supplied docker-compose: - -```bash -docker compose up -d -docker compose logs -f -``` - -These management consoles will be available: - -| Console | Link | -| -------- | ---------------------- | -| Kafka UI | http://localhost:8088/ | - -Postgres can 
be accessed with: - -```bash -docker exec -it geniusrise-postgres-1 psql -U postgres -``` - -## Usage - -### Text Classification - -To fine-tune a model for text classification tasks, you can use the following -command: - -```bash -genius HuggingFaceClassificationFineTuner rise \ - batch \ - --input_folder my_dataset \ - streaming \ - --output_kafka_topic my_topic \ - --output_kafka_cluster_connection_string localhost:9094 \ - postgres \ - --postgres_host 127.0.0.1 \ - --postgres_port 5432 \ - --postgres_user postgres \ - --postgres_password postgres \ - --postgres_database geniusrise \ - --postgres_table state \ - load_dataset \ - --args -``` +![logo_with_text](https://github.com/geniusrise/.github/assets/144122/2f8e51ee-0fcd-4f74-90fd-97301ef7943d) + +### AI Ecosystem + +

+ Documentation + || + Examples + || + Cloud +

+ +### About + +Geniusrise is a modular, loosely-coupled +MLOps framework designed for the era of Large Language Models, +offering flexibility and standardization in designing networks of +AI agents. + +It defines components and orchestrates them providing observability, state management and data handling, +all while supporting diverse infrastructures. With its modular and unopinionated architecture, +Geniusrise empowers teams to build, share, +and deploy AI across various platforms. + +Geniusrise is powered by its components: + +- [geniusrise-text](https://github.com/geniusrise/geniusrise-text): Text components offering: + - Inference APIs + - Bulk inference + - Fine-tuning +- [geniusrise-audio](https://github.com/geniusrise/geniusrise-audio): Audio components offering: + - Inference APIs + - Bulk inference + - Fine-tuning +- [geniusrise-vision](https://github.com/geniusrise/geniusrise-vision): Vision components offering: + - Inference APIs + - Bulk inference + - Fine-tuning +- [geniusrise-listeners](https://github.com/geniusrise/geniusrise-listeners): Streaming data ingestion +- [geniusrise-databases](https://github.com/geniusrise/geniusrise-databases): Bulk data ingestion + +### Links + +- **Website**: [geniusrise.ai](https://geniusrise.ai) +- **Docs**: [docs.geniusrise.ai](https://docs.geniusrise.ai) +- **Examples**: [geniusrise/examples](https://github.com/geniusrise/examples) +- **Cloud**: [geniusrise.com](https://geniusrise.com) + +# Text Components + +These are text components, mainly focused around models of the text modality (both input and output). +This also includes large language models. diff --git a/assets/header.txt b/assets/header.txt new file mode 100644 index 0000000..3895d2e --- /dev/null +++ b/assets/header.txt @@ -0,0 +1,14 @@ +🧠 Geniusrise +Copyright (C) 2023 geniusrise.ai + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/assets/logo_with_text.png b/assets/logo_with_text.png new file mode 100644 index 0000000..e1c890d Binary files /dev/null and b/assets/logo_with_text.png differ diff --git a/data/chat/chat.csv b/data/chat/chat.csv new file mode 100644 index 0000000..96266f1 --- /dev/null +++ b/data/chat/chat.csv @@ -0,0 +1,3 @@ +instruction +<|system|>\n\n<|user|>\nHow do I sort a list in Haskell?\n<|assistant|> +<|system|>\n\n<|user|>\nHow do I sort a list in Python?\n<|assistant|> diff --git a/data/lm/lm.csv b/data/lm/lm.csv new file mode 100644 index 0000000..01825e5 --- /dev/null +++ b/data/lm/lm.csv @@ -0,0 +1,2 @@ +text +<|system|>\n\n<|user|>\nHow do I sort a list in Python?\n<|assistant|> diff --git a/data/ner/ner.csv b/data/ner/ner.csv new file mode 100644 index 0000000..f101da3 --- /dev/null +++ b/data/ner/ner.csv @@ -0,0 +1,4 @@ +text +"My name is Clara and I live in Berkeley, California." +"My name is Clara and I live in Berkeley, California and has sever back pain and liver cirrhosis." +"My name is Clara and I live in Berkeley, California and i deal with sulfuric acid." diff --git a/data/nli/nli.csv b/data/nli/nli.csv new file mode 100644 index 0000000..bfbef40 --- /dev/null +++ b/data/nli/nli.csv @@ -0,0 +1,4 @@ +premise,hypothesis +"This a very good entry level smartphone, battery last 2-3 days after fully charged when connected to the internet. No memory lag issue when playing simple hidden object games. 
Performance is beyond my expectation, i bought it with a good bargain, couldnt ask for more!","the phone has an awesome battery life" +"This a very good entry level smartphone, battery last 2-3 days after fully charged when connected to the internet. No memory lag issue when playing simple hidden object games. Performance is beyond my expectation, i bought it with a good bargain, couldnt ask for more!","the phone has an awesome battery life" +"This a very good entry level smartphone, battery last 2-3 days after fully charged when connected to the internet. No memory lag issue when playing simple hidden object games. Performance is beyond my expectation, i bought it with a good bargain, couldnt ask for more!","the phone has an awesome battery life" diff --git a/data/qa-table/qa-table.csv b/data/qa-table/qa-table.csv new file mode 100644 index 0000000..32e0a80 --- /dev/null +++ b/data/qa-table/qa-table.csv @@ -0,0 +1,3 @@ +"data","question" +"[{""Product"": ""Apple"", ""Price"": ""1""},{""Product"": ""Banana"", ""Price"": ""0.5""},{""Product"": ""Orange"", ""Price"": ""1.2""}]","What is the total price of all products?" +"[{""Product"": ""Apple"", ""Price"": ""1""},{""Product"": ""Banana"", ""Price"": ""0.5""},{""Product"": ""Orange"", ""Price"": ""1.2""}]","What is the total price of all products?" diff --git a/data/qa-traditional/qa-traditional.csv b/data/qa-traditional/qa-traditional.csv new file mode 100644 index 0000000..e44ae4b --- /dev/null +++ b/data/qa-traditional/qa-traditional.csv @@ -0,0 +1,3 @@ +data,question +"Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. 
Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me.","What is the common wisdom about RNNs?" +"Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me.","What is the common wisdom about RNNs?" diff --git a/data/summz/summz.csv b/data/summz/summz.csv new file mode 100644 index 0000000..cc02d96 --- /dev/null +++ b/data/summz/summz.csv @@ -0,0 +1,4 @@ +text +"Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. 
Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me." +"Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me." +"Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. 
Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me." diff --git a/data/trans/trans.csv b/data/trans/trans.csv new file mode 100644 index 0000000..8e1716b --- /dev/null +++ b/data/trans/trans.csv @@ -0,0 +1,4 @@ +hi_IN +"संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है" +"संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है" +"संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है" diff --git a/data/txtclass/txtclass.csv b/data/txtclass/txtclass.csv new file mode 100644 index 0000000..cf2abbc --- /dev/null +++ b/data/txtclass/txtclass.csv @@ -0,0 +1,2 @@ +text +"although gabriels quarterly results exceeded expectations, few were willing to buy because of lack of liquidity" diff --git a/geniusrise_text/__init__.py b/geniusrise_text/__init__.py new file mode 100644 index 0000000..cfd6aa0 --- /dev/null +++ b/geniusrise_text/__init__.py @@ -0,0 +1,25 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .classification import TextClassificationAPI, TextClassificationBulk, TextClassificationFineTuner +from .embeddings import EmbeddingsAPI, EmbeddingsBulk +from .instruction import InstructionAPI, InstructionBulk, InstructionFineTuner +from .language_model import LanguageModelAPI, LanguageModelBulk, LanguageModelFineTuner +from .ner import NamedEntityRecognitionAPI, NamedEntityRecognitionBulk, NamedEntityRecognitionFineTuner +from .nli import NLIAPI, NLIBulk, NLIFineTuner +from .qa import QAAPI, QABulk, QAFineTuner +from .summarization import SummarizationAPI, SummarizationBulk, SummarizationFineTuner +from .translation import TranslationAPI, TranslationBulk, TranslationFineTuner +from .notebook import TextJupyterNotebook diff --git a/geniusrise_text/base/__init__.py b/geniusrise_text/base/__init__.py new file mode 100644 index 0000000..4e1e036 --- /dev/null +++ b/geniusrise_text/base/__init__.py @@ -0,0 +1,18 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from .api import TextAPI
+from .bulk import TextBulk
+from .fine_tune import TextFineTuner
diff --git a/geniusrise_text/base/api.py b/geniusrise_text/base/api.py
new file mode 100644
index 0000000..73c9225
--- /dev/null
+++ b/geniusrise_text/base/api.py
@@ -0,0 +1,514 @@
+# 🧠 Geniusrise
+# Copyright (C) 2023 geniusrise.ai
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import hmac
+import json
+import threading
+from typing import Any, Dict, Optional, Union, List
+
+import llama_cpp
+import cherrypy
+from geniusrise import BatchInput, BatchOutput, State
+from geniusrise.logging import setup_logger
+
+from .bulk import TextBulk
+
+# Global lock used to serve requests one at a time when concurrent queries are disabled
+sequential_lock = threading.Lock()
+
+
+class TextAPI(TextBulk):
+    """
+    A CherryPy-served HTTP API for generating text using a pre-trained language model.
+
+    Attributes:
+        model (Any): The pre-trained language model.
+        tokenizer (Any): The tokenizer used to preprocess input text.
+        model_name (str): The name of the pre-trained language model.
+        model_revision (Optional[str]): The revision of the pre-trained language model.
+        tokenizer_name (str): The name of the tokenizer used to preprocess input text.
+        tokenizer_revision (Optional[str]): The revision of the tokenizer used to preprocess input text.
+        model_class (str): The name of the class of the pre-trained language model.
+        tokenizer_class (str): The name of the class of the tokenizer used to preprocess input text.
+        use_cuda (bool): Whether to use a GPU for inference.
+        quantization (int): The level of quantization to use for the pre-trained language model.
+        precision (str): The precision to use for the pre-trained language model.
+        device_map (str | Dict | None): The mapping of devices to use for inference.
+        max_memory (Dict[int, str]): The maximum memory to use for inference.
+        torchscript (bool): Whether to use a TorchScript-optimized version of the pre-trained language model.
+        model_args (Any): Additional arguments to pass to the pre-trained language model.
+
+    Methods:
+        text(**kwargs: Any) -> Dict[str, Any]:
+            Generates text based on the prompt and generation parameters in the JSON request body.
+
+        listen(model_name: str, ..., **model_args: Any) -> None:
+            Loads the model and starts a CherryPy server to listen for requests to generate text.
+    """
+
+    model: Any
+    tokenizer: Any
+
+    def __init__(
+        self,
+        input: BatchInput,
+        output: BatchOutput,
+        state: State,
+    ):
+        """
+        Initializes a new instance of the TextAPI class.
+
+        Args:
+            input (BatchInput): The input data to process.
+            output (BatchOutput): The output data to process.
+            state (State): The state of the API.
+        """
+        super().__init__(input=input, output=output, state=state)
+        self.log = setup_logger(self)
+
+    @cherrypy.expose
+    @cherrypy.tools.json_in()
+    @cherrypy.tools.json_out()
+    @cherrypy.tools.allow(methods=["POST"])
+    def text(self, **kwargs: Any) -> Dict[str, Any]:
+        """
+        Generates text based on the prompt and generation parameters in the JSON request body.
+
+        The body must be a JSON object with a "prompt" key. "decoding_strategy" is optional and defaults
+        to "generate". Every other key (e.g. max_new_tokens, temperature, num_beams) is forwarded to
+        `generate` as a generation parameter.
+
+        Args:
+            **kwargs (Any): Not used; the request is read from the JSON body.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the prompt, arguments, and generated text.
+
+        Raises:
+            cherrypy.HTTPError: 400 if the body is not a JSON object or does not contain a prompt.
+        """
+        data = cherrypy.request.json
+        if not isinstance(data, dict):
+            raise cherrypy.HTTPError(400, "Request body must be a JSON object")
+
+        prompt = data.get("prompt")
+        if prompt is None:
+            raise cherrypy.HTTPError(400, 'Request body must contain a "prompt"')
+        decoding_strategy = data.get("decoding_strategy", "generate")
+
+        # The parsed JSON body is a plain dict, which has no `__dict__`, so the extra arguments are read
+        # from its items. Each key is passed to `generate` exactly once (no duplicate keyword arguments),
+        # and keys the client did not send are not forwarded as None.
+        generation_params = {
+            key: value for key, value in data.items() if key not in ("prompt", "decoding_strategy")
+        }
+
+        return {
+            "prompt": prompt,
+            "args": generation_params,
+            "completion": self.generate(
+                prompt=prompt,
+                decoding_strategy=decoding_strategy,
+                **generation_params,
+            ),
+        }
+
+    def validate_password(self, realm, username, password):
+        """
+        Validate the username and password against expected values.
+
+        Args:
+            realm (str): The authentication realm.
+            username (str): The provided username.
+            password (str): The provided password.
+
+        Returns:
+            bool: True if credentials are valid, False otherwise.
+        """
+        if self.username is None or self.password is None:
+            # No credentials configured: nothing can match.
+            return False
+
+        # Constant-time comparison so response timing does not reveal how much of a credential matched.
+        # Both checks always run; values are compared as UTF-8 bytes because compare_digest rejects
+        # non-ASCII str arguments.
+        username_ok = hmac.compare_digest(str(username).encode("utf-8"), str(self.username).encode("utf-8"))
+        password_ok = hmac.compare_digest(str(password).encode("utf-8"), str(self.password).encode("utf-8"))
+        return username_ok and password_ok
+
+    def listen(
+        self,
+        model_name: str,
+        # Huggingface params
+        model_class: str = "AutoModelForCausalLM",
+        tokenizer_class: str = "AutoTokenizer",
+        use_cuda: bool = False,
+        precision: str = "float16",
+        quantization: int = 0,
+        device_map: str | Dict | None = "auto",
+        max_memory={0: "24GB"},
+        torchscript: bool = False,
+        compile: bool = False,
+        awq_enabled: bool = False,
+        flash_attention: bool = False,
+        concurrent_queries: bool = False,
+        use_vllm: bool = False,
+        use_llama_cpp: bool = False,
+        # VLLM params
+        vllm_tokenizer_mode: str = "auto",
+        vllm_download_dir: Optional[str] = None,
+        vllm_load_format: str = "auto",
+        vllm_seed: int = 42,
+        vllm_max_model_len: int = 1024,
+        vllm_enforce_eager: bool = False,
+        vllm_max_context_len_to_capture: int = 8192,
+        vllm_block_size: int = 16,
+        vllm_gpu_memory_utilization: float = 0.90,
+        vllm_swap_space: int = 4,
+        vllm_sliding_window: Optional[int] = None,
+        vllm_pipeline_parallel_size: int = 1,
+        vllm_tensor_parallel_size: int = 1,
+        vllm_worker_use_ray: bool = False,
+        vllm_max_parallel_loading_workers: Optional[int] = None,
+        vllm_disable_custom_all_reduce: bool = False,
+        vllm_max_num_batched_tokens: Optional[int] = None,
+        vllm_max_num_seqs: int = 64,
+        vllm_max_paddings: int = 512,
+        vllm_max_lora_rank: Optional[int] = None,
+        vllm_max_loras: Optional[int] = None,
+        vllm_max_cpu_loras: Optional[int] = None,
+        vllm_lora_extra_vocab_size: int = 0,
+        vllm_placement_group: Optional[dict] = None,
+        vllm_log_stats: bool = False,
+        # llama.cpp params
+        llama_cpp_filename: Optional[str] = None,
+        llama_cpp_n_gpu_layers: int = 0,
+        llama_cpp_split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
+        llama_cpp_tensor_split: Optional[List[float]] = None,
+        llama_cpp_vocab_only: bool = False,
+        llama_cpp_use_mmap: bool = True,
+        llama_cpp_use_mlock: bool = False,
+        llama_cpp_kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None,
+        llama_cpp_seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
+        llama_cpp_n_ctx: int = 2048,
+        llama_cpp_n_batch: int = 512,
+        llama_cpp_n_threads: Optional[int] = None,
+        llama_cpp_n_threads_batch: Optional[int] = None,
+        llama_cpp_rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED,
+        llama_cpp_rope_freq_base: float = 0.0,
+        llama_cpp_rope_freq_scale: float = 0.0,
+        llama_cpp_yarn_ext_factor: float = -1.0,
+        llama_cpp_yarn_attn_factor: float = 1.0,
+        llama_cpp_yarn_beta_fast: float = 32.0,
+        llama_cpp_yarn_beta_slow: float = 1.0,
+        llama_cpp_yarn_orig_ctx: int = 0,
+        llama_cpp_mul_mat_q: bool = True,
+        llama_cpp_logits_all: bool = False,
+        llama_cpp_embedding: bool = False,
+        llama_cpp_offload_kqv: bool = True,
+        llama_cpp_last_n_tokens_size: int = 64,
+        llama_cpp_lora_base: Optional[str] = None,
+        llama_cpp_lora_scale: float = 1.0,
+        llama_cpp_lora_path: Optional[str] = None,
+        llama_cpp_numa: Union[bool, int] = False,
+        llama_cpp_chat_format: Optional[str] = None,
+        llama_cpp_draft_model: Optional[llama_cpp.LlamaDraftModel] = None,
+        # llama_cpp_tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        llama_cpp_verbose: bool = True,
+        # Server params
+        endpoint: str = "*",
+        port: int = 3000,
+        cors_domain: str = "http://localhost:3000",
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        **model_args: Any,
+    ) -> None:
+        """
+        Loads the model and starts a CherryPy server to listen for requests to generate text.
+
+        The model is loaded through vLLM when `use_vllm` is True, through llama.cpp when `use_llama_cpp`
+        is True, and through `load_models` otherwise. The server is mounted at "/api/v1/" and this call
+        blocks in `cherrypy.engine.block()`.
+
+        Args:
+            model_name (str): Name or identifier of the pre-trained model, optionally followed by ":<revision>".
+            model_class (str): Class name of the model to be used from the transformers library.
+            tokenizer_class (str): Class name of the tokenizer to be used from the transformers library.
+            use_cuda (bool): Flag to enable CUDA for GPU acceleration.
+            precision (str): Specifies the precision configuration for PyTorch tensors, e.g., "float16".
+            quantization (int): Level of model quantization to reduce model size and inference time.
+            device_map (Union[str, Dict, None]): Maps model layers to specific devices for distributed inference.
+            max_memory (Dict[int, str]): Maximum memory allocation for the model on each device.
+            torchscript (bool): Enables the use of TorchScript for model optimization.
+            compile (bool): Enables model compilation for further optimization.
+            awq_enabled (bool): Enables Activation-aware Weight Quantization (AWQ) for model optimization.
+            flash_attention (bool): Utilizes Flash Attention optimizations for faster processing.
+            concurrent_queries (bool): Allows the server to handle multiple requests concurrently if True.
+            use_vllm (bool): Flag to load the model through vLLM (`load_models_vllm`).
+            use_llama_cpp (bool): Flag to use llama.cpp integration for language model inference.
+            vllm_* (Any): Settings for the vLLM engine; each is forwarded to `load_models_vllm` under the same
+                name without the `vllm_` prefix. Only used when `use_vllm` is True.
+            llama_cpp_filename (Optional[str]): The filename of the model file for llama.cpp.
+            llama_cpp_n_gpu_layers (int): Number of layers to offload to GPU in llama.cpp configuration.
+            llama_cpp_split_mode (int): Defines how the model is split across multiple GPUs in llama.cpp.
+            llama_cpp_tensor_split (Optional[List[float]]): Custom tensor split configuration for llama.cpp.
+            llama_cpp_vocab_only (bool): Loads only the vocabulary part of the model in llama.cpp.
+            llama_cpp_use_mmap (bool): Enables memory-mapped files for model loading in llama.cpp.
+            llama_cpp_use_mlock (bool): Locks the model in RAM to prevent swapping in llama.cpp.
+            llama_cpp_kv_overrides (Optional[Dict[str, Union[bool, int, float]]]): Key-value pairs for overriding default llama.cpp model parameters.
+            llama_cpp_seed (int): Seed for random number generation in llama.cpp.
+            llama_cpp_n_ctx (int): The number of context tokens for the model in llama.cpp.
+            llama_cpp_n_batch (int): Batch size for processing prompts in llama.cpp.
+            llama_cpp_n_threads (Optional[int]): Number of threads for generation in llama.cpp.
+            llama_cpp_n_threads_batch (Optional[int]): Number of threads for batch processing in llama.cpp.
+            llama_cpp_rope_scaling_type (Optional[int]): Specifies the RoPE (Rotary Positional Embeddings) scaling type in llama.cpp.
+            llama_cpp_rope_freq_base (float): Base frequency for RoPE in llama.cpp.
+            llama_cpp_rope_freq_scale (float): Frequency scaling factor for RoPE in llama.cpp.
+            llama_cpp_yarn_ext_factor (float): Extrapolation mix factor for YaRN in llama.cpp.
+            llama_cpp_yarn_attn_factor (float): Attention factor for YaRN in llama.cpp.
+            llama_cpp_yarn_beta_fast (float): Beta fast parameter for YaRN in llama.cpp.
+            llama_cpp_yarn_beta_slow (float): Beta slow parameter for YaRN in llama.cpp.
+            llama_cpp_yarn_orig_ctx (int): Original context size for YaRN in llama.cpp.
+            llama_cpp_mul_mat_q (bool): Flag to enable matrix multiplication for queries in llama.cpp.
+            llama_cpp_logits_all (bool): Returns logits for all tokens when set to True in llama.cpp.
+            llama_cpp_embedding (bool): Enables embedding mode only in llama.cpp.
+            llama_cpp_offload_kqv (bool): Offloads K, Q, V matrices to GPU in llama.cpp.
+            llama_cpp_last_n_tokens_size (int): Size for the last_n_tokens buffer in llama.cpp.
+            llama_cpp_lora_base (Optional[str]): Base model path for LoRA adjustments in llama.cpp.
+            llama_cpp_lora_scale (float): Scale factor for LoRA adjustments in llama.cpp.
+            llama_cpp_lora_path (Optional[str]): Path to LoRA adjustments file in llama.cpp.
+            llama_cpp_numa (Union[bool, int]): NUMA configuration for llama.cpp.
+            llama_cpp_chat_format (Optional[str]): Specifies the chat format for llama.cpp.
+            llama_cpp_draft_model (Optional[llama_cpp.LlamaDraftModel]): Draft model for speculative decoding in llama.cpp.
+            llama_cpp_verbose (bool): Forwarded to `load_models_llama_cpp` as `verbose`.
+            endpoint (str): Network interface to bind the server to; "*" binds to all interfaces.
+            port (int): Port number to listen on for incoming requests.
+            cors_domain (str): Specifies the domain to allow for Cross-Origin Resource Sharing (CORS).
+            username (Optional[str]): Username for basic authentication, if required.
+            password (Optional[str]): Password for basic authentication, if required.
+            **model_args (Any): Additional arguments to pass to the pre-trained language model or llama.cpp configuration.
+        """
+        self.model_class = model_class
+        self.tokenizer_class = tokenizer_class
+        self.use_cuda = use_cuda
+        self.quantization = quantization
+        self.precision = precision
+        self.device_map = device_map
+        self.max_memory = max_memory
+        self.torchscript = torchscript
+        self.awq_enabled = awq_enabled
+        self.flash_attention = flash_attention
+        self.use_vllm = use_vllm
+        self.concurrent_queries = concurrent_queries
+
+        self.model_args = model_args
+        self.username = username
+        self.password = password
+
+        # "<name>:<revision>" selects one revision for both the model and the tokenizer.
+        if ":" in model_name:
+            name_parts = model_name.split(":")
+            model_name = name_parts[0]
+            model_revision = tokenizer_revision = name_parts[1]
+        else:
+            model_revision = tokenizer_revision = None
+        tokenizer_name = model_name
+
+        self.model_name = model_name
+        self.model_revision = model_revision
+        self.tokenizer_name = tokenizer_name
+        self.tokenizer_revision = tokenizer_revision
+
+        if use_vllm:
+            self.model = self.load_models_vllm(
+                model=model_name,
+                tokenizer=tokenizer_name,
+                tokenizer_mode=vllm_tokenizer_mode,
+                trust_remote_code=True,
+                download_dir=vllm_download_dir,
+                load_format=vllm_load_format,
+                dtype=self._get_torch_dtype(precision),
+                seed=vllm_seed,
+                revision=model_revision,
+                tokenizer_revision=tokenizer_revision,
+                max_model_len=vllm_max_model_len,
+                quantization=(None if quantization == 0 else f"{quantization}-bit"),
+                enforce_eager=vllm_enforce_eager,
+                max_context_len_to_capture=vllm_max_context_len_to_capture,
+                block_size=vllm_block_size,
+                gpu_memory_utilization=vllm_gpu_memory_utilization,
+                swap_space=vllm_swap_space,
+                cache_dtype="auto",
+                sliding_window=vllm_sliding_window,
+                pipeline_parallel_size=vllm_pipeline_parallel_size,
+                tensor_parallel_size=vllm_tensor_parallel_size,
+                worker_use_ray=vllm_worker_use_ray,
+                max_parallel_loading_workers=vllm_max_parallel_loading_workers,
+                disable_custom_all_reduce=vllm_disable_custom_all_reduce,
+                max_num_batched_tokens=vllm_max_num_batched_tokens,
+                max_num_seqs=vllm_max_num_seqs,
+                max_paddings=vllm_max_paddings,
+                device="cuda" if use_cuda else "cpu",
+                max_lora_rank=vllm_max_lora_rank,
+                max_loras=vllm_max_loras,
+                max_cpu_loras=vllm_max_cpu_loras,
+                lora_dtype=self._get_torch_dtype(precision),
+                lora_extra_vocab_size=vllm_lora_extra_vocab_size,
+                placement_group=vllm_placement_group,  # type: ignore
+                log_stats=vllm_log_stats,
+                batched_inference=False,
+            )
+        elif use_llama_cpp:
+            self.model, self.tokenizer = self.load_models_llama_cpp(
+                model=self.model_name,
+                filename=llama_cpp_filename,
+                local_dir=self.output.output_folder,
+                n_gpu_layers=llama_cpp_n_gpu_layers,
+                split_mode=llama_cpp_split_mode,
+                main_gpu=0 if self.use_cuda else -1,
+                tensor_split=llama_cpp_tensor_split,
+                vocab_only=llama_cpp_vocab_only,
+                use_mmap=llama_cpp_use_mmap,
+                use_mlock=llama_cpp_use_mlock,
+                kv_overrides=llama_cpp_kv_overrides,
+                seed=llama_cpp_seed,
+                n_ctx=llama_cpp_n_ctx,
+                n_batch=llama_cpp_n_batch,
+                n_threads=llama_cpp_n_threads,
+                n_threads_batch=llama_cpp_n_threads_batch,
+                rope_scaling_type=llama_cpp_rope_scaling_type,
+                rope_freq_base=llama_cpp_rope_freq_base,
+                rope_freq_scale=llama_cpp_rope_freq_scale,
+                yarn_ext_factor=llama_cpp_yarn_ext_factor,
+                yarn_attn_factor=llama_cpp_yarn_attn_factor,
+                yarn_beta_fast=llama_cpp_yarn_beta_fast,
+                yarn_beta_slow=llama_cpp_yarn_beta_slow,
+                yarn_orig_ctx=llama_cpp_yarn_orig_ctx,
+                mul_mat_q=llama_cpp_mul_mat_q,
+                logits_all=llama_cpp_logits_all,
+                embedding=llama_cpp_embedding,
+                offload_kqv=llama_cpp_offload_kqv,
+                last_n_tokens_size=llama_cpp_last_n_tokens_size,
+                lora_base=llama_cpp_lora_base,
+                lora_scale=llama_cpp_lora_scale,
+                lora_path=llama_cpp_lora_path,
+                numa=llama_cpp_numa,
+                chat_format=llama_cpp_chat_format,
+                draft_model=llama_cpp_draft_model,
+                # tokenizer=llama_cpp_tokenizer, # TODO: support custom tokenizers for llama.cpp
+                verbose=llama_cpp_verbose,
+                **model_args,
+            )
+        else:
+            self.model, self.tokenizer = self.load_models(
+                model_name=self.model_name,
+                tokenizer_name=self.tokenizer_name,
+                model_revision=self.model_revision,
+                tokenizer_revision=self.tokenizer_revision,
+                model_class=self.model_class,
+                tokenizer_class=self.tokenizer_class,
+                use_cuda=self.use_cuda,
+                precision=self.precision,
+                quantization=self.quantization,
+                device_map=self.device_map,
+                max_memory=self.max_memory,
+                torchscript=self.torchscript,
+                awq_enabled=self.awq_enabled,
+                flash_attention=self.flash_attention,
+                compile=compile,
+                **self.model_args,
+            )
+
+        def sequential_locker():
+            # Serve one request at a time unless concurrent queries were explicitly enabled.
+            if not self.concurrent_queries:
+                sequential_lock.acquire()
+                # Remember that this request holds the lock so that only it releases it.
+                cherrypy.request.sequential_lock_held = True
+
+        def sequential_unlocker():
+            # Release only a lock this request acquired: a request rejected before the locker ran
+            # (e.g. failed authentication) never held it.
+            if getattr(cherrypy.request, "sequential_lock_held", False):
+                cherrypy.request.sequential_lock_held = False
+                sequential_lock.release()
+
+        def CORS():
+            cherrypy.response.headers["Access-Control-Allow-Origin"] = cors_domain
+            cherrypy.response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS"
+            cherrypy.response.headers["Access-Control-Allow-Headers"] = "Content-Type"
+            cherrypy.response.headers["Access-Control-Allow-Credentials"] = "true"
+
+            if cherrypy.request.method == "OPTIONS":
+                cherrypy.response.status = 200
+                return True
+
+        # Register the custom tools before any configuration refers to them.
+        cherrypy.tools.sequential_locker = cherrypy.Tool("before_handler", sequential_locker)
+        # "on_end_resource" hooks run even when the handler raises, so a failed generation cannot leave
+        # the lock held and block every later request.
+        cherrypy.tools.sequential_unlocker = cherrypy.Tool("on_end_resource", sequential_unlocker)
+        cherrypy.tools.CORS = cherrypy.Tool("before_finalize", CORS)
+
+        error_codes = (400, 401, 402, 403, 404, 405, 406, 408, 415, 429, 500, 501, 502, 503, 504)
+        server_config = {
+            # "*" (the default) binds to all interfaces.
+            "server.socket_host": "0.0.0.0" if endpoint in ("*", "") else endpoint,
+            "server.socket_port": port,
+            "log.screen": False,
+            "tools.CORS.on": True,
+            "error_page.default": error_page,
+        }
+        server_config.update({f"error_page.{code}": error_page for code in error_codes})
+        cherrypy.config.update(server_config)
+
+        conf: Dict[str, Dict[str, Any]] = {
+            "/": {
+                "tools.sequential_locker.on": True,
+                "tools.sequential_unlocker.on": True,
+                "tools.CORS.on": True,
+            }
+        }
+        if username and password:
+            # Configure basic authentication
+            conf["/"].update(
+                {
+                    "tools.auth_basic.on": True,
+                    "tools.auth_basic.realm": "geniusrise",
+                    "tools.auth_basic.checkpassword": self.validate_password,
+                }
+            )
+
+        cherrypy.tree.mount(self, "/api/v1/", conf)
+        cherrypy.engine.start()
+        cherrypy.engine.block()
+
+
+def error_page(status, message, traceback, version):
+    """
+    Renders a CherryPy error response as a JSON document.
+
+    Args:
+        status: The HTTP status of the error response.
+        message: The error message.
+        traceback: The traceback supplied by CherryPy; not included in the response.
+        version: The CherryPy version supplied by CherryPy; not included in the response.
+
+    Returns:
+        str: A JSON object with the keys "status" and "message".
+    """
+    # The body is JSON, so label it as such instead of leaving the error response's default type.
+    cherrypy.response.headers["Content-Type"] = "application/json"
+    return json.dumps({"status": status, "message": message})
diff --git a/geniusrise_text/base/bulk.py b/geniusrise_text/base/bulk.py
new file mode 100644
index 0000000..bc6349d
--- /dev/null
+++ b/geniusrise_text/base/bulk.py
@@ -0,0 +1,938 @@
+# 🧠 Geniusrise
+# Copyright (C) 2023 geniusrise.ai
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from typing import Any, Dict, List, Optional, Tuple, Union +import os +import torch +import transformers +from geniusrise import BatchInput, BatchOutput, Bolt, State +from geniusrise.logging import setup_logger +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BeamSearchScorer, + LogitsProcessorList, + MinLengthLogitsProcessor, +) +from optimum.bettertransformer import BetterTransformer +from vllm.config import ( + ModelConfig as VLLMModelConfig, + CacheConfig, + ParallelConfig, + SchedulerConfig, + DeviceConfig, + LoRAConfig, +) + +from vllm import LLM, AsyncLLMEngine +from ray.util.placement_group import PlacementGroup +import llama_cpp +from llama_cpp import Llama as LlamaCPP +from transformers.tokenization_utils_base import PreTrainedTokenizerBase + +from geniusrise_text.base.communication import send_email + + +class TextBulk(Bolt): + """ + TextBulk is a foundational class for enabling bulk processing of text with various generation models. + It primarily focuses on using Hugging Face models to provide a robust and efficient framework for + large-scale text generation tasks. The class supports various decoding strategies to generate text + that can be tailored to specific needs or preferences. + + Attributes: + model (AutoModelForCausalLM): The language model for text generation. + tokenizer (AutoTokenizer): The tokenizer for preparing input data for the model. + + Args: + input (BatchInput): Configuration and data inputs for the batch process. + output (BatchOutput): Configurations for output data handling. + state (State): State management for the Bolt. + **kwargs: Arbitrary keyword arguments for extended configurations. + + Methods: + text(**kwargs: Any) -> Dict[str, Any]: + Provides an API endpoint for text generation functionality. + Accepts various parameters for customizing the text generation process. 
+ + generate(prompt: str, decoding_strategy: str = "generate", **generation_params: Any) -> dict: + Generates text based on the provided prompt and parameters. Supports multiple decoding strategies for diverse applications. + + The class serves as a versatile tool for text generation, supporting various models and configurations. + It can be extended or used as is for efficient text generation tasks. + """ + + model: Any + tokenizer: Any + + def __init__( + self, + input: BatchInput, + output: BatchOutput, + state: State, + **kwargs, + ): + """ + Initializes the TextBulk with configurations and sets up logging. It prepares the environment for text generation tasks. + + Args: + input (BatchInput): The input data configuration for the text generation task. + output (BatchOutput): The output data configuration for the results of the text generation. + state (State): The state configuration for the Bolt, managing its operational status. + **kwargs: Additional keyword arguments for extended functionality and model configurations. + """ + super().__init__(input=input, output=output, state=state) + self.log = setup_logger(self) + + def generate( + self, + prompt: str, + decoding_strategy: str = "generate", + **generation_params: Any, + ) -> str: + r""" + Generate text completion for the given prompt using the specified decoding strategy. + + Args: + prompt (str): The prompt to generate text completion for. + decoding_strategy (str, optional): The decoding strategy to use. Defaults to "generate". + **generation_params (Any): Additional parameters to pass to the decoding strategy. + + Returns: + str: The generated text completion. + + Raises: + Exception: If an error occurs during generation. + + Supported decoding strategies and their additional parameters: + - "generate": Uses the model's default generation method. (Parameters: max_length, num_beams, etc.) + - "greedy_search": Generates text using a greedy search decoding strategy. 
+ Parameters: max_length, eos_token_id, pad_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus. + - "contrastive_search": Generates text using contrastive search decoding strategy. + Parameters: top_k, penalty_alpha, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, sequential. + - "sample": Generates text using a sampling decoding strategy. + Parameters: do_sample, temperature, top_k, top_p, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus. + - "beam_search": Generates text using beam search decoding strategy. + Parameters: num_beams, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus. + - "beam_sample": Generates text using beam search with sampling decoding strategy. + Parameters: num_beams, temperature, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus. + - "group_beam_search": Generates text using group beam search decoding strategy. + Parameters: num_beams, diversity_penalty, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus. + - "constrained_beam_search": Generates text using constrained beam search decoding strategy. + Parameters: num_beams, max_length, constraints, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus. 
+ + All generation parameters: + - max_length: Maximum length the generated tokens can have + - max_new_tokens: Maximum number of tokens to generate, ignoring prompt tokens + - min_length: Minimum length of the sequence to be generated + - min_new_tokens: Minimum number of tokens to generate, ignoring prompt tokens + - early_stopping: Stopping condition for beam-based methods + - max_time: Maximum time allowed for computation in seconds + - do_sample: Whether to use sampling for generation + - num_beams: Number of beams for beam search + - num_beam_groups: Number of groups for beam search to ensure diversity + - penalty_alpha: Balances model confidence and degeneration penalty in contrastive search + - use_cache: Whether the model should use past key/values attentions to speed up decoding + - temperature: Modulates next token probabilities + - top_k: Number of highest probability tokens to keep for top-k-filtering + - top_p: Smallest set of most probable tokens with cumulative probability >= top_p + - typical_p: Conditional probability of predicting a target token next + - epsilon_cutoff: Tokens with a conditional probability > epsilon_cutoff will be sampled + - eta_cutoff: Eta sampling, a hybrid of locally typical sampling and epsilon sampling + - diversity_penalty: Penalty subtracted from a beam's score if it generates a token same as any other group + - repetition_penalty: Penalty for repetition of ngrams + - encoder_repetition_penalty: Penalty on sequences not in the original input + - length_penalty: Exponential penalty to the length for beam-based generation + - no_repeat_ngram_size: All ngrams of this size can only occur once + - bad_words_ids: List of token ids that are not allowed to be generated + - force_words_ids: List of token ids that must be generated + - renormalize_logits: Renormalize the logits after applying all logits processors + - constraints: Custom constraints for generation + - forced_bos_token_id: Token ID to force as the first generated 
token + - forced_eos_token_id: Token ID to force as the last generated token + - remove_invalid_values: Remove possible NaN and inf outputs + - exponential_decay_length_penalty: Exponentially increasing length penalty after a certain number of tokens + - suppress_tokens: Tokens that will be suppressed during generation + - begin_suppress_tokens: Tokens that will be suppressed at the beginning of generation + - forced_decoder_ids: Mapping from generation indices to token indices that will be forced + - sequence_bias: Maps a sequence of tokens to its bias term + - guidance_scale: Guidance scale for classifier free guidance (CFG) + - low_memory: Switch to sequential topk for contrastive search to reduce peak memory + - num_return_sequences: Number of independently computed returned sequences for each batch element + - output_attentions: Whether to return the attentions tensors of all layers + - output_hidden_states: Whether to return the hidden states of all layers + - output_scores: Whether to return the prediction scores + - return_dict_in_generate: Whether to return a ModelOutput instead of a plain tuple + - pad_token_id: The id of the padding token + - bos_token_id: The id of the beginning-of-sequence token + - eos_token_id: The id of the end-of-sequence token + - max_length: The maximum length of the sequence to be generated + - eos_token_id: End-of-sequence token ID + - pad_token_id: Padding token ID + - output_attentions: Return attention tensors of all attention layers if True + - output_hidden_states: Return hidden states of all layers if True + - output_scores: Return prediction scores if True + - return_dict_in_generate: Return a ModelOutput instead of a plain tuple if True + - synced_gpus: Continue running the while loop until max_length for ZeRO stage 3 if True + - top_k: Size of the candidate set for re-ranking in contrastive search + - penalty_alpha: Degeneration penalty; active when larger than 0 + - eos_token_id: End-of-sequence token ID(s) + - 
sequential: Switch to sequential topk hidden state computation to reduce memory if True + - do_sample: Use sampling for generation if True + - temperature: Temperature for sampling + - top_p: Cumulative probability for top-p-filtering + - diversity_penalty: Penalty for reducing similarity across different beam groups + - constraints: List of constraints to apply during beam search + - synced_gpus: Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + """ + results: Dict[int, Dict[str, Union[str, List[str]]]] = {} + eos_token_id = self.model.config.eos_token_id + pad_token_id = self.model.config.pad_token_id + if not pad_token_id: + pad_token_id = eos_token_id + self.model.config.pad_token_id = pad_token_id + + # Default parameters for each strategy + default_params = { + "generate": { + "max_length": 20, # Maximum length the generated tokens can have + "max_new_tokens": None, # Maximum number of tokens to generate, ignoring prompt tokens + "min_length": 0, # Minimum length of the sequence to be generated + "min_new_tokens": None, # Minimum number of tokens to generate, ignoring prompt tokens + "early_stopping": False, # Stopping condition for beam-based methods + "max_time": None, # Maximum time allowed for computation in seconds + "do_sample": False, # Whether to use sampling for generation + "num_beams": 1, # Number of beams for beam search + "num_beam_groups": 1, # Number of groups for beam search to ensure diversity + "penalty_alpha": None, # Balances model confidence and degeneration penalty in contrastive search + "use_cache": True, # Whether the model should use past key/values attentions to speed up decoding + "temperature": 1.0, # Modulates next token probabilities + "top_k": 50, # Number of highest probability tokens to keep for top-k-filtering + "top_p": 1.0, # Smallest set of most probable tokens with cumulative probability >= top_p + "typical_p": 1.0, # Conditional probability of predicting a target token next + 
"epsilon_cutoff": 0.0, # Tokens with a conditional probability > epsilon_cutoff will be sampled + "eta_cutoff": 0.0, # Eta sampling, a hybrid of locally typical sampling and epsilon sampling + "diversity_penalty": 0.0, # Penalty subtracted from a beam's score if it generates a token same as any other group + "repetition_penalty": 1.0, # Penalty for repetition of ngrams + "encoder_repetition_penalty": 1.0, # Penalty on sequences not in the original input + "length_penalty": 1.0, # Exponential penalty to the length for beam-based generation + "no_repeat_ngram_size": 0, # All ngrams of this size can only occur once + "bad_words_ids": None, # List of token ids that are not allowed to be generated + "force_words_ids": None, # List of token ids that must be generated + "renormalize_logits": False, # Renormalize the logits after applying all logits processors + "constraints": None, # Custom constraints for generation + "forced_bos_token_id": None, # Token ID to force as the first generated token + "forced_eos_token_id": None, # Token ID to force as the last generated token + "remove_invalid_values": False, # Remove possible NaN and inf outputs + "exponential_decay_length_penalty": None, # Exponentially increasing length penalty after a certain number of tokens + "suppress_tokens": None, # Tokens that will be suppressed during generation + "begin_suppress_tokens": None, # Tokens that will be suppressed at the beginning of generation + "forced_decoder_ids": None, # Mapping from generation indices to token indices that will be forced + "sequence_bias": None, # Maps a sequence of tokens to its bias term + "guidance_scale": None, # Guidance scale for classifier free guidance (CFG) + "low_memory": None, # Switch to sequential topk for contrastive search to reduce peak memory + "num_return_sequences": 1, # Number of independently computed returned sequences for each batch element + "output_attentions": False, # Whether to return the attentions tensors of all layers + 
"output_hidden_states": False, # Whether to return the hidden states of all layers + "output_scores": False, # Whether to return the prediction scores + "return_dict_in_generate": False, # Whether to return a ModelOutput instead of a plain tuple + "pad_token_id": None, # The id of the padding token + "bos_token_id": None, # The id of the beginning-of-sequence token + "eos_token_id": None, # The id of the end-of-sequence token + }, + "greedy_search": { + "max_length": 4096, # The maximum length of the sequence to be generated + "eos_token_id": eos_token_id, # End-of-sequence token ID + "pad_token_id": pad_token_id, # Padding token ID + "output_attentions": False, # Return attention tensors of all attention layers if True + "output_hidden_states": False, # Return hidden states of all layers if True + "output_scores": False, # Return prediction scores if True + "return_dict_in_generate": False, # Return a ModelOutput instead of a plain tuple if True + "synced_gpus": False, # Continue running the while loop until max_length for ZeRO stage 3 if True + }, + "contrastive_search": { + "top_k": 1, # Size of the candidate set for re-ranking in contrastive search + "penalty_alpha": 0, # Degeneration penalty; active when larger than 0 + "pad_token_id": pad_token_id, # Padding token ID + "eos_token_id": eos_token_id, # End-of-sequence token ID(s) + "output_attentions": False, # Return attention tensors of all attention layers if True + "output_hidden_states": False, # Return hidden states of all layers if True + "output_scores": False, # Return prediction scores if True + "return_dict_in_generate": False, # Return a ModelOutput instead of a plain tuple if True + "synced_gpus": False, # Continue running the while loop until max_length for ZeRO stage 3 if True + "sequential": False, # Switch to sequential topk hidden state computation to reduce memory if True + }, + "sample": { + "do_sample": True, # Use sampling for generation if True + "temperature": 0.6, # Temperature for 
sampling + "top_k": 50, # Number of highest probability tokens to keep for top-k-filtering + "top_p": 0.9, # Cumulative probability for top-p-filtering + "max_length": 4096, # The maximum length of the sequence to be generated + "pad_token_id": pad_token_id, # Padding token ID + "eos_token_id": eos_token_id, # End-of-sequence token ID(s) + "output_attentions": False, # Return attention tensors of all attention layers if True + "output_hidden_states": False, # Return hidden states of all layers if True + "output_scores": False, # Return prediction scores if True + "return_dict_in_generate": False, # Return a ModelOutput instead of a plain tuple if True + "synced_gpus": False, # Continue running the while loop until max_length for ZeRO stage 3 if True + }, + "beam_search": { + "num_beams": 4, # Number of beams for beam search + "max_length": 4096, # The maximum length of the sequence to be generated + "pad_token_id": pad_token_id, # Padding token ID + "eos_token_id": eos_token_id, # End-of-sequence token ID(s) + "output_attentions": False, # Return attention tensors of all attention layers if True + "output_hidden_states": False, # Return hidden states of all layers if True + "output_scores": False, # Return prediction scores if True + "return_dict_in_generate": False, # Return a ModelOutput instead of a plain tuple if True + "synced_gpus": False, # Continue running the while loop until max_length for ZeRO stage 3 if True + }, + "beam_sample": { + "num_beams": 4, # Number of beams for beam search + "temperature": 0.6, # Temperature for sampling + "max_length": 4096, # The maximum length of the sequence to be generated + "pad_token_id": pad_token_id, # Padding token ID + "eos_token_id": eos_token_id, # End-of-sequence token ID(s) + "output_attentions": False, # Return attention tensors of all attention layers if True + "output_hidden_states": False, # Return hidden states of all layers if True + "output_scores": False, # Return prediction scores if True + 
"return_dict_in_generate": False, # Return a ModelOutput instead of a plain tuple if True + "synced_gpus": False, # Continue running the while loop until max_length for ZeRO stage 3 if True + }, + "group_beam_search": { + "num_beams": 4, # Number of beams for beam search + "diversity_penalty": 0.5, # Penalty for reducing similarity across different beam groups + "max_length": 4096, # The maximum length of the sequence to be generated + "pad_token_id": pad_token_id, # Padding token ID + "eos_token_id": eos_token_id, # End-of-sequence token ID(s) + "output_attentions": False, # Return attention tensors of all attention layers if True + "output_hidden_states": False, # Return hidden states of all layers if True + "output_scores": False, # Return prediction scores if True + "return_dict_in_generate": False, # Return a ModelOutput instead of a plain tuple if True + "synced_gpus": False, # Continue running the while loop until max_length for ZeRO stage 3 if True + }, + "constrained_beam_search": { + "num_beams": 4, # Number of beams for beam search + "max_length": 4096, # The maximum length of the sequence to be generated + "constraints": None, # List of constraints to apply during beam search + "pad_token_id": pad_token_id, # Padding token ID + "eos_token_id": eos_token_id, # End-of-sequence token ID(s) + "output_attentions": False, # Return attention tensors of all attention layers if True + "output_hidden_states": False, # Return hidden states of all layers if True + "output_scores": False, # Return prediction scores if True + "return_dict_in_generate": False, # Return a ModelOutput instead of a plain tuple if True + "synced_gpus": False, # Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + }, + } + + # Merge default params with user-provided params + strategy_params = {**default_params.get(decoding_strategy, {})} + for k, v in generation_params.items(): + if k in strategy_params: + strategy_params[k] = v + + # Prepare 
LogitsProcessorList and BeamSearchScorer for beam search strategies + if decoding_strategy in ["beam_search", "beam_sample", "group_beam_search"]: + logits_processor = LogitsProcessorList( + [MinLengthLogitsProcessor(min_length=strategy_params.get("min_length", 0), eos_token_id=eos_token_id)] + ) + beam_scorer = BeamSearchScorer( + batch_size=1, + max_length=strategy_params.get("max_length", 20), + num_beams=strategy_params.get("num_beams", 1), + device=self.model.device, + length_penalty=strategy_params.get("length_penalty", 1.0), + do_early_stopping=strategy_params.get("early_stopping", False), + ) + strategy_params.update({"logits_processor": logits_processor, "beam_scorer": beam_scorer}) + + if decoding_strategy == "beam_sample": + strategy_params.update({"logits_warper": LogitsProcessorList()}) + + # Map of decoding strategy to method + strategy_to_method = { + "generate": self.model.generate, + "greedy_search": self.model.greedy_search, + "contrastive_search": self.model.contrastive_search, + "sample": self.model.sample, + "beam_search": self.model.beam_search, + "beam_sample": self.model.beam_sample, + "group_beam_search": self.model.group_beam_search, + "constrained_beam_search": self.model.constrained_beam_search, + } + + try: + self.log.debug(f"Generating completion for prompt {prompt}") + + inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True) + input_ids = inputs["input_ids"] + input_ids = input_ids.to(self.model.device) + + # Replicate input_ids for beam search + if decoding_strategy in ["beam_search", "beam_sample", "group_beam_search"]: + num_beams = strategy_params.get("num_beams", 1) + input_ids = input_ids.repeat(num_beams, 1) + + # Use the specified decoding strategy + decoding_method = strategy_to_method.get(decoding_strategy, self.model.generate) + generated_ids = decoding_method(input_ids, **strategy_params) + + generated_text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True) + 
def _get_torch_dtype(self, precision: str) -> torch.dtype:
    """
    Map a precision name to the corresponding torch dtype.

    Args:
        precision (str): Name of the desired dtype (e.g. "float32", "float16", "bfloat16", "int8").

    Returns:
        torch.dtype: The matching torch dtype. An unrecognized name does NOT raise;
        it falls back to ``torch.float`` and a warning is logged.
    """
    dtype_map = {
        "float32": torch.float32,
        "float": torch.float,
        "float64": torch.float64,
        "double": torch.double,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "half": torch.half,
        "uint8": torch.uint8,
        "int8": torch.int8,
        "int16": torch.int16,
        "short": torch.short,
        "int32": torch.int32,
        "int": torch.int,
        "int64": torch.int64,
        "quint8": torch.quint8,
        "qint8": torch.qint8,
        "qint32": torch.qint32,
    }
    try:
        return dtype_map[precision]
    except KeyError:
        # Keep the historical fallback so existing callers are unaffected, but make it visible.
        self.log.warning(f"Unknown precision '{precision}', falling back to torch.float")
        return torch.float

def load_models(
    self,
    model_name: str,
    tokenizer_name: str,
    model_revision: Optional[str] = None,
    tokenizer_revision: Optional[str] = None,
    model_class: str = "AutoModelForCausalLM",
    tokenizer_class: str = "AutoTokenizer",
    use_cuda: bool = False,
    precision: str = "float16",
    quantization: int = 0,
    device_map: str | Dict | None = "auto",
    max_memory={0: "24GB"},
    torchscript: bool = False,
    compile: bool = False,
    awq_enabled: bool = False,
    flash_attention: bool = False,
    better_transformers: bool = False,
    **model_args: Any,
) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """
    Loads and configures the specified model and tokenizer for text generation.

    Args:
        model_name (str): The name or path of the model to load. The special value "local"
            loads both model and tokenizer from the "model" sub-folder of the input folder.
        tokenizer_name (str): The name or path of the tokenizer to load (ignored for "local").
        model_revision (Optional[str]): The specific model revision to load (e.g., a commit hash).
        tokenizer_revision (Optional[str]): The specific tokenizer revision to load (e.g., a commit hash).
        model_class (str): Name of the transformers class used to load the model
            (ignored when awq_enabled is set; AutoModelForCausalLM is used instead).
        tokenizer_class (str): Name of the transformers class used to load the tokenizer.
        use_cuda (bool): When set and no device_map is given, device_map becomes "auto".
        precision (str): The desired precision for computations ("float32", "float16", etc.).
        quantization (int): The bit level for model quantization (0 for none, 4 or 8 for bitsandbytes loading).
        device_map (str | Dict | None): The specific device(s) to use for model operations.
        max_memory (Dict): A dictionary defining the maximum memory to allocate for the model.
            The default dict is only read, never mutated.
        torchscript (bool): Flag to enable TorchScript for model optimization.
        compile (bool): Flag to run torch.compile on the model (skipped when torchscript is set).
        awq_enabled (bool): Flag to enable AWQ (Activation-aware Weight Quantization).
        flash_attention (bool): Flag to request the "flash_attention_2" attention implementation.
        better_transformers (bool): Flag to apply the BetterTransformer transformation.
        **model_args (Any): Additional arguments to pass to the model during its loading.

    Returns:
        Tuple[AutoModelForCausalLM, AutoTokenizer]: The loaded model (in eval mode) and tokenizer.
    """
    self.log.info(f"Loading Hugging Face model: {model_name}")

    # Determine the torch dtype based on precision
    torch_dtype = self._get_torch_dtype(precision)

    if use_cuda and not device_map:
        device_map = "auto"

    if awq_enabled:
        ModelClass = AutoModelForCausalLM
        self.log.info("AWQ Enabled: Loading AWQ Model")
    else:
        ModelClass = getattr(transformers, model_class)
    TokenizerClass = getattr(transformers, tokenizer_class)

    is_local = model_name == "local"
    if is_local:
        # The segment must be relative: os.path.join drops everything before an
        # absolute component, so "/model" would resolve to the filesystem root.
        local_path = os.path.join(self.input.get(), "model")

    # Load the tokenizer
    if is_local:
        tokenizer = TokenizerClass.from_pretrained(local_path, torch_dtype=torch_dtype)
    else:
        tokenizer = TokenizerClass.from_pretrained(
            tokenizer_name, revision=tokenizer_revision, torch_dtype=torch_dtype
        )

    if flash_attention:
        model_args = {**model_args, "attn_implementation": "flash_attention_2"}

    self.log.info(f"Loading model from {model_name} {model_revision} with {model_args}")

    # Select the loader keyword arguments; the four cases differ only in these.
    if awq_enabled and quantization > 0:
        # AWQ checkpoints are already quantized: only the dtype is forwarded.
        load_kwargs = {"torch_dtype": torch_dtype}
    elif quantization == 8 or quantization == 4:
        load_kwargs = {
            "torchscript": torchscript,
            "max_memory": max_memory,
            "device_map": device_map,
            ("load_in_8bit" if quantization == 8 else "load_in_4bit"): True,
        }
    else:
        load_kwargs = {
            "torch_dtype": torch_dtype,
            "torchscript": torchscript,
            "max_memory": max_memory,
            "device_map": device_map,
        }

    # Passing both mappings with ** keeps the original behaviour of raising
    # TypeError if model_args repeats one of the explicit keywords.
    if is_local:
        model = ModelClass.from_pretrained(local_path, **load_kwargs, **model_args)
    else:
        model = ModelClass.from_pretrained(model_name, revision=model_revision, **load_kwargs, **model_args)

    if compile and not torchscript:
        model = torch.compile(model)

    if better_transformers:
        model = BetterTransformer.transform(model, keep_original_model=True)

    # Set to evaluation mode for inference
    model.eval()

    if tokenizer and tokenizer.eos_token and (not tokenizer.pad_token):
        tokenizer.pad_token = tokenizer.eos_token

    # Compare against None: 0 is a legitimate pad token id and must not be overwritten.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = model.config.eos_token_id

    self.log.debug("Text model and tokenizer loaded successfully.")
    return model, tokenizer
bool = False, + max_parallel_loading_workers: Optional[int] = None, + disable_custom_all_reduce: bool = False, + max_num_batched_tokens: Optional[int] = None, + max_num_seqs: int = 64, + max_paddings: int = 512, + device: str = "cuda", + max_lora_rank: Optional[int] = None, + max_loras: Optional[int] = None, + max_cpu_loras: Optional[int] = None, + lora_dtype: Optional[torch.dtype] = None, + lora_extra_vocab_size: int = 0, + placement_group: Optional[PlacementGroup] = None, + log_stats: bool = False, + batched_inference: bool = False, + ) -> AsyncLLMEngine | LLM: + """ + Initializes and loads models using VLLM configurations with specific parameters. + + Args: + model (str): Name or path of the Hugging Face model to use. + tokenizer (str): Name or path of the Hugging Face tokenizer to use. + tokenizer_mode (str): Tokenizer mode. "auto" will use the fast tokenizer if available, "slow" will always use the slow tokenizer. + trust_remote_code (bool): Trust remote code (e.g., from Hugging Face) when downloading the model and tokenizer. + download_dir (Optional[str]): Directory to download and load the weights, default to the default cache directory of Hugging Face. + load_format (str): The format of the model weights to load. Options include "auto", "pt", "safetensors", "npcache", "dummy". + dtype (Union[str, torch.dtype]): Data type for model weights and activations. Options include "auto", torch.float32, torch.float16, etc. + seed (int): Random seed for reproducibility. + revision (Optional[str]): The specific model version to use. Can be a branch name, a tag name, or a commit id. + code_revision (Optional[str]): The specific revision to use for the model code on Hugging Face Hub. + tokenizer_revision (Optional[str]): The specific tokenizer version to use. + max_model_len (Optional[int]): Maximum length of a sequence (including prompt and output). If None, will be derived from the model. 
+ quantization (Optional[str]): Quantization method that was used to quantize the model weights. If None, we assume the model weights are not quantized. + enforce_eager (bool): Whether to enforce eager execution. If True, disables CUDA graph and always execute the model in eager mode. + max_context_len_to_capture (Optional[int]): Maximum context length covered by CUDA graphs. When larger, falls back to eager mode. + block_size (int): Size of a cache block in number of tokens. + gpu_memory_utilization (float): Fraction of GPU memory to use for the VLLM execution. + swap_space (int): Size of the CPU swap space per GPU (in GiB). + cache_dtype (str): Data type for KV cache storage. + sliding_window (Optional[int]): Configuration for sliding window if applicable. + pipeline_parallel_size (int): Number of pipeline parallel groups. + tensor_parallel_size (int): Number of tensor parallel groups. + worker_use_ray (bool): Whether to use Ray for model workers. Required if either pipeline_parallel_size or tensor_parallel_size is greater than 1. + max_parallel_loading_workers (Optional[int]): Maximum number of workers for loading the model in parallel to avoid RAM OOM. + disable_custom_all_reduce (bool): Disable custom all-reduce kernel and fall back to NCCL. + max_num_batched_tokens (Optional[int]): Maximum number of tokens to be processed in a single iteration. + max_num_seqs (int): Maximum number of sequences to be processed in a single iteration. + max_paddings (int): Maximum number of paddings to be added to a batch. + device (str): Device configuration, typically "cuda" or "cpu". + max_lora_rank (Optional[int]): Maximum rank for LoRA adjustments. + max_loras (Optional[int]): Maximum number of LoRA adjustments. + max_cpu_loras (Optional[int]): Maximum number of LoRA adjustments stored on CPU. + lora_dtype (Optional[torch.dtype]): Data type for LoRA parameters. + lora_extra_vocab_size (Optional[int]): Additional vocabulary size for LoRA. 
+ placement_group (Optional["PlacementGroup"]): Ray placement group for distributed execution. Required for distributed execution. + log_stats (bool): Whether to log statistics during model operation. + + Returns: + LLMEngine: An instance of the LLMEngine class initialized with the given configurations. + """ + + vllm_model_config = VLLMModelConfig( + model=model, + tokenizer=tokenizer, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + download_dir=download_dir, + load_format=load_format, + dtype=dtype, + seed=seed, + revision=revision, + # code_revision=code_revision, + tokenizer_revision=tokenizer_revision, + max_model_len=max_model_len, + quantization=quantization, + enforce_eager=enforce_eager, + max_context_len_to_capture=max_context_len_to_capture, + ) + + vllm_cache_config = CacheConfig( + block_size=block_size, + gpu_memory_utilization=gpu_memory_utilization, + swap_space=swap_space, + cache_dtype=cache_dtype, + sliding_window=sliding_window, + ) + + vllm_parallel_config = ParallelConfig( + pipeline_parallel_size=pipeline_parallel_size, + tensor_parallel_size=tensor_parallel_size, + worker_use_ray=worker_use_ray, + max_parallel_loading_workers=max_parallel_loading_workers, + disable_custom_all_reduce=disable_custom_all_reduce, + ) + + vllm_scheduler_config = SchedulerConfig( + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, # type: ignore + max_paddings=max_paddings, + ) + + vllm_device_config = DeviceConfig(device=device) + + vllm_lora_config = None + if max_lora_rank is not None and max_loras is not None: + vllm_lora_config = LoRAConfig( + max_lora_rank=max_lora_rank, + max_loras=max_loras, + max_cpu_loras=max_cpu_loras, + lora_dtype=lora_dtype, + lora_extra_vocab_size=lora_extra_vocab_size, + ) + + engine: AsyncLLMEngine | LLM + if not batched_inference: + engine = AsyncLLMEngine( + worker_use_ray=worker_use_ray, + engine_use_ray=placement_group is not None, + 
log_requests=True, + start_engine_loop=True, + model_config=vllm_model_config, + cache_config=vllm_cache_config, + parallel_config=vllm_parallel_config, + scheduler_config=vllm_scheduler_config, + device_config=vllm_device_config, + lora_config=vllm_lora_config, + placement_group=placement_group, + log_stats=log_stats, + ) + else: + engine = LLM( + model=model, + tokenizer=tokenizer, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + download_dir=download_dir, + load_format=load_format, + dtype=dtype, # type: ignore + kv_cache_dtype=cache_dtype, + seed=seed, + max_model_len=max_model_len, + pipeline_parallel_size=pipeline_parallel_size, + tensor_parallel_size=tensor_parallel_size, + worker_use_ray=worker_use_ray, + max_parallel_loading_workers=max_parallel_loading_workers, + block_size=block_size, + gpu_memory_utilization=gpu_memory_utilization, + swap_space=swap_space, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + max_paddings=max_paddings, + revision=revision, + # code_revision=code_revision, + tokenizer_revision=tokenizer_revision, + quantization=quantization, + enforce_eager=enforce_eager, + max_context_len_to_capture=max_context_len_to_capture, + disable_custom_all_reduce=disable_custom_all_reduce, + enable_lora=max_lora_rank is not None, + max_loras=max_loras, + max_lora_rank=max_lora_rank, + lora_extra_vocab_size=lora_extra_vocab_size, + max_cpu_loras=max_cpu_loras, + device=device, + ) + + self.log.info("VLLM model loaded successfully.") + return engine + + def load_models_llama_cpp( + self, + model: str, + filename: Optional[str], + local_dir: Optional[Union[str, os.PathLike[str]]] = None, + n_gpu_layers: int = 0, + split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER, + main_gpu: int = 0, + tensor_split: Optional[List[float]] = None, + vocab_only: bool = False, + use_mmap: bool = True, + use_mlock: bool = False, + kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None, + seed: int = 
llama_cpp.LLAMA_DEFAULT_SEED, + n_ctx: int = 512, + n_batch: int = 512, + n_threads: Optional[int] = None, + n_threads_batch: Optional[int] = None, + rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED, + rope_freq_base: float = 0.0, + rope_freq_scale: float = 0.0, + yarn_ext_factor: float = -1.0, + yarn_attn_factor: float = 1.0, + yarn_beta_fast: float = 32.0, + yarn_beta_slow: float = 1.0, + yarn_orig_ctx: int = 0, + mul_mat_q: bool = True, + logits_all: bool = False, + embedding: bool = False, + offload_kqv: bool = True, + last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, + lora_scale: float = 1.0, + lora_path: Optional[str] = None, + numa: Union[bool, int] = False, + chat_format: Optional[str] = None, + chat_handler: Optional[llama_cpp.llama_chat_format.LlamaChatCompletionHandler] = None, + draft_model: Optional[llama_cpp.LlamaDraftModel] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + verbose: bool = True, + **kwargs, + ) -> Tuple[LlamaCPP, Optional[PreTrainedTokenizerBase]]: + """ + Initializes and loads LLaMA model with llama.cpp backend, along with an optional tokenizer. + + Args: + model (str): Huggingface ID to the LLaMA model. + filename: A filename or glob pattern to match the model file in the repo. + local_dir: The local directory to save the model to. + n_gpu_layers (int): Number of layers to offload to GPU. Default is 0. + split_mode (int): Split mode for distributing model across GPUs. + main_gpu (int): Main GPU index. + tensor_split (Optional[List[float]]): Tensor split configuration. + vocab_only (bool): Whether to load vocabulary only. + use_mmap (bool): Use memory-mapped files for model loading. + use_mlock (bool): Lock model data in RAM. + kv_overrides (Optional[Dict[str, Union[bool, int, float]]]): Key-value pairs for model overrides. + seed (int): Random seed for initialization. + n_ctx (int): Number of context tokens. + n_batch (int): Batch size for processing prompts. 
+ n_threads (Optional[int]): Number of threads for generation. + n_threads_batch (Optional[int]): Number of threads for batch processing. + rope_scaling_type (Optional[int]): RoPE scaling type. + rope_freq_base (float): Base frequency for RoPE. + rope_freq_scale (float): Frequency scaling for RoPE. + yarn_ext_factor (float): YaRN extrapolation mix factor. + yarn_attn_factor (float): YaRN attention factor. + yarn_beta_fast (float): YaRN beta fast parameter. + yarn_beta_slow (float): YaRN beta slow parameter. + yarn_orig_ctx (int): Original context size for YaRN. + mul_mat_q (bool): Whether to multiply matrices for queries. + logits_all (bool): Return logits for all tokens. + embedding (bool): Enable embedding mode only. + offload_kqv (bool): Offload K, Q, V matrices to GPU. + last_n_tokens_size (int): Size for the last_n_tokens buffer. + lora_base (Optional[str]): Base model path for LoRA. + lora_scale (float): Scale factor for LoRA adjustments. + lora_path (Optional[str]): Path to LoRA adjustments. + numa (Union[bool, int]): NUMA configuration. + chat_format (Optional[str]): Chat format configuration. + chat_handler (Optional[llama_cpp.LlamaChatCompletionHandler]): Handler for chat completions. + draft_model (Optional[llama_cpp.LlamaDraftModel]): Draft model for speculative decoding. + tokenizer (Optional[PreTrainedTokenizerBase]): Custom tokenizer instance. + verbose (bool): Enable verbose logging. + **kwargs: Additional keyword arguments. + + Returns: + Tuple[LlamaCPP, Optional[PreTrainedTokenizerBase]]: The loaded LLaMA model and tokenizer. 
def done(self):
    """
    Finalize the run: when a notification email is configured, flush the output
    and email download links for the output's S3 location to that address.
    """
    if not self.notification_email:
        # Nothing to notify; leave the output untouched.
        return

    self.output.flush()
    send_email(
        recipient=self.notification_email,
        bucket_name=self.output.bucket,
        prefix=self.output.s3_folder,
    )
def create_presigned_urls(bucket_name: str, prefix: str, expires_in: int = 86400) -> List[str]:
    """
    Generate presigned download URLs for all objects under a specific S3 prefix.

    :param bucket_name: Name of the S3 bucket
    :param prefix: The common prefix of all keys you want to match, effectively a folder path in S3
    :param expires_in: Lifetime of each URL in seconds (defaults to 86400, i.e. one day)
    :return: List of presigned ``get_object`` URLs, in listing order
    """
    s3_client = boto3.client("s3")

    # list_objects_v2 returns at most 1000 keys per call; the paginator follows the
    # continuation tokens so prefixes with more objects are not silently truncated.
    paginator = s3_client.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

    presigned_urls = []
    for page in pages:
        # "Contents" is absent from a page when nothing matches the prefix.
        for content in page.get("Contents", []):
            presigned_urls.append(
                s3_client.generate_presigned_url(
                    "get_object",
                    Params={"Bucket": bucket_name, "Key": content["Key"]},
                    ExpiresIn=expires_in,
                )
            )

    return presigned_urls

🧠 Your Download Links from Geniusrise

+

We've prepared the files you requested. Below are the links to download them:

+ +

Please note that these links are valid for 24 hours only.

+

Thank you for using Geniusrise!

+ + + """ + + # Sending the email + try: + response = ses_client.send_email( + Source=from_email, + Destination={"ToAddresses": [recipient]}, + Message={ + "Subject": {"Data": "🧠 Your Download Links from Geniusrise"}, + "Body": {"Html": {"Data": body_html}}, + }, + ) + print(f"Email sent! Message ID: {response['MessageId']}") + except Exception as e: + print(f"An error occurred: {e}") + + +def send_fine_tuning_email( + recipient: str, bucket_name: str, prefix: str, from_email: str = "Geniusrise " +) -> None: + """ + Send a nicely formatted email with the list of downloadable links. + + :param recipient: Email address to send the links to + :param links: List of presigned URLs + :param from_email: The email address sending this email + """ + ses_client = boto3.client("ses") + + links = create_presigned_urls(bucket_name=bucket_name, prefix=prefix) + + # Email body + body_html = """ + + + +

🧠 Your Fine-Tuned Model Download Links from Geniusrise

+

We've prepared the models you requested. Below are the links to download them:

+ +

Please note that these links are valid for 24 hours only.

+

Thank you for using Geniusrise!

+ + + """ + + # Sending the email + try: + response = ses_client.send_email( + Source=from_email, + Destination={"ToAddresses": [recipient]}, + Message={ + "Subject": {"Data": "🧠 Your Fine-Tuned Model Download Links from Geniusrise"}, + "Body": {"Html": {"Data": body_html}}, + }, + ) + print(f"Email sent! Message ID: {response['MessageId']}") + except Exception as e: + print(f"An error occurred: {e}") diff --git a/geniusrise_text/base/fine_tune.py b/geniusrise_text/base/fine_tune.py new file mode 100644 index 0000000..861437a --- /dev/null +++ b/geniusrise_text/base/fine_tune.py @@ -0,0 +1,619 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def __init__(
    self,
    input: BatchInput,
    output: BatchOutput,
    state: State,
    **kwargs,
) -> None:
    """
    Initialize the bolt.

    Args:
        input (BatchInput): The batch input data.
        output (BatchOutput): The output data.
        state (State): The state manager.
        **kwargs: Additional keyword arguments. They are accepted but neither stored
            nor forwarded to the parent constructor.
    """
    super().__init__(input=input, output=output, state=state)
    # Assigned after the parent constructor runs, so these are the values the bolt ends up with.
    self.input = input
    self.output = output
    self.state = state

    self.log = setup_logger(self)
+ + Raises: + NotImplementedError: This method should be overridden by subclasses. + """ + raise NotImplementedError("Subclasses should implement this!") + + def preprocess_data(self, **kwargs): + """Load and preprocess the dataset""" + try: + if self.use_huggingface_dataset: + _dataset = self.load_dataset(self.huggingface_dataset, **kwargs) + if self.evaluate: + _dataset = _dataset.train_test_split(test_size=0.2) + self.train_dataset = _dataset["train"] + self.eval_dataset = _dataset["test"] + else: + self.train_dataset = _dataset["train"] + self.eval_dataset = None + elif self.evaluate: + train_dataset_path = os.path.join(self.input.get(), "train") + eval_dataset_path = os.path.join(self.input.get(), "test") + self.train_dataset = self.load_dataset(train_dataset_path, **kwargs) + self.eval_dataset = self.load_dataset(eval_dataset_path, **kwargs) + else: + self.train_dataset = self.load_dataset(self.input.get(), **kwargs) + self.eval_dataset = None + except Exception as e: + self.log.exception(f"Failed to preprocess data: {e}") + raise e + + def load_models( + self, + model_name: str, + tokenizer_name: str, + model_class: str = "AutoModel", + tokenizer_class: str = "AutoTokenizer", + device_map: str | dict = "auto", + precision: str = "bfloat16", + quantization: Optional[int] = None, + lora_config: Optional[dict] = None, + use_accelerate: bool = False, + accelerate_no_split_module_classes: List[str] = [], + **kwargs, + ): + """ + Load the model and tokenizer. + + Args: + model_name (str): The name of the model to be loaded. + tokenizer_name (str, optional): The name of the tokenizer to be loaded. Defaults to None. + model_class (str, optional): The class of the model. Defaults to "AutoModel". + tokenizer_class (str, optional): The class of the tokenizer. Defaults to "AutoTokenizer". + device (Union[str, torch.device], optional): The device to be used. Defaults to "cuda". + precision (str, optional): The precision to be used. 
Choose from 'float32', 'float16', 'bfloat16'. Defaults to "float32". + quantization (Optional[int], optional): The quantization to be used. Defaults to None. + lora_config (Optional[dict], optional): The LoRA configuration to be used. Defaults to None. + use_accelerate (bool, optional): Whether to use accelerate. Defaults to False. + accelerate_no_split_module_classes (List[str], optional): The list of no split module classes to be used. Defaults to []. + **kwargs: Additional keyword arguments. + + Raises: + ValueError: If an unsupported precision is chosen. + + Returns: + None + """ + try: + # Determine the torch dtype based on precision + if precision == "float32": + torch_dtype = torch.float32 + elif precision == "float": + torch_dtype = torch.float + elif precision == "float64": + torch_dtype = torch.float64 + elif precision == "double": + torch_dtype = torch.double + elif precision == "float16": + torch_dtype = torch.float16 + elif precision == "bfloat16": + torch_dtype = torch.bfloat16 + elif precision == "half": + torch_dtype = torch.half + elif precision == "uint8": + torch_dtype = torch.uint8 + elif precision == "int8": + torch_dtype = torch.int8 + elif precision == "int16": + torch_dtype = torch.int16 + elif precision == "short": + torch_dtype = torch.short + elif precision == "int32": + torch_dtype = torch.int32 + elif precision == "int": + torch_dtype = torch.int + elif precision == "int64": + torch_dtype = torch.int64 + elif precision == "quint8": + torch_dtype = torch.quint8 + elif precision == "qint8": + torch_dtype = torch.qint8 + elif precision == "qint32": + torch_dtype = torch.qint32 + else: + torch_dtype = None + + peft_target_modules = [] + if ":" in model_name: + model_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + else: + model_revision = None + self.model_name = model_name + self.log.info(f"Loading model {model_name} and branch {model_revision}") + + with init_empty_weights(): + model = 
getattr(__import__("transformers"), str(model_class)).from_pretrained( + model_name, revision=model_revision, device_map=device_map + ) + known_targets = [ + v + for k, v in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.items() + if k.lower() in model_name.lower() + ] + if len(known_targets) > 0: + peft_target_modules = known_targets[0] + else: + # very generic strategy, may lead to VRAM usage explosion on the wrong model erasing all advantage + for name, module in model.named_modules(): + if isinstance(module, (torch.nn.Linear, torch.nn.Conv1d)) and "head" not in name: + name = name.split(".")[-1] + if name not in peft_target_modules: + peft_target_modules.append(name) + self.log.info(f"Targeting these modules for PEFT: {peft_target_modules}") + + if use_accelerate: + if precision == "float16": + device_map = infer_auto_device_map( + model, + dtype="float16", + no_split_module_classes=accelerate_no_split_module_classes, + **kwargs, + ) + elif precision == "bfloat16": + device_map = infer_auto_device_map( + model, + dtype="bfloat16", + no_split_module_classes=accelerate_no_split_module_classes, + **kwargs, + ) + else: + device_map = infer_auto_device_map( + model, + no_split_module_classes=accelerate_no_split_module_classes, + **kwargs, + ) + self.log.info(f"Inferred device map {device_map}") + + # Create the LoRA config for PEFT + if lora_config: + if len(peft_target_modules) > 0: + lora_config = LoraConfig(target_modules=peft_target_modules, **lora_config) + else: + lora_config = LoraConfig(**lora_config) + self.peft_config = lora_config + # you cannot fine-tune quantized models without LoRA + if quantization and not lora_config: + lora_config = { + "r": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "bias": "none", + "task_type": "CAUSAL_LM", + } + self.peft_config = LoraConfig(target_modules=peft_target_modules, **lora_config) + if lora_config: + self.log.info(f"LoRA config: {self.peft_config}") + + # Load model and tokenizer + if quantization == 8: + # 
Use AutoConfig to automatically load the configuration + if self.model_name.lower() == "local": # type: ignore + self.log.info(f"Loading local model {model_class} : {self.input.get()}") + if not self.config: # type: ignore + self.config = AutoConfig.from_pretrained(os.path.join(self.input.get(), "/model")) + self.model = getattr(__import__("transformers"), str(model_class)).from_pretrained( + os.path.join(self.input.get(), "/model"), + device_map=device_map, + torch_dtype=torch_dtype, + load_in_8bit=True, + config=self.config, + **kwargs, + ) + else: + self.log.info(f"Loading from huggingface hub: {model_class} : {model_name}") + if not self.config: + self.config = AutoConfig.from_pretrained(self.model_name) + self.model = getattr(__import__("transformers"), str(model_class)).from_pretrained( + self.model_name, + revision=model_revision, + device_map=device_map, + torch_dtype=torch_dtype, + load_in_8bit=True, + config=self.config, + **kwargs, + ) + elif quantization == 4: + # Use AutoConfig to automatically load the configuration + if self.model_name.lower() == "local": # type: ignore + self.log.info(f"Loading local model {model_class} : {self.input.get()}") + if not self.config: + self.config = AutoConfig.from_pretrained(os.path.join(self.input.get(), "/model")) + self.model = getattr(__import__("transformers"), str(model_class)).from_pretrained( + os.path.join(self.input.get(), "/model"), + device_map=device_map, + torch_dtype=torch_dtype, + load_in_4bit=True, + config=self.config, + **kwargs, + ) + else: + self.log.info(f"Loading from huggingface hub: {model_class} : {model_name}") + if not self.config: + self.config = AutoConfig.from_pretrained(self.model_name) + self.model = getattr(__import__("transformers"), str(model_class)).from_pretrained( + self.model_name, + revision=model_revision, + device_map=device_map, + torch_dtype=torch_dtype, + load_in_4bit=True, + config=self.config, + **kwargs, + ) + else: + # Use AutoConfig to automatically load the 
configuration + if self.model_name.lower() == "local": # type: ignore + self.log.info(f"Loading local model {model_class} : {self.input.get()}") + if not self.config: + self.config = AutoConfig.from_pretrained(os.path.join(self.input.get(), "/model")) + self.model = getattr(__import__("transformers"), str(model_class)).from_pretrained( + os.path.join(self.input.get(), "/model"), + device_map=device_map, + torch_dtype=torch_dtype, + config=self.config, + **kwargs, + ) + else: + self.log.info(f"Loading from huggingface hub: {model_class} : {model_name}") + if not self.config: + self.config = AutoConfig.from_pretrained(self.model_name) + self.model = getattr(__import__("transformers"), str(model_class)).from_pretrained( + model_name, + revision=model_revision, + device_map=device_map, + torch_dtype=torch_dtype, + config=self.config, + **kwargs, + ) + + if ":" in tokenizer_name: + tokenizer_revision = tokenizer_name.split(":")[1] + tokenizer_name = tokenizer_name.split(":")[0] + else: + tokenizer_revision = None + self.tokenizer_name = tokenizer_name + self.tokenizer_revision = tokenizer_revision + + if tokenizer_name.lower() == "local": # type: ignore + self.log.info(f"Loading local tokenizer : {tokenizer_class} : {self.input.get()}") + self.tokenizer = getattr(__import__("transformers"), str(tokenizer_class)).from_pretrained( + os.path.join(self.input.get(), "/model") + ) + else: + self.log.info(f"Loading tokenizer from huggingface hub: {tokenizer_class} : {tokenizer_name}") + self.tokenizer = getattr(__import__("transformers"), str(tokenizer_class)).from_pretrained( + tokenizer_name, revision=tokenizer_revision + ) + except Exception as e: + self.log.exception(f"Failed to load model: {e}") + raise + + def upload_to_hf_hub( + self, + hf_repo_id: Optional[str] = None, + hf_commit_message: Optional[str] = None, + hf_token: Optional[str] = None, + hf_private: Optional[str] = None, + hf_create_pr: Optional[str] = None, + ): + """Upload the model and tokenizer to Hugging 
Face Hub.""" + try: + if self.model: + self.model.to("cpu").push_to_hub( + repo_id=hf_repo_id if hf_repo_id else self.hf_repo_id, # type: ignore + commit_message=hf_commit_message if hf_commit_message else self.hf_commit_message, # type: ignore + token=hf_token if hf_token else self.hf_token, # type: ignore + private=hf_private if hf_private else self.hf_private, # type: ignore + create_pr=hf_create_pr if hf_create_pr else self.hf_create_pr, # type: ignore + ) + if self.tokenizer: + self.tokenizer.push_to_hub( + repo_id=hf_repo_id if hf_repo_id else self.hf_repo_id, # type: ignore + commit_message=hf_commit_message if hf_commit_message else self.hf_commit_message, # type: ignore + token=hf_token if hf_token else self.hf_token, # type: ignore + private=hf_private if hf_private else self.hf_private, # type: ignore + create_pr=hf_create_pr if hf_create_pr else self.hf_create_pr, # type: ignore + ) + except Exception as e: + self.log.exception(f"Failed to upload model to huggingface hub: {e}") + raise + + def compute_metrics(self, eval_pred: EvalPrediction) -> Optional[Dict[str, float]] | Dict[str, float]: + """ + Compute metrics for evaluation. This class implements a simple classification evaluation, tasks should ideally override this. + + Args: + eval_pred (EvalPrediction): The evaluation predictions. + + Returns: + dict: The computed metrics. 
+ """ + predictions, labels = eval_pred + predictions = predictions[0] if isinstance(predictions, tuple) else predictions + labels = labels[0] if isinstance(labels, tuple) else labels + predictions = np.argmax(predictions, axis=1) + + return { + "accuracy": accuracy_score(labels, predictions), + "precision": precision_recall_fscore_support(labels, predictions, average="binary")[0], + "recall": precision_recall_fscore_support(labels, predictions, average="binary")[1], + "f1": precision_recall_fscore_support(labels, predictions, average="binary")[2], + } + + def fine_tune( + self, + model_name: str, + tokenizer_name: str, + num_train_epochs: int, + per_device_batch_size: int, + model_class: str = "AutoModel", + tokenizer_class: str = "AutoTokenizer", + device_map: str | dict = "auto", + precision: str = "bfloat16", + quantization: Optional[int] = None, + lora_config: Optional[dict] = None, + use_accelerate: bool = False, + use_trl: bool = False, + accelerate_no_split_module_classes: List[str] = [], + compile: bool = False, + evaluate: bool = False, + save_steps: int = 500, + save_total_limit: Optional[int] = None, + load_best_model_at_end: bool = False, + metric_for_best_model: Optional[str] = None, + greater_is_better: Optional[bool] = None, + map_data: Optional[Callable] = None, + use_huggingface_dataset: bool = False, + huggingface_dataset: str = "", + hf_repo_id: Optional[str] = None, + hf_commit_message: Optional[str] = None, + hf_token: Optional[str] = None, + hf_private: bool = True, + hf_create_pr: bool = False, + notification_email: str = "", + learning_rate: float = 1e-5, + **kwargs, + ): + """ + Fine-tunes a pre-trained Hugging Face model. + + Args: + model_name (str): The name of the pre-trained model. + tokenizer_name (str): The name of the pre-trained tokenizer. + num_train_epochs (int): The total number of training epochs to perform. + per_device_batch_size (int): The batch size per device during training. 
+ model_class (str, optional): The model class to use. Defaults to "AutoModel". + tokenizer_class (str, optional): The tokenizer class to use. Defaults to "AutoTokenizer". + device_map (str | dict, optional): The device map for distributed training. Defaults to "auto". + precision (str, optional): The precision to use for training. Defaults to "bfloat16". + quantization (int, optional): The quantization level to use for training. Defaults to None. + lora_config (dict, optional): Configuration for PEFT LoRA optimization. Defaults to None. + use_accelerate (bool, optional): Whether to use accelerate for distributed training. Defaults to False. + use_trl (bool, optional): Whether to use TRL for training. Defaults to False. + accelerate_no_split_module_classes (List[str], optional): The module classes to not split during distributed training. Defaults to []. + evaluate (bool, optional): Whether to evaluate the model after training. Defaults to False. + compile (bool, optional): Whether to compile the model before fine-tuning. Defaults to True. + save_steps (int, optional): Number of steps between checkpoints. Defaults to 500. + save_total_limit (Optional[int], optional): Maximum number of checkpoints to keep. Older checkpoints are deleted. Defaults to None. + load_best_model_at_end (bool, optional): Whether to load the best model (according to evaluation) at the end of training. Defaults to False. + metric_for_best_model (Optional[str], optional): The metric to use to compare models. Defaults to None. + greater_is_better (Optional[bool], optional): Whether a larger value of the metric indicates a better model. Defaults to None. + use_huggingface_dataset (bool, optional): Whether to load a dataset from huggingface hub. + huggingface_dataset (str, optional): The huggingface dataset to use. + map_data (Callable, optional): A function to map data before training. Defaults to None. + hf_repo_id (str, optional): The Hugging Face repo ID. Defaults to None. 
+ hf_commit_message (str, optional): The Hugging Face commit message. Defaults to None. + hf_token (str, optional): The Hugging Face token. Defaults to None. + hf_private (bool, optional): Whether to make the repo private. Defaults to True. + hf_create_pr (bool, optional): Whether to create a pull request. Defaults to False. + notification_email (str, optional): Whether to notify after job is complete. Defaults to None. + learning_rate (float, optional): Learning rate for backpropagation. + **kwargs: Additional keyword arguments to pass to the model. + + Returns: + None + """ + try: + # Save everything + self.model_name = model_name + self.tokenizer_name = tokenizer_name + self.num_train_epochs = num_train_epochs + self.per_device_batch_size = per_device_batch_size + self.model_class = model_class + self.tokenizer_class = tokenizer_class + self.device_map = device_map + self.precision = precision + self.quantization = quantization + self.lora_config = lora_config # type: ignore + self.use_accelerate = use_accelerate + self.use_trl = use_trl + self.accelerate_no_split_module_classes = accelerate_no_split_module_classes + self.evaluate = evaluate + self.use_huggingface_dataset = use_huggingface_dataset + self.huggingface_dataset = huggingface_dataset + self.hf_repo_id = hf_repo_id + self.hf_commit_message = hf_commit_message + self.hf_token = hf_token + self.hf_private = hf_private + self.hf_create_pr = hf_create_pr + self.map_data = map_data + self.notification_email = notification_email + self.learning_rate = learning_rate + self.config = None + + model_kwargs = {k.replace("model_", ""): v for k, v in kwargs.items() if "model_" in k} + self.model_kwargs = model_kwargs + + self.load_models( + model_name=self.model_name, + tokenizer_name=self.tokenizer_name, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, + device_map=self.device_map, + precision=self.precision, + quantization=self.quantization, + lora_config=self.lora_config, + 
use_accelerate=self.use_accelerate, + accelerate_no_split_module_classes=self.accelerate_no_split_module_classes, + **model_kwargs, + ) + + if self.tokenizer and not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + self.model.config.pad_token_id = self.tokenizer.eos_token_id + + # Load dataset + dataset_kwargs = {k.replace("data_", ""): v for k, v in kwargs.items() if "data_" in k} + self.dataset_kwargs = dataset_kwargs + self.preprocess_data(**dataset_kwargs) + + # Separate training and evaluation arguments + trainer_kwargs = {k.replace("trainer_", ""): v for k, v in kwargs.items() if "trainer_" in k} + self.trainer_kwargs = trainer_kwargs + training_kwargs = {k.replace("training_", ""): v for k, v in kwargs.items() if "training_" in k} + self.training_kwargs = training_kwargs + + # Create training arguments + training_args = TrainingArguments( + output_dir=os.path.join(self.output.output_folder, "model"), + num_train_epochs=num_train_epochs, + per_device_train_batch_size=per_device_batch_size, + per_device_eval_batch_size=per_device_batch_size, + save_steps=save_steps, + save_total_limit=save_total_limit, + load_best_model_at_end=load_best_model_at_end, + metric_for_best_model=metric_for_best_model, + greater_is_better=greater_is_better, + dataloader_num_workers=4, + learning_rate=self.learning_rate, + **training_kwargs, + ) + + # Add adapters to the model for fine-tuning + if self.lora_config and not use_trl: + self.model.enable_input_require_grads() + self.model = get_peft_model(self.model, peft_config=self.peft_config) + + if compile: + self.model = torch.compile(self.model) + + # Create trainer + if use_trl: + self.model = get_peft_model(self.model, peft_config=self.peft_config) + trainer = SFTTrainer( + model=self.model, + args=training_args, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + tokenizer=self.tokenizer, + compute_metrics=self.compute_metrics, + data_collator=self.data_collator if 
hasattr(self, "data_collator") else None, + peft_config=self.peft_config, + **trainer_kwargs, + ) + else: + trainer = Trainer( + model=self.model, + args=training_args, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + tokenizer=self.tokenizer, + compute_metrics=self.compute_metrics, + data_collator=self.data_collator if hasattr(self, "data_collator") else None, + **trainer_kwargs, + ) + + # Train the model + trainer.train() + trainer.save_model(self.output.output_folder) + + if self.evaluate: + eval_result = trainer.evaluate() + self.log.info(f"Evaluation results: {eval_result}") + + # Save the model configuration to Hugging Face Hub if hf_repo_id is not None + if self.hf_repo_id and self.config: + self.config.save_pretrained(os.path.join(self.output.output_folder, "model")) + self.upload_to_hf_hub() + except Exception as e: + self.log.exception(f"Failed to fine tune model: {e}") + self.state.set_state(self.id, {"success": False, "exception": str(e)}) + raise + self.state.set_state(self.id, {"success": True}) + + self.done() + + def done(self): + if self.notification_email: + self.output.flush() + send_fine_tuning_email( + recipient=self.notification_email, bucket_name=self.output.bucket, prefix=self.output.s3_folder + ) diff --git a/geniusrise_text/base/tests/test_api.py b/geniusrise_text/base/tests/test_api.py new file mode 100644 index 0000000..77d1dd3 --- /dev/null +++ b/geniusrise_text/base/tests/test_api.py @@ -0,0 +1,211 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
import itertools
import os
import shutil

import pytest
import torch

# import transformers
from geniusrise.core import BatchInput, BatchOutput, InMemoryState

from geniusrise_text.base.api import TextAPI


@pytest.fixture(
    params=[
        # model_name, model_class, tokenizer_class, use_cuda, precision, quantization, device_map, max_memory, torchscript
        # fmt: off
        ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", 0, None, None, False),
        ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", False, "float32", 0, None, None, False),
        ("bigscience/bloom-560m", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, None, None, False),
        ("meta-llama/Llama-2-7b-hf", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, None, None, False),
        ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, "cuda:0", None, False),
        ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, "cuda:0", None, False),
        ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 8, "cuda:0", None, False),
        ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, "auto", None, True),
        ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, "auto", None, True),
        ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 8, "auto", None, True),
        ("TheBloke/Mistral-7B-v0.1-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", None, "cuda:0", None, False),
        # fmt: on
    ]
)
def model_config(request):
    """Yield one model configuration tuple per parametrized run."""
    return request.param


# Fixtures to initialize TextAPI instance
@pytest.fixture
def hfa():
    """Provide a `TextAPI` wired to scratch input/output folders, removed again on teardown."""
    input_dir = "./input_dir"
    output_dir = "./output_dir"

    input = BatchInput(input_dir, "geniusrise-test", "api_input")
    output = BatchOutput(output_dir, "geniusrise-test", "api_output")
    state = InMemoryState()

    api = TextAPI(
        input=input,
        output=output,
        state=state,
    )
    yield api  # provide the fixture value

    # cleanup: rmtree, because os.rmdir raises as soon as a test left a file behind
    for path in (input_dir, output_dir):
        if os.path.exists(path):
            shutil.rmtree(path, ignore_errors=True)


def _load_models_from_config(hfa, model_config):
    """Unpack a `model_config` tuple and load the model/tokenizer pair through `hfa.load_models`."""
    (
        model_name,
        model_class,
        tokenizer_class,
        use_cuda,
        precision,
        quantization,
        device_map,
        max_memory,
        torchscript,
    ) = model_config

    # "name:revision" selects a branch; the same revision is used for the tokenizer.
    revision = None
    if ":" in model_name:
        parts = model_name.split(":")
        model_name, revision = parts[0], parts[1]

    return hfa.load_models(
        model_name=model_name,
        model_revision=revision,
        tokenizer_name=model_name,
        tokenizer_revision=revision,
        model_class=model_class,
        tokenizer_class=tokenizer_class,
        use_cuda=use_cuda,
        precision=precision,
        quantization=quantization,
        device_map=device_map,
        max_memory=max_memory,
        torchscript=torchscript,
    )


def test_load_models(hfa, model_config):
    """Every configuration must produce a non-empty model and a tokenizer."""
    model, tokenizer = _load_models_from_config(hfa, model_config)
    assert model is not None
    assert tokenizer is not None
    assert len(list(model.named_modules())) > 0

    del model
    del tokenizer
    torch.cuda.empty_cache()


# Define strategies and associated parameters
strategies = {
    "generate": {},
    "greedy_search": {},
    "beam_search": {"num_beams": 4},
    "beam_sample": {"num_beams": 4, "temperature": 0.7, "top_k": 20},
    "group_beam_search": {"num_beams": 4, "num_beam_groups": 2},
}

# Define other parameters
length_params = {
    "max_length": [20, 30],
    "min_length": [0, 10],
    "early_stopping": [False, True],
}
gen_strategy_params = {
    "do_sample": [False, True],
}
logit_params = {
    "temperature": [1.0, 0.7],
    "top_k": [50, 20],
    "top_p": [1.0, 0.9],
    "repetition_penalty": [1.0, 1.5],
    "length_penalty": [1.0, 0.5],
    "no_repeat_ngram_size": [0, 2],
}
# Merge all the parameters into one dictionary for itertools.product
all_params = {**length_params, **gen_strategy_params, **logit_params}


@pytest.mark.parametrize("strategy", list(strategies.keys()))
def test_generate_strategies(hfa, model_config, strategy):
    """Each decoding strategy must return a string for the first parameter combination."""
    model, tokenizer = _load_models_from_config(hfa, model_config)
    hfa.model = model
    hfa.tokenizer = tokenizer

    # Only the first combination of the parameter grid is exercised, so take it lazily
    # instead of materializing the whole product. Strategy-specific params override it.
    first_values = next(itertools.product(*all_params.values()))
    param_set = {**dict(zip(all_params.keys(), first_values)), **strategies[strategy]}

    generated_text = hfa.generate(prompt="Once upon a time", decoding_strategy=strategy, **param_set)
    assert generated_text is not None
    assert isinstance(generated_text, str)

    # Cleanup
    del model
    del tokenizer
    torch.cuda.empty_cache()
@@ -0,0 +1,211 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import os + +import pytest +import torch + +# import transformers +from geniusrise.core import BatchInput, BatchOutput, InMemoryState + +from geniusrise_text.base.bulk import TextBulk + + +@pytest.fixture( + params=[ + # model_name, model_class, tokenizer_class, use_cuda, precision, quantization, device_map, max_memory, torchscript + # fmt: off + ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", 0, None, None, False), + ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", False, "float32", 0, None, None, False), + ("bigscience/bloom-560m", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, None, None, False), + ("meta-llama/Llama-2-7b-hf", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, None, None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, "cuda:0", None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, "cuda:0", None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 8, "cuda:0", None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, "auto", None, True), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, "auto", None, True), + 
("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 8, "auto", None, True), + ("TheBloke/Mistral-7B-v0.1-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", None, "cuda:0", None, False), + # fmt: on + ] +) +def model_config(request): + return request.param + + +# Fixtures to initialize TextBulk instance +@pytest.fixture +def hfa(): + input_dir = "./input_dir" + output_dir = "./output_dir" + + input = BatchInput(input_dir, "geniusrise-test", "api_input") + output = BatchOutput(output_dir, "geniusrise-test", "api_output") + state = InMemoryState() + + hfa = TextBulk( + input=input, + output=output, + state=state, + ) + yield hfa # provide the fixture value + + # cleanup + if os.path.exists(input_dir): + os.rmdir(input_dir) + if os.path.exists(output_dir): + os.rmdir(output_dir) + + +def test_load_models(hfa, model_config): + ( + model_name, + model_class, + tokenizer_class, + use_cuda, + precision, + quantization, + device_map, + max_memory, + torchscript, + ) = model_config + + if ":" in model_name: + _model_name = model_name + model_revision = _model_name.split(":")[1] + model_name = _model_name.split(":")[0] + tokenizer_revision = _model_name.split(":")[1] + tokenizer_name = _model_name.split(":")[0] + else: + model_revision = None + tokenizer_revision = None + + model, tokenizer = hfa.load_models( + model_name=model_name, + model_revision=model_revision, + tokenizer_name=model_name, + tokenizer_revision=tokenizer_revision, + model_class=model_class, + tokenizer_class=tokenizer_class, + use_cuda=use_cuda, + precision=precision, + quantization=quantization, + device_map=device_map, + max_memory=max_memory, + torchscript=torchscript, + ) + assert model is not None + assert tokenizer is not None + assert len(list(model.named_modules())) > 0 + + del model + del tokenizer + torch.cuda.empty_cache() + + +# Define strategies and associated parameters +strategies = { + "generate": {}, + 
"greedy_search": {}, + "beam_search": {"num_beams": 4}, + "beam_sample": {"num_beams": 4, "temperature": 0.7, "top_k": 20}, + "group_beam_search": {"num_beams": 4, "num_beam_groups": 2}, +} + +# Define other parameters +length_params = { + "max_length": [20, 30], + "min_length": [0, 10], + "early_stopping": [False, True], +} +gen_strategy_params = { + "do_sample": [False, True], +} +logit_params = { + "temperature": [1.0, 0.7], + "top_k": [50, 20], + "top_p": [1.0, 0.9], + "repetition_penalty": [1.0, 1.5], + "length_penalty": [1.0, 0.5], + "no_repeat_ngram_size": [0, 2], +} +# Merge all the parameters into one dictionary for itertools.product +all_params = {**length_params, **gen_strategy_params, **logit_params} + + +@pytest.mark.parametrize("strategy", list(strategies.keys())) +def test_generate_strategies(hfa, model_config, strategy): + ( + model_name, + model_class, + tokenizer_class, + use_cuda, + precision, + quantization, + device_map, + max_memory, + torchscript, + ) = model_config + + if ":" in model_name: + _model_name = model_name + model_revision = _model_name.split(":")[1] + model_name = _model_name.split(":")[0] + tokenizer_revision = _model_name.split(":")[1] + tokenizer_name = _model_name.split(":")[0] + else: + model_revision = None + tokenizer_revision = None + + model, tokenizer = hfa.load_models( + model_name=model_name, + model_revision=model_revision, + tokenizer_name=model_name, + tokenizer_revision=tokenizer_revision, + model_class=model_class, + tokenizer_class=tokenizer_class, + use_cuda=use_cuda, + precision=precision, + quantization=quantization, + device_map=device_map, + max_memory=max_memory, + torchscript=torchscript, + ) + hfa.model = model + hfa.tokenizer = tokenizer + + # Strategy-specific params + strategy_params = strategies[strategy] + + # All possible combinations for the current strategy + param_combinations = [ + {**dict(zip(all_params.keys(), values)), **strategy_params} + for values in 
itertools.product(*all_params.values()) + ] + + for param_set in param_combinations: + generated_text = hfa.generate( + prompt="Once upon a time", decoding_strategy=strategy, **param_set # Unpack params into function arguments + ) + assert generated_text is not None + assert isinstance(generated_text, str) + break + + # Cleanup + del model + del tokenizer + torch.cuda.empty_cache() diff --git a/geniusrise_text/base/tests/test_fine_tune.py b/geniusrise_text/base/tests/test_fine_tune.py new file mode 100644 index 0000000..4ea1175 --- /dev/null +++ b/geniusrise_text/base/tests/test_fine_tune.py @@ -0,0 +1,257 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import os
+import shutil
+import tempfile
+
+import numpy as np
+import pytest
+import torch
+from datasets import load_dataset
+from geniusrise.core import BatchInput, BatchOutput, InMemoryState
+from transformers import DataCollatorForLanguageModeling, EvalPrediction
+
+from geniusrise_text.base import TextFineTuner
+
+# SEQ_CLS = "SEQ_CLS"
+# SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM"
+# CAUSAL_LM = "CAUSAL_LM"
+# TOKEN_CLS = "TOKEN_CLS"
+# QUESTION_ANS = "QUESTION_ANS"
+# FEATURE_EXTRACTION = "FEATURE_EXTRACTION"
+
+lora_config = {
+    "r": 16,
+    "lora_alpha": 32,
+    "lora_dropout": 0.05,
+    "bias": "none",
+    "task_type": "CAUSAL_LM",
+}
+
+
+class TestTextFineTuner(TextFineTuner):
+    """Concrete TextFineTuner used by these tests: wikitext data, causal-LM collation."""
+
+    # The name matches pytest's ``Test*`` pattern but this is a helper, not a test case;
+    # opt out of collection explicitly.
+    __test__ = False
+
+    def load_dataset(self, dataset_path, **kwargs):
+        """Return a tokenized 1% slice of wikitext-2; ``dataset_path`` is ignored."""
+        dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")  # Adjust the split as needed
+        dataset = dataset.map(
+            lambda examples: self.tokenizer(
+                examples["text"],
+                truncation=True,
+                padding="max_length",
+                max_length=512,
+            ),
+            batched=True,
+        )
+        return dataset
+
+    def data_collator(self, examples):
+        """Collate ``examples`` with DataCollatorForLanguageModeling (mlm=False)."""
+        return DataCollatorForLanguageModeling(self.tokenizer, mlm=False)(examples)
+
+
+@pytest.fixture
+def bolt():
+    """Yield a TestTextFineTuner backed by fresh temp folders, removed on teardown."""
+    input_dir = tempfile.mkdtemp()
+    output_dir = tempfile.mkdtemp()
+
+    input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input")
+    output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output")
+    state = InMemoryState()
+
+    yield TestTextFineTuner(
+        input=input,
+        output=output,
+        state=state,
+        evaluate=False,
+    )
+
+    # mkdtemp never cleans up after itself; fine-tuned checkpoints written here would
+    # otherwise accumulate across every parametrized case.
+    shutil.rmtree(input_dir, ignore_errors=True)
+    shutil.rmtree(output_dir, ignore_errors=True)
+
+
+def test_bolt_init(bolt):
+    assert bolt.input is not None
+    assert bolt.output is not None
+    assert bolt.state is not None
+
+
+def test_load_dataset(bolt):
+    bolt.model_name = "bert-base-uncased"
+    bolt.tokenizer_name = "bert-base-uncased"
+    bolt.model_class = "AutoModelForCausalLM"
+    bolt.tokenizer_class = "BertTokenizer"
+    bolt.load_models(
+        model_name=bolt.model_name,
+        tokenizer_name=bolt.tokenizer_name,
+        model_class=bolt.model_class,
+        tokenizer_class=bolt.tokenizer_class,
+        device_map=None,
+ ) + dataset = bolt.load_dataset("fake_path") + assert dataset is not None + assert len(dataset) >= 100 + + del bolt.model + del bolt.tokenizer + torch.cuda.empty_cache() + + +def test_fine_tune(bolt): + bolt.fine_tune( + model_name="bert-base-uncased", + tokenizer_name="bert-base-uncased", + num_train_epochs=1, + per_device_batch_size=2, + model_class="AutoModelForCausalLM", + tokenizer_class="BertTokenizer", + evaluate=False, + device_map=None, + ) + bolt.upload_to_hf_hub( + hf_repo_id="ixaxaar/geniusrise-hf-base-test-repo", + hf_commit_message="testing base fine tuner", + hf_token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"), + hf_private=False, + hf_create_pr=True, + ) + + # Check that model files are created in the output directory + assert os.path.isfile(os.path.join(bolt.output.output_folder, "model", "pytorch_model.bin")) + assert os.path.isfile(os.path.join(bolt.output.output_folder, "model", "config.json")) + assert os.path.isfile(os.path.join(bolt.output.output_folder, "model", "training_args.bin")) + + del bolt.model + del bolt.tokenizer + torch.cuda.empty_cache() + + +def test_compute_metrics(bolt): + # Mocking an EvalPrediction object + logits = np.array([[0.6, 0.4], [0.4, 0.6]]) + labels = np.array([0, 1]) + eval_pred = EvalPrediction(predictions=logits, label_ids=labels) + + metrics = bolt.compute_metrics(eval_pred) + + assert "accuracy" in metrics + assert "precision" in metrics + assert "recall" in metrics + assert "f1" in metrics + + +models = { + "small": "bigscience/bloom-560m", + "medium": "meta-llama/Llama-2-7b-hf", + "large": "mistralai/Mistral-7B-v0.1", + "4-bit": "TheBloke/Mistral-7B-v0.1-GPTQ:gptq-4bit-32g-actorder_True", + "8-bit": "TheBloke/OpenHermes-2-Mistral-7B-GPTQ:gptq-8bit-128g-actorder_True", +} + + +@pytest.mark.parametrize( + "model, precision, quantization, lora_config, use_accelerate", + [ + # small + (models["small"], "float16", None, None, False), + (models["small"], "float16", None, None, True), + (models["small"], "float16", 
None, lora_config, False), + (models["small"], "float16", None, lora_config, True), + (models["small"], "float32", None, None, False), + (models["small"], "float32", None, None, True), + (models["small"], "float32", None, lora_config, False), + (models["small"], "float32", None, lora_config, True), + (models["small"], "bfloat16", None, None, False), + (models["small"], "bfloat16", None, None, True), + (models["small"], "bfloat16", None, lora_config, False), + (models["small"], "bfloat16", None, lora_config, True), + # small - 4bit + (models["small"], "float16", 4, lora_config, False), + (models["small"], "float16", 4, lora_config, True), + (models["small"], "float32", 4, lora_config, False), + (models["small"], "float32", 4, lora_config, True), + (models["small"], "bfloat16", 4, lora_config, False), + (models["small"], "bfloat16", 4, lora_config, True), + # small - 8 bit + (models["small"], "float16", 8, lora_config, False), + (models["small"], "float16", 8, lora_config, True), + (models["small"], "float32", 8, lora_config, False), + (models["small"], "float32", 8, lora_config, True), + (models["small"], "bfloat16", 8, lora_config, False), + (models["small"], "bfloat16", 8, lora_config, True), + # large + (models["large"], "bfloat16", 4, lora_config, False), + (models["large"], "bfloat16", 4, lora_config, True), + (models["large"], "float16", 4, lora_config, False), + (models["large"], "float16", 4, lora_config, True), + (models["large"], "float32", 4, lora_config, False), + (models["large"], "float32", 4, lora_config, True), + # 4 bit + (models["4-bit"], "float16", None, lora_config, False), + # 8 bit + (models["8-bit"], "float16", None, lora_config, False), + (models["8-bit"], "float16", None, lora_config, True), + ], +) +def test_fine_tune_options(bolt, model, precision, quantization, lora_config, use_accelerate): + use_trl = False + + if use_trl: + bolt.fine_tune( + model_name=model, + tokenizer_name=model, + model_class="AutoModelForCausalLM", + 
tokenizer_class="AutoTokenizer",
+            num_train_epochs=1,
+            per_device_batch_size=2,
+            precision=precision,
+            quantization=quantization,
+            lora_config=lora_config,
+            use_accelerate=use_accelerate,
+            device_map="auto" if "GPTQ" in model else None,
+            trainer_packing=False if lora_config is not None else None,
+            trainer_dataset_text_field="text" if lora_config is not None else None,
+        )
+    else:
+        bolt.fine_tune(
+            model_name=model,
+            tokenizer_name=model,
+            model_class="AutoModelForCausalLM",
+            tokenizer_class="AutoTokenizer",
+            num_train_epochs=1,
+            per_device_batch_size=2,
+            precision=precision,
+            quantization=quantization,
+            lora_config=lora_config,
+            use_accelerate=use_accelerate,
+            device_map="auto" if "GPTQ" in model else None,
+        )
+
+    # Verify the model has been fine-tuned by checking the existence of model files
+    assert os.path.exists(os.path.join(bolt.output.output_folder, "model", "pytorch_model.bin")) or os.path.exists(
+        os.path.join(bolt.output.output_folder, "model", "adapter_model.bin")
+    )
+    assert os.path.exists(os.path.join(bolt.output.output_folder, "model", "config.json")) or os.path.exists(
+        os.path.join(bolt.output.output_folder, "model", "adapter_config.json")
+    )
+    assert os.path.exists(os.path.join(bolt.output.output_folder, "model", "training_args.bin"))
+
+    # Clear the output directory for the next test. Each artifact is removed on its
+    # own: a full fine-tune writes no adapter files and a LoRA run writes no
+    # pytorch_model.bin, so one missing file must not abort the rest of the cleanup.
+    model_dir = os.path.join(bolt.output.output_folder, "model")
+    for artifact in (
+        "pytorch_model.bin",
+        "adapter_model.bin",
+        "config.json",
+        "adapter_config.json",
+        "training_args.bin",
+    ):
+        try:
+            os.remove(os.path.join(model_dir, artifact))
+        except OSError:
+            pass
+
+    del bolt.model
+    del bolt.tokenizer
+    torch.cuda.empty_cache()
diff --git a/geniusrise_text/base/util.py b/geniusrise_text/base/util.py
new file mode 100644
index 0000000..19ccd62
--- /dev/null
+++
b/geniusrise_text/base/util.py @@ -0,0 +1,46 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = { + "t5": ["q", "v"], + "mt5": ["q", "v"], + "bart": ["q_proj", "v_proj"], + "gpt2": ["c_attn"], + "bloom": ["query_key_value"], + "blip-2": ["q", "v", "q_proj", "v_proj"], + "opt": ["q_proj", "v_proj"], + "gptj": ["q_proj", "v_proj"], + "gpt_neox": ["query_key_value"], + "gpt_neo": ["q_proj", "v_proj"], + "bert": ["query", "value"], + "roberta": ["query", "value"], + "xlm-roberta": ["query", "value"], + "electra": ["query", "value"], + "deberta-v2": ["query_proj", "value_proj"], + "deberta": ["in_proj"], + "layoutlm": ["query", "value"], + "llama": ["q_proj", "v_proj"], + "chatglm": ["query_key_value"], + "gpt_bigcode": ["c_attn"], + "mpt": ["Wqkv"], + "RefinedWebModel": ["query_key_value"], + "RefinedWeb": ["query_key_value"], + "falcon": ["query_key_value"], + "btlm": ["c_proj", "c_attn"], + "codegen": ["qkv_proj"], + "mistral": ["q_proj", "v_proj"], + "stablelm": ["q_proj", "v_proj"], + "phi": ["Wqkv", "out_proj", "fc1", "fc2"], +} diff --git a/geniusrise_text/classification/__init__.py b/geniusrise_text/classification/__init__.py new file mode 100644 index 0000000..98390d1 --- /dev/null +++ b/geniusrise_text/classification/__init__.py @@ -0,0 +1,18 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import TextClassificationAPI +from .bulk import TextClassificationBulk +from .fine_tune import TextClassificationFineTuner diff --git a/geniusrise_text/classification/api.py b/geniusrise_text/classification/api.py new file mode 100644 index 0000000..46f5348 --- /dev/null +++ b/geniusrise_text/classification/api.py @@ -0,0 +1,183 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Any, Dict + +import cherrypy +import numpy as np +import torch +from geniusrise import BatchInput, BatchOutput, State +from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline + +from geniusrise_text.base import TextAPI + +log = logging.getLogger(__file__) + + +class TextClassificationAPI(TextAPI): + r""" + TextClassificationAPI leveraging Hugging Face's transformers for text classification tasks. 
+ This API provides an interface to classify text into various categories like sentiment, topic, intent, etc. + + Attributes: + model (AutoModelForSequenceClassification): A Hugging Face model for sequence classification. + tokenizer (AutoTokenizer): A tokenizer for preprocessing text. + hf_pipeline (Pipeline): A Hugging Face pipeline for text classification. + + Methods: + classify(self): Classifies text using the model and tokenizer. + classification_pipeline(self): Classifies text using the Hugging Face pipeline. + initialize_pipeline(self): Lazy initialization of the classification pipeline. + + Example CLI Usage: + ```bash + genius TextClassificationAPI rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id cardiffnlp/twitter-roberta-base-hate-multiclass-latest-lol \ + listen \ + --args \ + model_name="cardiffnlp/twitter-roberta-base-hate-multiclass-latest" \ + model_class="AutoModelForSequenceClassification" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="float" \ + quantization=0 \ + device_map="cuda:0" \ + max_memory=None \ + torchscript=False \ + endpoint="*" \ + port=3000 \ + cors_domain="http://localhost:3000" \ + username="user" \ + password="password" + ``` + """ + + def __init__( + self, + input: BatchInput, + output: BatchOutput, + state: State, + **kwargs, + ) -> None: + """ + Initializes the TextClassificationAPI with the necessary configurations for input, output, and state management. + + Args: + input (BatchInput): Configuration for the input data. + output (BatchOutput): Configuration for the output data. + state (State): State management for the API. + **kwargs: Additional keyword arguments for extended functionality. 
+        """
+        super().__init__(input=input, output=output, state=state)
+        log.info("Loading Hugging Face API server")
+        self.hf_pipeline = None
+
+    @cherrypy.expose
+    @cherrypy.tools.json_in()
+    @cherrypy.tools.json_out()
+    @cherrypy.tools.allow(methods=["POST"])
+    def classify(self) -> Dict[str, Any]:
+        """
+        Accepts text input and returns classification results. The method uses the model and tokenizer to classify the text
+        and provide the likelihood of each class label.
+
+        The request body is a JSON object whose "text" key holds the string to classify; a missing key is treated
+        as the empty string.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the original input text under "input" and, under "label_scores",
+            either a mapping of label name to softmax probability or, for models with a single output logit,
+            a one-element list holding the sigmoid of that logit.
+
+        Example CURL Request for text classification:
+        ```bash
+        /usr/bin/curl -X POST localhost:3000/api/v1/classify \
+            -H "Content-Type: application/json" \
+            -d '{
+                "text": "tata sons lost a major contract to its rival mahindra motors"
+            }' | jq
+        ```
+        """
+        data: Dict[str, str] = cherrypy.request.json
+        text = data.get("text", "")
+
+        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+
+        if next(self.model.parameters()).is_cuda:
+            inputs = {k: v.cuda() for k, v in inputs.items()}
+
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
+        # Always finish on CPU in float32: half/bfloat16 logits cannot be converted for the
+        # JSON response reliably, and the logits need not sit on the first parameter's device.
+        logits = logits.detach().float().cpu()
+
+        # Handling a single number output (reuse `logits`: `outputs` may be a plain tuple)
+        if logits.numel() == 1:
+            scores = torch.sigmoid(logits).flatten()
+            return {"input": text, "label_scores": scores.tolist()}
+
+        scores = torch.nn.functional.softmax(logits, dim=-1)[0].tolist()
+
+        # Look labels up by class id instead of by position in the dict's value order.
+        id2label = getattr(self.model.config, "id2label", None) or {}  # type: ignore
+        label_scores = {
+            id2label.get(label_id, id2label.get(str(label_id), f"LABEL_{label_id}")): score
+            for label_id, score in enumerate(scores)
+        }
+
+        return {"input": text, "label_scores": label_scores}
+
+    def initialize_pipeline(self):
+        """
+        Lazy initialization of the Hugging Face pipeline for classification.
+ """ + if not self.hf_pipeline: + model = AutoModelForSequenceClassification.from_pretrained(self.model_name) + tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if self.use_cuda: + model.cuda() + self.hf_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer) + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def classification_pipeline(self) -> Dict[str, Any]: + """ + Accepts text input and returns classification results using the Hugging Face pipeline. + + This method uses the Hugging Face pipeline for efficient and robust text classification. It's suitable for various + classification tasks such as sentiment analysis, topic classification, and intent recognition. + + Args: + None - Expects input through the POST request's JSON body. + + Returns: + Dict[str, Any]: A dictionary containing the original input text and the classification results. + + Example CURL Request for text classification: + ```bash + /usr/bin/curl -X POST localhost:3000/api/v1/classification_pipeline \ + -H "Content-Type: application/json" \ + -d '{"text": "The movie was fantastic, with great acting and plot."}' | jq + ``` + """ + data: Dict[str, str] = cherrypy.request.json + text = data.get("text", "") + + self.initialize_pipeline() + result = self.hf_pipeline(text) # type: ignore + + return {"input": text, "result": result} diff --git a/geniusrise_text/classification/api.yml b/geniusrise_text/classification/api.yml new file mode 100644 index 0000000..f243808 --- /dev/null +++ b/geniusrise_text/classification/api.yml @@ -0,0 +1,88 @@ +openapi: 3.0.0 +info: + title: GeniusRise Text Processing API + description: API for text generation and classification using Hugging Face models. 
+ version: "1.0" +servers: + - url: http://localhost:3000/api/v1 + description: Development server +paths: + + /classify: + post: + summary: Classifies text into categories + operationId: classifyText + tags: + - Text Classification + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: The text to classify + required: + - text + responses: + 200: + description: Successfully classified text + content: + application/json: + schema: + type: object + properties: + input: + type: string + label_scores: + type: object + additionalProperties: + type: number + 400: + description: Invalid request + 500: + description: Error during text classification + + /classification_pipeline: + post: + summary: Classifies text using the Hugging Face pipeline + operationId: classifyTextPipeline + tags: + - Text Classification + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: The text to classify + required: + - text + responses: + 200: + description: Successfully classified text using pipeline + content: + application/json: + schema: + type: object + properties: + input: + type: string + result: + type: array + items: + type: object + properties: + label: + type: string + score: + type: number + 400: + description: Invalid request + 500: + description: Error during text classification diff --git a/geniusrise_text/classification/bulk.py b/geniusrise_text/classification/bulk.py new file mode 100644 index 0000000..0152b47 --- /dev/null +++ b/geniusrise_text/classification/bulk.py @@ -0,0 +1,363 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import json +import os +import sqlite3 +import uuid +import xml.etree.ElementTree as ET +from typing import Any, Dict, List, Optional + +import pandas as pd +import torch +import yaml # type: ignore +from datasets import Dataset, load_from_disk +from geniusrise import BatchInput, BatchOutput, State +from pyarrow import feather +from pyarrow import parquet as pq + +from geniusrise_text.base import TextBulk + + +class TextClassificationBulk(TextBulk): + r""" + TextClassificationBulk is designed to handle bulk text classification tasks using Hugging Face models efficiently and + effectively. It allows for processing large datasets, utilizing state-of-the-art machine learning models to provide + accurate classification of text data into predefined labels. + + Args: + input (BatchInput): Configuration and data inputs for the batch process. + output (BatchOutput): Configurations for output data handling. + state (State): State management for the classification task. + **kwargs: Arbitrary keyword arguments for extended configurations. 
+ + Example CLI Usage: + ```bash + genius TextClassificationBulk rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id cardiffnlp/twitter-roberta-base-hate-multiclass-latest-lol \ + classify \ + --args \ + model_name="cardiffnlp/twitter-roberta-base-hate-multiclass-latest" \ + model_class="AutoModelForSequenceClassification" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="bfloat16" \ + quantization=0 \ + device_map="auto" \ + max_memory=None \ + torchscript=False + ``` + """ + + def __init__(self, input: BatchInput, output: BatchOutput, state: State, **kwargs) -> None: + """ + Initializes the TextClassificationBulk class with input, output, and state configurations. + + Args: + input (BatchInput): Configuration for the input data. + output (BatchOutput): Configuration for the output data. + state (State): State management for the classification task. + **kwargs: Additional keyword arguments for extended functionality. + """ + super().__init__(input, output, state, **kwargs) + + def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs) -> Optional[Dataset]: + r""" + Load a classification dataset from a directory. + + Args: + dataset_path (str): The path to the dataset directory. + max_length (int, optional): The maximum length for tokenization. Defaults to 512. + + Returns: + Dataset: The loaded dataset. + + Raises: + Exception: If there was an error loading the dataset. + + ## Supported Data Formats and Structures: + + ### JSONL + Each line is a JSON object representing an example. + ```json + {"text": "The text content"} + ``` + + ### CSV + Should contain 'text' columns. + ```csv + text + "The text content" + ``` + + ### Parquet + Should contain 'text' columns. + + ### JSON + An array of dictionaries with 'text' keys. + ```json + [{"text": "The text content"}] + ``` + + ### XML + Each 'record' element should contain 'text' child elements. 
+        ```xml
+        <record>
+            <text>The text content</text>
+        </record>
+        ```
+
+        ### YAML
+        Each document should be a dictionary with 'text' keys.
+        ```yaml
+        - text: "The text content"
+        ```
+
+        ### TSV
+        Should contain 'text' columns separated by tabs.
+
+        ### Excel (.xls, .xlsx)
+        Should contain 'text' columns.
+
+        ### SQLite (.db)
+        Should contain a table named 'dataset_table' with 'text' columns.
+
+        ### Feather
+        Should contain 'text' columns.
+        """
+        self.max_length = max_length
+
+        self.label_to_id = self.model.config.label2id if self.model and self.model.config.label2id else {}  # type: ignore
+
+        try:
+            self.log.info(f"Loading dataset from {dataset_path}")
+            if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")):
+                # Load dataset saved by Hugging Face datasets library
+                return load_from_disk(dataset_path)
+            else:
+                data = []
+                # glob already returns paths that start with dataset_path; joining the prefix
+                # on again produced non-existent paths for every relative dataset_path.
+                for filepath in glob.glob(f"{dataset_path}/**/*", recursive=True):
+                    if filepath.endswith(".jsonl"):
+                        with open(filepath, "r", encoding="utf-8") as f:
+                            for line in f:
+                                # Tolerate blank lines (e.g. a trailing newline at end of file)
+                                if line.strip():
+                                    data.append(json.loads(line))
+
+                    elif filepath.endswith(".csv"):
+                        df = pd.read_csv(filepath)
+                        data.extend(df.to_dict("records"))
+
+                    elif filepath.endswith(".parquet"):
+                        df = pq.read_table(filepath).to_pandas()
+                        data.extend(df.to_dict("records"))
+
+                    elif filepath.endswith(".json"):
+                        with open(filepath, "r", encoding="utf-8") as f:
+                            json_data = json.load(f)
+                            data.extend(json_data)
+
+                    elif filepath.endswith(".xml"):
+                        tree = ET.parse(filepath)
+                        root = tree.getroot()
+                        for record in root.findall("record"):
+                            text = record.find("text").text  # type: ignore
+                            data.append({"text": text})
+
+                    elif filepath.endswith(".yaml") or filepath.endswith(".yml"):
+                        with open(filepath, "r", encoding="utf-8") as f:
+                            yaml_data = yaml.safe_load(f)
+                            data.extend(yaml_data)
+
+                    elif filepath.endswith(".tsv"):
+                        df = pd.read_csv(filepath, sep="\t")
+                        data.extend(df.to_dict("records"))
+
+                    elif filepath.endswith((".xls", ".xlsx")):
+                        df = pd.read_excel(filepath)
+                        data.extend(df.to_dict("records"))
+
+                    elif filepath.endswith(".db"):
+                        conn = sqlite3.connect(filepath)
+                        try:
+                            query = "SELECT text FROM dataset_table;"
+                            df = pd.read_sql_query(query, conn)
+                        finally:
+                            # Release the database handle even when the query fails
+                            conn.close()
+                        data.extend(df.to_dict("records"))
+
+                    elif filepath.endswith(".feather"):
+                        df = feather.read_feather(filepath)
+                        data.extend(df.to_dict("records"))
+
+                if hasattr(self, "map_data") and self.map_data:
+                    # SECURITY: map_data is evaluated as Python source, so it must only ever
+                    # come from trusted operator configuration, never from request or dataset content.
+                    fn = eval(self.map_data)  # type: ignore
+                    data = [fn(d) for d in data]
+
+                return Dataset.from_pandas(pd.DataFrame(data))
+        except Exception as e:
+            self.log.exception(f"Error occurred when loading dataset from {dataset_path}. Error: {e}")
+            raise
+
+    def classify(
+        self,
+        model_name: str,
+        model_class: str = "AutoModelForSequenceClassification",
+        tokenizer_class: str = "AutoTokenizer",
+        use_cuda: bool = False,
+        precision: str = "float",
+        quantization: int = 0,
+        device_map: str | Dict | None = "auto",
+        max_memory={0: "24GB"},
+        torchscript: bool = False,
+        compile: bool = False,
+        awq_enabled: bool = False,
+        flash_attention: bool = False,
+        batch_size: int = 32,
+        notification_email: Optional[str] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Perform bulk classification using the specified model and tokenizer. This method handles the entire classification
+        process including loading the model, processing input data, predicting classifications, and saving the results.
+
+        Args:
+            model_name (str): Name or path of the model.
+            model_class (str): Class name of the model (default "AutoModelForSequenceClassification").
+            tokenizer_class (str): Class name of the tokenizer (default "AutoTokenizer").
+            use_cuda (bool): Whether to use CUDA for model inference (default False).
+            precision (str): Precision for model computation (default "float").
+            quantization (int): Level of quantization for optimizing model size and speed (default 0).
+            device_map (str | Dict | None): Specific device to use for computation (default "auto").
+            max_memory (Dict): Maximum memory configuration for devices.
+ torchscript (bool, optional): Whether to use a TorchScript-optimized version of the pre-trained language model. Defaults to False. + compile (bool, optional): Whether to compile the model before fine-tuning. Defaults to True. + awq_enabled (bool): Whether to enable AWQ optimization (default False). + flash_attention (bool): Whether to use flash attention optimization (default False). + batch_size (int): Number of classifications to process simultaneously (default 32). + **kwargs: Arbitrary keyword arguments for model and generation configurations. + """ + if ":" in model_name: + model_revision = model_name.split(":")[1] + tokenizer_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + tokenizer_name = model_name + else: + model_revision = None + tokenizer_revision = None + tokenizer_name = model_name + + self.model_name = model_name + self.tokenizer_name = tokenizer_name + self.model_revision = model_revision + self.tokenizer_revision = tokenizer_revision + self.model_class = model_class + self.tokenizer_class = tokenizer_class + self.use_cuda = use_cuda + self.precision = precision + self.quantization = quantization + self.device_map = device_map + self.max_memory = max_memory + self.torchscript = torchscript + self.awq_enabled = awq_enabled + self.flash_attention = flash_attention + self.batch_size = batch_size + self.notification_email = notification_email + self.compile = compile + + model_args = {k.replace("model_", ""): v for k, v in kwargs.items() if "model_" in k} + self.model_args = model_args + + generation_args = {k.replace("generation_", ""): v for k, v in kwargs.items() if "generation_" in k} + self.generation_args = generation_args + + self.model, self.tokenizer = self.load_models( + model_name=self.model_name, + tokenizer_name=self.tokenizer_name, + model_revision=self.model_revision, + tokenizer_revision=self.tokenizer_revision, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, + 
use_cuda=self.use_cuda, + precision=self.precision, + quantization=self.quantization, + device_map=self.device_map, + max_memory=self.max_memory, + torchscript=self.torchscript, + awq_enabled=self.awq_enabled, + flash_attention=self.flash_attention, + compile=self.compile, + **self.model_args, + ) + + dataset_path = self.input.input_folder + output_path = self.output.output_folder + + # Load dataset + _dataset = self.load_dataset(dataset_path) + if _dataset is None: + self.log.error("Failed to load dataset.") + return + dataset = _dataset["text"] + + # Process data in batches + for i in range(0, len(dataset), batch_size): + batch = dataset[i : i + batch_size] + inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True) + + if next(self.model.parameters()).is_cuda: + inputs = {k: v.cuda() for k, v in inputs.items()} + + predictions = self.model(**inputs) + predictions = predictions[0] if isinstance(predictions, tuple) else predictions.logits + predictions = torch.argmax(predictions, dim=-1).cpu().numpy() + + self._save_predictions(predictions, batch, output_path, i) + self.done() + + def _save_predictions( + self, + predictions: torch.Tensor, + input_batch: List[str], + output_path: str, + batch_idx: int, + ) -> None: + """ + Saves the classification predictions to a specified output path. This method is called internally by the classify method + to persist the classification results. + + Args: + predictions (torch.Tensor): Tensor of label indices predicted by the model. + input_batch (List[str]): List of original texts that were classified. + output_path (str): Path to save the classification results. + batch_idx (int): Index of the current batch (for naming files). 
class TextClassificationEval(TextBulk):
    r"""
    TextClassificationEval extends TextBulk to evaluate text classification models on large datasets.

    It loads a labelled dataset, runs batched inference and computes accuracy plus macro-averaged precision,
    recall and F1, then writes the predictions and the metrics to the output folder.

    Args:
        input (BatchInput): Configuration and data inputs for the batch process.
        output (BatchOutput): Configurations for output data handling.
        state (State): State management for the classification task.
        **kwargs: Arbitrary keyword arguments for extended configurations.

    Example CLI Usage:
    ```bash
    genius TextClassificationEval rise \
        batch \
            --input_folder ./input \
        batch \
            --output_folder ./output \
        none \
        --id cardiffnlp/twitter-roberta-base-hate-multiclass-latest-lol \
        evaluate \
            --args \
                model_name="cardiffnlp/twitter-roberta-base-hate-multiclass-latest" \
                model_class="AutoModelForSequenceClassification" \
                tokenizer_class="AutoTokenizer" \
                use_cuda=True \
                precision="bfloat16" \
                quantization=0 \
                device_map="auto" \
                max_memory=None \
                torchscript=False
    ```
    """

    def __init__(self, input: BatchInput, output: BatchOutput, state: State, **kwargs) -> None:
        """
        Initialize the evaluator with input, output and state configuration.

        Args:
            input (BatchInput): Configuration for the input data.
            output (BatchOutput): Configuration for the output data.
            state (State): State management for the classification task.
            **kwargs: Additional keyword arguments forwarded to ``TextBulk``.
        """
        super().__init__(input, output, state, **kwargs)

    @staticmethod
    def _read_records(filepath: str) -> List[Dict[str, Any]]:
        """Read one dataset file into a list of record dicts; unsupported extensions yield an empty list."""
        if filepath.endswith(".jsonl"):
            with open(filepath, "r") as f:
                # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
                return [json.loads(line) for line in f if line.strip()]

        if filepath.endswith(".csv"):
            return pd.read_csv(filepath).to_dict("records")

        if filepath.endswith(".parquet"):
            return pq.read_table(filepath).to_pandas().to_dict("records")

        if filepath.endswith(".json"):
            with open(filepath, "r") as f:
                return list(json.load(f))

        if filepath.endswith(".xml"):
            root = ET.parse(filepath).getroot()
            return [
                {"text": record.find("text").text, "label": record.find("label").text}  # type: ignore
                for record in root.findall("record")
            ]

        if filepath.endswith((".yaml", ".yml")):
            with open(filepath, "r") as f:
                return list(yaml.safe_load(f) or [])

        if filepath.endswith(".tsv"):
            return pd.read_csv(filepath, sep="\t").to_dict("records")

        if filepath.endswith((".xls", ".xlsx")):
            return pd.read_excel(filepath).to_dict("records")

        if filepath.endswith(".db"):
            conn = sqlite3.connect(filepath)
            try:
                return pd.read_sql_query("SELECT text, label FROM dataset_table;", conn).to_dict("records")
            finally:
                # The connection was previously never closed.
                conn.close()

        if filepath.endswith(".feather"):
            return feather.read_feather(filepath).to_dict("records")

        return []

    def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs) -> Optional[Dataset]:
        r"""
        Load a classification dataset from a directory (or from the Hugging Face hub).

        Args:
            dataset_path (str): The path to the dataset directory.
            max_length (int, optional): The maximum length for tokenization. Defaults to 512.

        Returns:
            Dataset: The loaded dataset. When ``use_huggingface_dataset`` is set, this is whatever
            ``datasets.load_dataset`` returns for ``huggingface_dataset``.

        Raises:
            Exception: If there was an error loading the dataset (logged, then re-raised).

        ## Supported Data Formats and Structures:

        ### JSONL
        Each line is a JSON object representing an example.
        ```json
        {"text": "The text content", "label": "The label"}
        ```

        ### CSV
        Should contain 'text' and 'label' columns.
        ```csv
        text,label
        "The text content","The label"
        ```

        ### Parquet
        Should contain 'text' and 'label' columns.

        ### JSON
        An array of dictionaries with 'text' and 'label' keys.
        ```json
        [{"text": "The text content", "label": "The label"}]
        ```

        ### XML
        Each 'record' element should contain 'text' and 'label' child elements.
        ```xml
        <record>
            <text>The text content</text>
            <label>The label</label>
        </record>
        ```

        ### YAML
        Each document should be a dictionary with 'text' and 'label' keys.
        ```yaml
        - text: "The text content"
          label: "The label"
        ```

        ### TSV
        Should contain 'text' and 'label' columns separated by tabs.

        ### Excel (.xls, .xlsx)
        Should contain 'text' and 'label' columns.

        ### SQLite (.db)
        Should contain a table named 'dataset_table' with 'text' and 'label' columns.

        ### Feather
        Should contain 'text' and 'label' columns.
        """
        self.max_length = max_length

        self.label_to_id = self.model.config.label2id if self.model and self.model.config.label2id else {}  # type: ignore

        try:
            self.log.info(f"Loading dataset from {dataset_path}")
            # getattr: these attributes are only set by evaluate(); stay usable when called on its own.
            if getattr(self, "use_huggingface_dataset", False):
                dataset = load_dataset(self.huggingface_dataset)
            elif os.path.isfile(os.path.join(dataset_path, "dataset_info.json")):
                # Dataset previously saved by the Hugging Face datasets library.
                dataset = load_from_disk(dataset_path)
            else:
                data: List[Dict[str, Any]] = []
                for filename in os.listdir(dataset_path):
                    data.extend(self._read_records(os.path.join(dataset_path, filename)))
                dataset = Dataset.from_pandas(pd.DataFrame(data))

            if hasattr(self, "map_data") and self.map_data:
                # SECURITY NOTE: eval() executes arbitrary code; `map_data` must only ever come from
                # trusted configuration.
                fn = eval(self.map_data)  # type: ignore
                dataset = dataset.map(fn)

            return dataset
        except Exception as e:
            self.log.exception(f"Error occurred when loading dataset from {dataset_path}. Error: {e}")
            raise

    def _label_index(self, label: Any) -> int:
        """Convert a dataset label (class name, numeric string or integer) to an integer class index."""
        if isinstance(label, str):
            if label in self.label_to_id:
                return int(self.label_to_id[label])
            try:
                # e.g. XML input delivers numeric ids as text.
                return int(label)
            except ValueError:
                raise ValueError(f"Label {label!r} is not present in the model's label2id mapping") from None
        return int(label)

    def evaluate(
        self,
        model_name: str,
        model_class: str = "AutoModelForSequenceClassification",
        tokenizer_class: str = "AutoTokenizer",
        use_cuda: bool = False,
        precision: str = "float",
        quantization: int = 0,
        device_map: str | Dict | None = "auto",
        max_memory={0: "24GB"},
        torchscript: bool = False,
        compile: bool = False,
        awq_enabled: bool = False,
        flash_attention: bool = False,
        batch_size: int = 32,
        notification_email: Optional[str] = None,
        use_huggingface_dataset: bool = False,
        huggingface_dataset: str = "",
        **kwargs: Any,
    ) -> None:
        """
        Evaluate the model on the loaded dataset, compute metrics, and save both predictions and metrics.

        Args:
            model_name (str): Name or path of the model; ``name:revision`` pins model and tokenizer revision.
            model_class (str): Class name of the model (default "AutoModelForSequenceClassification").
            tokenizer_class (str): Class name of the tokenizer (default "AutoTokenizer").
            use_cuda (bool): Whether to use CUDA for model inference (default False).
            precision (str): Precision for model computation (default "float").
            quantization (int): Level of quantization for optimizing model size and speed (default 0).
            device_map (str | Dict | None): Specific device to use for computation (default "auto").
            max_memory (Dict): Maximum memory configuration for devices.
            torchscript (bool, optional): Whether to use a TorchScript-optimized model. Defaults to False.
            compile (bool, optional): Whether to compile the model. Defaults to False.
            awq_enabled (bool): Whether to enable AWQ optimization (default False).
            flash_attention (bool): Whether to use flash attention optimization (default False).
            batch_size (int): Number of texts classified per forward pass (default 32).
            notification_email (str, optional): Stored on the instance; not otherwise used in this method.
            use_huggingface_dataset (bool, optional): Whether to load a dataset from the Hugging Face hub.
            huggingface_dataset (str, optional): The Hugging Face dataset to use.
            **kwargs: ``model_``-prefixed entries are forwarded to ``load_models``; ``generation_``-prefixed
                entries are stored in ``self.generation_args``.

        Raises:
            ValueError: If a textual label cannot be mapped to a class index.
        """
        if ":" in model_name:
            model_name, revision = model_name.split(":")[:2]
        else:
            revision = None

        self.model_name = model_name
        self.tokenizer_name = model_name
        self.model_revision = revision
        self.tokenizer_revision = revision
        self.model_class = model_class
        self.tokenizer_class = tokenizer_class
        self.use_cuda = use_cuda
        self.precision = precision
        self.quantization = quantization
        self.device_map = device_map
        self.max_memory = max_memory
        self.torchscript = torchscript
        self.compile = compile
        self.awq_enabled = awq_enabled
        self.flash_attention = flash_attention
        self.batch_size = batch_size
        self.notification_email = notification_email
        self.use_huggingface_dataset = use_huggingface_dataset
        self.huggingface_dataset = huggingface_dataset

        self.model_args = {k.replace("model_", ""): v for k, v in kwargs.items() if "model_" in k}
        self.generation_args = {k.replace("generation_", ""): v for k, v in kwargs.items() if "generation_" in k}

        self.model, self.tokenizer = self.load_models(
            model_name=self.model_name,
            tokenizer_name=self.tokenizer_name,
            model_revision=self.model_revision,
            tokenizer_revision=self.tokenizer_revision,
            model_class=self.model_class,
            tokenizer_class=self.tokenizer_class,
            use_cuda=self.use_cuda,
            precision=self.precision,
            quantization=self.quantization,
            device_map=self.device_map,
            max_memory=self.max_memory,
            torchscript=self.torchscript,
            awq_enabled=self.awq_enabled,
            flash_attention=self.flash_attention,
            compile=self.compile,
            **self.model_args,
        )

        dataset_path = self.input.input_folder
        output_path = self.output.output_folder

        _dataset = self.load_dataset(dataset_path)
        if _dataset is None:
            self.log.error("Failed to load dataset.")
            return

        # NOTE(review): a hub dataset loaded without a split is expected to arrive as a dict-like mapping
        # of split name -> Dataset; pick an evaluation-style split when one exists. Confirm against the
        # installed `datasets` version.
        if isinstance(_dataset, dict):
            split = next((name for name in ("test", "validation", "train") if name in _dataset), None)
            _dataset = _dataset[split if split is not None else next(iter(_dataset))]

        # The documented column is "label"; "labels" is still accepted because the previous
        # implementation read that name.
        label_column = "label" if "label" in _dataset.column_names else "labels"
        texts = list(_dataset["text"])
        true_labels = [self._label_index(label) for label in _dataset[label_column]]

        if not texts:
            self.log.error("Dataset is empty; nothing to evaluate.")
            return

        # Load metrics up front so a missing metric fails before the (expensive) inference pass.
        accuracy_metric = load_metric("accuracy")
        precision_metric = load_metric("precision")
        recall_metric = load_metric("recall")
        f1_metric = load_metric("f1")

        device = next(self.model.parameters()).device
        all_predictions: List[int] = []

        for start in range(0, len(texts), batch_size):
            inputs = self.tokenizer(
                texts[start : start + batch_size],
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            )
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits

            # .tolist() yields plain Python ints, which (unlike numpy scalars) are JSON serializable.
            all_predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())

        # fmt: off
        overall_accuracy = accuracy_metric.compute(predictions=all_predictions, references=true_labels)["accuracy"]
        overall_precision = precision_metric.compute(predictions=all_predictions, references=true_labels, average="macro")["precision"]
        overall_recall = recall_metric.compute(predictions=all_predictions, references=true_labels, average="macro")["recall"]
        overall_f1 = f1_metric.compute(predictions=all_predictions, references=true_labels, average="macro")["f1"]
        # fmt: on

        overall_evaluation_metrics = {
            "accuracy": overall_accuracy,
            "precision": overall_precision,
            "recall": overall_recall,
            "f1": overall_f1,
        }

        self._save_predictions(all_predictions, texts, true_labels, output_path, overall_evaluation_metrics)

        self.done()

    def _save_predictions(
        self,
        predictions: List[int],
        input_texts: List[str],
        true_labels: List[int],
        output_path: str,
        evaluation_metrics: Dict[str, float],
    ) -> None:
        """
        Save the classification predictions and evaluation metrics to ``output_path``.

        Writes ``predictions-<uuid4>.json`` (one ``{"input", "prediction", "true_label"}`` object per example)
        and ``evaluation_metrics.json``.

        Args:
            predictions (List[int]): Label indices predicted by the model.
            input_texts (List[str]): Original texts that were classified.
            true_labels (List[int]): True label indices.
            output_path (str): Directory to save the classification results and metrics.
            evaluation_metrics (Dict[str, float]): Dictionary of evaluation metrics.
        """
        data_to_save = [
            {"input": input_text, "prediction": int(prediction), "true_label": int(true_label)}
            for input_text, prediction, true_label in zip(input_texts, predictions, true_labels)
        ]

        # ensure_ascii=False emits raw non-ASCII characters, so the file encoding must be explicit
        # rather than the platform default.
        predictions_file_path = os.path.join(output_path, f"predictions-{str(uuid.uuid4())}.json")
        with open(predictions_file_path, "w", encoding="utf-8") as f:
            json.dump(data_to_save, f, ensure_ascii=False, indent=4)

        metrics_file_path = os.path.join(output_path, "evaluation_metrics.json")
        with open(metrics_file_path, "w", encoding="utf-8") as f:
            json.dump(evaluation_metrics, f, ensure_ascii=False, indent=4)
class TextClassificationFineTuner(TextFineTuner):
    r"""
    A bolt for fine-tuning Hugging Face models for text classification tasks.

    This class extends the `TextFineTuner` and specializes in fine-tuning models for text classification.
    It provides additional functionalities for loading and preprocessing text classification datasets in various formats.

    Args:
        input (BatchInput): The batch input data.
        output (OutputConfig): The output data.
        state (State): The state manager.

    CLI Usage:

    ```bash
        genius TextClassificationFineTuner rise \
            batch \
                --input_folder ./input \
            batch \
                --output_folder ./output \
            none \
            --id cardiffnlp/twitter-roberta-base-hate-multiclass-latest-lol \
            fine_tune \
                --args \
                    model_name=my_model \
                    tokenizer_name=my_tokenizer \
                    num_train_epochs=3 \
                    per_device_train_batch_size=8 \
                    data_max_length=512
    ```
    """

    @staticmethod
    def _read_records(filepath: str) -> list:
        """Read one dataset file into a list of record dicts; unsupported extensions yield an empty list."""
        if filepath.endswith(".jsonl"):
            with open(filepath, "r") as f:
                # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
                return [json.loads(line) for line in f if line.strip()]

        if filepath.endswith(".csv"):
            return pd.read_csv(filepath).to_dict("records")

        if filepath.endswith(".parquet"):
            return pq.read_table(filepath).to_pandas().to_dict("records")

        if filepath.endswith(".json"):
            with open(filepath, "r") as f:
                return list(json.load(f))

        if filepath.endswith(".xml"):
            root = ET.parse(filepath).getroot()
            return [
                {"text": record.find("text").text, "label": record.find("label").text}  # type: ignore
                for record in root.findall("record")
            ]

        if filepath.endswith((".yaml", ".yml")):
            with open(filepath, "r") as f:
                return list(yaml.safe_load(f) or [])

        if filepath.endswith(".tsv"):
            return pd.read_csv(filepath, sep="\t").to_dict("records")

        if filepath.endswith((".xls", ".xlsx")):
            return pd.read_excel(filepath).to_dict("records")

        if filepath.endswith(".db"):
            conn = sqlite3.connect(filepath)
            try:
                return pd.read_sql_query("SELECT text, label FROM dataset_table;", conn).to_dict("records")
            finally:
                # The connection was previously never closed.
                conn.close()

        if filepath.endswith(".feather"):
            return feather.read_feather(filepath).to_dict("records")

        return []

    def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs) -> Optional[Dataset]:
        r"""
        Load a classification dataset from a directory, build the label mapping and tokenize it.

        Args:
            dataset_path (str): The path to the dataset directory.
            max_length (int, optional): The maximum length for tokenization. Defaults to 512.

        Returns:
            Dataset: The loaded dataset, tokenized, with 'label' replaced by integer class ids.

        Raises:
            Exception: If there was an error loading the dataset (logged, then re-raised).

        ## Supported Data Formats and Structures:

        ### JSONL
        Each line is a JSON object representing an example.
        ```json
        {"text": "The text content", "label": "The label"}
        ```

        ### CSV
        Should contain 'text' and 'label' columns.
        ```csv
        text,label
        "The text content","The label"
        ```

        ### Parquet
        Should contain 'text' and 'label' columns.

        ### JSON
        An array of dictionaries with 'text' and 'label' keys.
        ```json
        [{"text": "The text content", "label": "The label"}]
        ```

        ### XML
        Each 'record' element should contain 'text' and 'label' child elements.
        ```xml
        <record>
            <text>The text content</text>
            <label>The label</label>
        </record>
        ```

        ### YAML
        Each document should be a dictionary with 'text' and 'label' keys.
        ```yaml
        - text: "The text content"
          label: "The label"
        ```

        ### TSV
        Should contain 'text' and 'label' columns separated by tabs.

        ### Excel (.xls, .xlsx)
        Should contain 'text' and 'label' columns.

        ### SQLite (.db)
        Should contain a table named 'dataset_table' with 'text' and 'label' columns.

        ### Feather
        Should contain 'text' and 'label' columns.
        """

        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer, max_length=max_length)
        self.max_length = max_length

        # self.label_to_id = self.model.config.label2id if self.model and self.model.config.label2id else {}  # type: ignore
        self.label_to_id = {}

        def tokenize_function(examples):
            tokenized_data = self.tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
            )

            # Labels not seen when the mapping was built get fresh ids appended after the existing
            # ones. Numbering them from 0 again would collide with ids that are already assigned.
            unknown_labels = [label for label in dict.fromkeys(examples["label"]) if label not in self.label_to_id]
            if unknown_labels:
                offset = len(self.label_to_id)
                self.label_to_id = {
                    **self.label_to_id,
                    **{label: offset + i for i, label in enumerate(unknown_labels)},
                }

            tokenized_data["label"] = [self.label_to_id[label] for label in examples["label"]]
            return tokenized_data

        try:
            self.log.info(f"Loading dataset from {dataset_path}")
            if self.use_huggingface_dataset:
                dataset = load_dataset(self.huggingface_dataset)
            elif os.path.isfile(os.path.join(dataset_path, "dataset_info.json")):
                # Load dataset saved by Hugging Face datasets library
                dataset = load_from_disk(dataset_path)
            else:
                data = []
                for filename in os.listdir(dataset_path):
                    data.extend(self._read_records(os.path.join(dataset_path, filename)))
                dataset = Dataset.from_pandas(pd.DataFrame(data))

            if hasattr(self, "map_data") and self.map_data:
                # SECURITY NOTE: eval() executes arbitrary code; `map_data` must only ever come from
                # trusted configuration.
                fn = eval(self.map_data)  # type: ignore
                dataset = dataset.map(fn)

            # Create label_to_id mapping and save it in model config.
            # TODO: the number of labels is unknown until the data is read, yet the tokenizer (and so
            # the model) is needed to process the data, hence the model reload below.
            # Only a split mapping has a "train" entry; a locally loaded dataset is a single Dataset
            # and must be read directly.
            if isinstance(dataset, dict):
                label_splits = [dataset["train"]] if "train" in dataset else list(dataset.values())
            else:
                label_splits = [dataset]
            # Sorted so that label ids are reproducible between runs (set order is not).
            unique_labels = sorted({label for split in label_splits for label in split["label"]}, key=str)
            self.label_to_id = {label: i for i, label in enumerate(unique_labels)}

            if self.model:
                config = self.model.config
                config.label2id = self.label_to_id
                config.id2label = {i: label for label, i in self.label_to_id.items()}
                config.num_labels = len(self.label_to_id.keys())
                self.config = config

            # NOTE(review): statement nesting here was reconstructed from a whitespace-collapsed
            # source; confirm whether this reload belongs inside the `if self.model:` branch above.
            self.load_models(
                model_name=self.model_name,
                tokenizer_name=self.tokenizer_name,
                model_class=self.model_class,
                tokenizer_class=self.tokenizer_class,
                device_map=self.device_map,
                precision=self.precision,
                quantization=self.quantization,
                lora_config=self.lora_config,
                use_accelerate=self.use_accelerate,
                accelerate_no_split_module_classes=self.accelerate_no_split_module_classes,
                **self.model_kwargs,
            )
            if self.tokenizer and not self.tokenizer.pad_token:
                # Fall back to EOS as the padding token when the tokenizer defines none.
                self.tokenizer.pad_token = self.tokenizer.eos_token
                self.model.config.pad_token_id = self.tokenizer.eos_token_id

            self.log.info(self.model.config)

            return dataset.map(tokenize_function, batched=True)

        except Exception as e:
            self.log.exception(f"Error occurred when loading dataset from {dataset_path}. Error: {e}")
            raise

    def compute_metrics(self, eval_pred: EvalPrediction) -> Union[Optional[Dict[str, float]], Dict[str, float]]:
        """
        Compute metrics for evaluation. This class implements a simple classification evaluation,
        tasks should ideally override this.

        Args:
            eval_pred (EvalPrediction): The evaluation predictions (unpacked as predictions, labels).

        Returns:
            dict: ``accuracy``, ``precision``, ``recall`` and ``f1``. Precision/recall/F1 use "binary"
            averaging when exactly two labels are known and "weighted" averaging otherwise.
        """
        predictions, labels = eval_pred
        # Either side may arrive wrapped in a tuple; the first element is the one scored.
        predictions = predictions[0] if isinstance(predictions, tuple) else predictions
        labels = labels[0] if isinstance(labels, tuple) else labels
        predictions = np.argmax(predictions, axis=1)

        is_binary = len(self.label_to_id.keys()) == 2
        average_type = "binary" if is_binary else "weighted"

        precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=average_type)

        return {
            "accuracy": accuracy_score(labels, predictions),
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/classification/tests/test_bulk.py b/geniusrise_text/classification/tests/test_bulk.py new file mode 100644 index 0000000..3063aaf --- /dev/null +++ b/geniusrise_text/classification/tests/test_bulk.py @@ -0,0 +1,186 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import json +import os +import sqlite3 +import tempfile +import xml.etree.ElementTree as ET + +import pandas as pd +import pytest +import yaml # type: ignore +from datasets import Dataset +from geniusrise.core import BatchInput, BatchOutput, InMemoryState +from pyarrow import feather +from pyarrow import parquet as pq + +from geniusrise_text.classification.bulk import TextClassificationBulk + +MODELS_TO_TEST = { + # fmt: off + "cardiffnlp/twitter-roberta-base-hate-multiclass-latest": ["sexism", "racism", "disability", "sexual_orientation", "religion", "other", "not_hate"], + "cardiffnlp/twitter-roberta-base-hate-latest": ["NOT-HATE", "HATE"], + "cardiffnlp/twitter-roberta-base-offensive": ["non-offensive", "offensive"], + "cardiffnlp/twitter-xlm-roberta-base-sentiment": ["positive", "neutral", "negative"], + "cardiffnlp/twitter-roberta-base-emotion": ["joy", "optimism", "anger", "sadness"], + "cardiffnlp/twitter-roberta-base-irony": ["non_irony", "irony"], + 
"cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual": ["positive", "neutral", "negative"], + "tomh/toxigen_roberta": ["LABEL_0", "LABEL_1"], + "cointegrated/rubert-tiny-toxicity": ["non-toxic", "insult", "obscenity", "threat", "dangerous"], + "michellejieli/NSFW_text_classifier": ["SFW", "NSFW"], + "bvanaken/clinical-assertion-negation-bert": ["ABSENT", "PRESENT", "POSSIBLE"], + "bucketresearch/politicalBiasBERT": ["LEFT", "CENTER", "RIGHT"], + "soleimanian/financial-roberta-large-sentiment": ["neutral", "negative", "positive"], + "jpwahle/longformer-base-plagiarism-detection": ["ORIGINAL", "PLAGIARISM"], + "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis": ["negative", "neutral", "positive"], + "lxyuan/distilbert-base-multilingual-cased-sentiments-student": ["negative", "neutral", "positive"], + "Sigma/financial-sentiment-analysis": ["LABEL_0", "LABEL_1", "LABEL_2"], + "SamLowe/roberta-base-go_emotions": ["disappointment", "sadness", "annoyance", "neutral", "disapproval", "realization", + "nervousness", "approval", "joy", "anger", "embarrassment", "caring", "remorse", + "disgust", "grief", "confusion", "relief", "desire", "admiration", "optimism", + "fear", "love", "excitement", "curiosity", "amusement", "surprise", "gratitude", "pride"], + "cardiffnlp/tweet-topic-21-multi": ["sports", "news_&_social_concern", "fitness_&_health", "youth_&_student_life", "learning_&_educational", + "science_&_technology", "celebrity_&_pop_culture", "travel_&_adventure", "diaries_&_daily_life", + "food_&_dining", "gaming", "business_&_entrepreneurs", "family", "relationships", "fashion_&_style", + "music", "film_tv_&_video", "other_hobbies", "arts_&_culture"], + "padmajabfrl/Gender-Classification": ["Female", "Male"], + "ProsusAI/finbert": ["positive", "neutral", "negative"], + "yiyanghkust/finbert-tone": ["Positive", "Neutral", "Negative"], + "wajidlinux99/gibberish-text-detector": ["clean", "mild gibberish", "word salad", "noise"], + 
# Helper function to create synthetic data in different formats
def create_dataset_in_format(directory, ext):
    """
    Write a ten-row synthetic dataset (a single ``text`` column, ``text_0`` .. ``text_9``) into ``directory``.

    Args:
        directory: Target directory; created if it does not exist.
        ext (str): Output format. One of "huggingface", "csv", "jsonl", "parquet", "json", "xml", "yaml",
            "tsv", "xlsx", "db" or "feather". All formats except "huggingface" produce a file named
            ``data.<ext>``.

    Raises:
        ValueError: If ``ext`` is not one of the supported formats (previously this silently wrote nothing).
    """
    os.makedirs(directory, exist_ok=True)
    data = [{"text": f"text_{i}"} for i in range(10)]
    df = pd.DataFrame(data)
    target = os.path.join(directory, f"data.{ext}")

    if ext == "huggingface":
        Dataset.from_pandas(df).save_to_disk(directory)
    elif ext == "csv":
        df.to_csv(target, index=False)
    elif ext == "jsonl":
        with open(target, "w") as f:
            f.writelines(json.dumps(item) + "\n" for item in data)
    elif ext == "parquet":
        # pandas' own writer; avoids reaching for `feather.Table`, which is only an incidental
        # re-export inside pyarrow.feather.
        df.to_parquet(target, index=False)
    elif ext == "json":
        with open(target, "w") as f:
            json.dump(data, f)
    elif ext == "xml":
        root = ET.Element("root")
        for item in data:
            record = ET.SubElement(root, "record")
            ET.SubElement(record, "text").text = item["text"]
        ET.ElementTree(root).write(target)
    elif ext == "yaml":
        with open(target, "w") as f:
            yaml.dump(data, f)
    elif ext == "tsv":
        df.to_csv(target, index=False, sep="\t")
    elif ext == "xlsx":
        df.to_excel(target, index=False)
    elif ext == "db":
        conn = sqlite3.connect(target)
        try:
            df.to_sql("dataset_table", conn, if_exists="replace", index=False)
        finally:
            # Close even when the write fails so the file handle is not leaked.
            conn.close()
    elif ext == "feather":
        feather.write_feather(df, target)
    else:
        raise ValueError(f"Unsupported dataset format: {ext!r}")
tmpdir): + ext = request.param + create_dataset_in_format(tmpdir, ext) + return tmpdir, ext + + +@pytest.fixture +def classification_bolt(): + input_dir = tempfile.mkdtemp() + output_dir = tempfile.mkdtemp() + input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input") + output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output") + state = InMemoryState() + klass = TextClassificationBulk( + input=input, + output=output, + state=state, + ) + + return klass + + +def test_classify(classification_bolt, dataset_file, model): + tmpdir, ext = dataset_file + classification_bolt.input.input_folder = tmpdir + + model_name, labels = model + tokenizer_name = model_name + model_class = "AutoModelForSequenceClassification" + tokenizer_class = "AutoTokenizer" + + # Classify + classification_bolt.classify( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map="cuda:0", + num_train_epochs=2, + per_device_batch_size=2, + precision="float16", + ) + # Check output + files = glob.glob(f"{classification_bolt.output.output_folder}/predictions-*.json") + assert len(files) > 0 + + # Check one of the output files to ensure it contains the predictions and input data + with open(files[0], "r") as f: + results = json.load(f) + assert len(results) > 0 + for result in results: + assert "input" in result + assert "prediction" in result + assert result["prediction"] in MODELS_TO_TEST[model_name] diff --git a/tests/test_classification.py b/geniusrise_text/classification/tests/test_fine_tune.py similarity index 58% rename from tests/test_classification.py rename to geniusrise_text/classification/tests/test_fine_tune.py index 8b335e4..9b401a4 100644 --- a/tests/test_classification.py +++ b/geniusrise_text/classification/tests/test_fine_tune.py @@ -1,18 +1,17 @@ # 🧠 Geniusrise # Copyright (C) 2023 geniusrise.ai # -# This program is free software: you can redistribute it and/or modify -# it under the terms of 
the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. +# http://www.apache.org/licenses/LICENSE-2.0 # -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json import os @@ -20,7 +19,6 @@ import tempfile import xml.etree.ElementTree as ET -import numpy as np import pandas as pd import pytest import yaml # type: ignore @@ -28,9 +26,21 @@ from geniusrise.core import BatchInput, BatchOutput, InMemoryState from pyarrow import feather from pyarrow import parquet as pq -from transformers import EvalPrediction -from huggingface import HuggingFaceClassificationFineTuner +from geniusrise_text import TextClassificationFineTuner + +# Models to test +MODELS_TO_TEST = { + # fmt: off + "bert-base-uncased": ["LABEL_0", "LABEL_1", "LABEL_2"], + "bert-large-uncased": ["LABEL_0", "LABEL_1", "LABEL_2"], + "distilroberta-base": ["LABEL_0", "LABEL_1", "LABEL_2"], + "xlm-roberta-large": ["LABEL_0", "LABEL_1", "LABEL_2"], + "albert-base-v2": ["LABEL_0", "LABEL_1", "LABEL_2"], + "cardiffnlp/twitter-roberta-base-2022-154m": ["LABEL_0", "LABEL_1", "LABEL_2"], + "cardiffnlp/twitter-roberta-base": ["LABEL_0", "LABEL_1", "LABEL_2"], + # fmt: on +} # Helper function to create synthetic data in different formats @@ -76,6 +86,12 @@ def create_dataset_in_format(directory, ext): feather.write_feather(df, os.path.join(directory, "data.feather")) +# Fixture for models +@pytest.fixture(params=MODELS_TO_TEST.items()) +def model(request): + return request.param + + # Fixtures for each file type @pytest.fixture( params=[ @@ -103,23 +119,31 @@ def dataset_file(request, tmpdir): def classification_bolt(): input_dir = tempfile.mkdtemp() output_dir = tempfile.mkdtemp() - input = BatchInput(input_dir, "geniusrise-test-bucket", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test-bucket", "test-🤗-output") + input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input") + output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output") state = InMemoryState() - klass = HuggingFaceClassificationFineTuner( + klass = TextClassificationFineTuner( input=input, output=output, state=state, ) - klass.model_class = 
"BertForSequenceClassification" - klass.model_name = "bert-base-uncased" - klass.tokenizer_class = "BertTokenizer" - klass.tokenizer_name = "bert-base-uncased" + return klass -def test_classification_bolt_init(classification_bolt): - classification_bolt.load_models() +def test_classification_bolt_init(classification_bolt, model): + model_name, labels = model + tokenizer_name = model_name + model_class = "AutoModelForSequenceClassification" + tokenizer_class = "AutoTokenizer" + + classification_bolt.load_models( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map="cuda:0", + ) assert classification_bolt.model is not None assert classification_bolt.tokenizer is not None @@ -128,43 +152,52 @@ def test_classification_bolt_init(classification_bolt): assert classification_bolt.state is not None -def test_load_dataset_all_formats(classification_bolt, dataset_file): +def test_load_dataset_all_formats(classification_bolt, dataset_file, model): tmpdir, ext = dataset_file dataset_path = os.path.join(tmpdir, "train") - classification_bolt.load_models() + model_name, labels = model + tokenizer_name = model_name + model_class = "AutoModelForSequenceClassification" + tokenizer_class = "AutoTokenizer" + + classification_bolt.load_models( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map="cuda:0", + ) + dataset = classification_bolt.load_dataset(dataset_path) assert dataset is not None assert len(dataset) == 10 # Test for fine-tuning -def test_classification_bolt_fine_tune(classification_bolt, dataset_file): +def test_classification_bolt_fine_tune(classification_bolt, dataset_file, model): tmpdir, ext = dataset_file classification_bolt.input.input_folder = tmpdir + model_name, labels = model + tokenizer_name = model_name + model_class = "AutoModelForSequenceClassification" + tokenizer_class = "AutoTokenizer" + # kwargs = 
{"model_"} + classification_bolt.fine_tune( - num_train_epochs=1, - per_device_train_batch_size=1, - model_class="BertForSequenceClassification", - model_name="bert-base-uncased", - tokenizer_class="BertTokenizer", - tokenizer_name="bert-base-uncased", + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map="cuda:0", + num_train_epochs=2, + per_device_batch_size=2, + evaluate=True, + precision="float16", ) output_dir = classification_bolt.output.output_folder assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin")) assert os.path.isfile(os.path.join(output_dir + "/model", "config.json")) assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin")) - - -# Test for computing metrics -def test_classification_bolt_compute_metrics(classification_bolt): - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([0, 1]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - metrics = classification_bolt.compute_metrics(eval_pred) - assert "accuracy" in metrics - assert "precision" in metrics - assert "recall" in metrics - assert "f1" in metrics diff --git a/geniusrise_text/embeddings/__init__.py b/geniusrise_text/embeddings/__init__.py new file mode 100644 index 0000000..63fbeed --- /dev/null +++ b/geniusrise_text/embeddings/__init__.py @@ -0,0 +1,17 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import EmbeddingsAPI +from .bulk import EmbeddingsBulk diff --git a/geniusrise_text/embeddings/api.py b/geniusrise_text/embeddings/api.py new file mode 100644 index 0000000..1bd9b34 --- /dev/null +++ b/geniusrise_text/embeddings/api.py @@ -0,0 +1,341 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, Optional + +import cherrypy +from sentence_transformers import SentenceTransformer + +from geniusrise_text.base import TextAPI +from geniusrise_text.embeddings.embeddings import ( + generate_combination_embeddings, + generate_contiguous_embeddings, + generate_embeddings, + generate_permutation_embeddings, + generate_sentence_transformer_embeddings, +) + + +class EmbeddingsAPI(TextAPI): + r""" + A CherryPy API for generating various types of embeddings using Hugging Face and Sentence Transformer models. + + This API exposes endpoints for generating embeddings using Sentence-BERT, as well as Hugging Face models. + It supports generating embeddings for individual terms, contiguous subsets of words, combinations of words, + and permutations of words in a given sentence. + + Args: + Inherits all arguments from TextAPI. 
+ + CLI Usage: + + ```bash + genius EmbeddingsAPI rise \ + listen \ + --model_name=bert-base-uncased \ + --model_class=AutoModelForCausalLM \ + --tokenizer_class=AutoTokenizer \ + --sentence_transformer_model=paraphrase-MiniLM-L6-v2 \ + --use_cuda=True \ + --precision=float16 \ + --device_map=auto \ + --max_memory={0: "24GB"} \ + --torchscript=True \ + --endpoint="*" \ + --port=3000 \ + --cors_domain="http://localhost:3000" + ``` + + YAML Configuration: + + ```yaml + version: "1" + bolts: + my_embeddings_api: + name: "EmbeddingsAPI" + method: "listen" + args: + model_name: "bert-base-uncased" + model_class: "AutoModelForCausalLM" + tokenizer_class: "AutoTokenizer" + sentence_transformer_model: "paraphrase-MiniLM-L6-v2" + use_cuda: True + precision: "float16" + device_map: "auto" + max_memory: {0: "24GB"} + torchscript: True + endpoint: "*" + port: 3000 + cors_domain: "http://localhost:3000" + ``` + + Supported Endpoints: + - POST /sbert_embeddings: Generate embeddings using Sentence-BERT. + - POST /embeddings: Generate embeddings for a given term. + - POST /embeddings_contiguous: Generate embeddings for contiguous subsets of words. + - POST /embeddings_combinations: Generate embeddings for combinations of words. + - POST /embeddings_permutations: Generate embeddings for permutations of words. + """ + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def sbert(self, **kwargs: Any) -> Dict[str, Any]: + """ + Generate embeddings using Sentence-BERT model. + + Parameters: + - **kwargs (Any): Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the generated embeddings. + + Usage: + POST request with JSON payload containing 'sentences' and optional 'batch_size'. 
+ """ + data = cherrypy.request.json + sentences = data.get("sentences") + batch_size = data.get("batch_size", 32) + + embeddings = generate_sentence_transformer_embeddings( + sentences=sentences, model=self.sentence_transformer_model, use_cuda=self.use_cuda, batch_size=batch_size + ) + return {"embeddings": embeddings.tolist()} + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def sentence(self, **kwargs: Any) -> Dict[str, Any]: + """ + Generate embeddings for a given term using Hugging Face model. + + Parameters: + - **kwargs (Any): Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the generated embeddings. + + Usage: + POST request with JSON payload containing 'term'. + """ + data = cherrypy.request.json + sentence = data.get("sentence") + + embeddings = generate_embeddings( + sentence=sentence, + model=self.model, + tokenizer=self.tokenizer, + output_key="last_hidden_state", + use_cuda=self.use_cuda, + ) + return {"embeddings": embeddings.tolist()} + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def sentence_windows(self, **kwargs: Any) -> Dict[str, Any]: + """ + Generate embeddings for all contiguous subsets of words in a given sentence. + + Parameters: + - **kwargs (Any): Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the generated embeddings. + + Usage: + POST request with JSON payload containing 'sentence'. 
+ """ + data = cherrypy.request.json + sentence = data.get("sentence") + + embeddings = generate_contiguous_embeddings( + sentence=sentence, + model=self.model, + tokenizer=self.tokenizer, + output_key="last_hidden_state", + use_cuda=self.use_cuda, + ) + return {"embeddings": embeddings} + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def sentence_combinations(self, **kwargs: Any) -> Dict[str, Any]: + """ + Generate embeddings for all combinations of words in a given sentence. + + Parameters: + - **kwargs (Any): Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the generated embeddings. + + Usage: + POST request with JSON payload containing 'sentence'. + """ + data = cherrypy.request.json + sentence = data.get("sentence") + + embeddings = generate_combination_embeddings( + sentence=sentence, + model=self.model, + tokenizer=self.tokenizer, + output_key="last_hidden_state", + use_cuda=self.use_cuda, + ) + return {"embeddings": embeddings} + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def sentence_permutations(self, **kwargs: Any) -> Dict[str, Any]: + """ + Generate embeddings for all permutations of words in a given sentence. + + Parameters: + - **kwargs (Any): Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the generated embeddings. + + Usage: + POST request with JSON payload containing 'sentence'. 
+ """ + data = cherrypy.request.json + sentence = data.get("sentence") + + embeddings = generate_permutation_embeddings( + sentence=sentence, + model=self.model, + tokenizer=self.tokenizer, + output_key="last_hidden_state", + use_cuda=self.use_cuda, + ) + return {"embeddings": embeddings} + + def listen( # type: ignore + self, + model_name: str, + model_class: str = "AutoModelForCausalLM", + tokenizer_class: str = "AutoTokenizer", + use_cuda: bool = False, + precision: str = "float16", + quantization: int = 0, + device_map: str | Dict | None = "auto", + max_memory={0: "24GB"}, + torchscript: bool = False, + compile: bool = False, + endpoint: str = "*", + port: int = 3000, + cors_domain: str = "http://localhost:3000", + username: Optional[str] = None, + password: Optional[str] = None, + **model_args: Any, + ) -> None: + """ + Initialize and start the API server. + + Parameters: + - model_name (str): The name of the Hugging Face model to use. + - model_class (str, optional): The class name of the model. Defaults to "AutoModelForCausalLM". + - tokenizer_class (str, optional): The class name of the tokenizer. Defaults to "AutoTokenizer". + - sentence_transformer_model (str, optional): The name of the Sentence Transformer model to use. Defaults to "paraphrase-MiniLM-L6-v2". + - use_cuda (bool, optional): Whether to use CUDA for computation. Defaults to False. + - precision (str, optional): The precision to use for computations. Defaults to "float16". + - device_map (str | Dict | None, optional): The device map for distributed training. Defaults to "auto". + - max_memory (Dict, optional): The maximum memory to allocate for each device. Defaults to {0: "24GB"}. + - torchscript (bool, optional): Whether to use TorchScript. Defaults to True. + - endpoint (str, optional): The API endpoint. Defaults to "*". + - port (int, optional): The port to listen on. Defaults to 3000. + - cors_domain (str, optional): The CORS domain. Defaults to "http://localhost:3000". 
+ - username (str, optional): The username for authentication. Defaults to None. + - password (str, optional): The password for authentication. Defaults to None. + - **model_args (Any): Additional arguments for the model. + + Returns: + None + """ + self.model_name = model_name + self.model_class = model_class + self.tokenizer_class = tokenizer_class + self.use_cuda = use_cuda + self.quantization = quantization + self.precision = precision + self.device_map = device_map + self.max_memory = max_memory + self.torchscript = torchscript + self.model_args = model_args + + if ":" in model_name: + model_revision = model_name.split(":")[1] + tokenizer_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + tokenizer_name = model_name + else: + model_revision = None + tokenizer_revision = None + tokenizer_name = model_name + self.model_name = model_name + self.model_revision = model_revision + self.tokenizer_name = tokenizer_name + self.tokenizer_revision = tokenizer_revision + + self.model, self.tokenizer = self.load_models( + model_name=self.model_name, + tokenizer_name=self.tokenizer_name, + model_revision=self.model_revision, + tokenizer_revision=self.tokenizer_revision, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, + use_cuda=self.use_cuda, + precision=self.precision, + quantization=self.quantization, + device_map=self.device_map, + max_memory=self.max_memory, + torchscript=self.torchscript, + **self.model_args, + ) + self.sentence_transformer_model = SentenceTransformer(model_name, device="cuda" if use_cuda else "cpu") + + def CORS(): + cherrypy.response.headers["Access-Control-Allow-Origin"] = "http://localhost:3000" + cherrypy.response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS" + cherrypy.response.headers["Access-Control-Allow-Headers"] = "Content-Type" + cherrypy.response.headers["Access-Control-Allow-Credentials"] = "true" + + if cherrypy.request.method == "OPTIONS": + 
cherrypy.response.status = 200 + return True + + cherrypy.config.update( + { + "server.socket_host": "0.0.0.0", + "server.socket_port": port, + "log.screen": False, + "tools.CORS.on": True, + } + ) + + cherrypy.tools.CORS = cherrypy.Tool("before_handler", CORS) + cherrypy.tree.mount(self, "/api/v1/", {"/": {"tools.CORS.on": True}}) + cherrypy.tools.CORS = cherrypy.Tool("before_finalize", CORS) + cherrypy.engine.start() + cherrypy.engine.block() diff --git a/geniusrise_text/embeddings/bulk.py b/geniusrise_text/embeddings/bulk.py new file mode 100644 index 0000000..3cb792e --- /dev/null +++ b/geniusrise_text/embeddings/bulk.py @@ -0,0 +1,413 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import pickle +import sqlite3 +import uuid +import xml.etree.ElementTree as ET +from typing import Any, Dict, Optional, Tuple + +import pandas as pd +import pyarrow.feather as feather +import pyarrow.parquet as pq +import torch +import transformers +import yaml # type: ignore +from datasets import Dataset +from geniusrise import BatchInput, BatchOutput, Bolt, State +from geniusrise.logging import setup_logger +from sentence_transformers import SentenceTransformer +from transformers import AutoModelForCausalLM, AutoTokenizer + +from geniusrise_text.embeddings.embeddings import ( + generate_combination_embeddings, + generate_contiguous_embeddings, + generate_permutation_embeddings, + generate_sentence_transformer_embeddings, +) + + +class EmbeddingsBulk(Bolt): + r""" + The `EmbeddingsBulk` class is designed to generate embeddings in bulk for various types of text data. + It supports multiple data formats: JSONL, CSV, Parquet, JSON, XML, YAML, TSV, Excel, SQLite, and Feather. + + Args: + input (BatchInput): An instance of the BatchInput class for reading the data. + output (BatchOutput): An instance of the BatchOutput class for saving the data. + state (State): An instance of the State class for maintaining the state. + **kwargs: Additional keyword arguments. 
+ + CLI Usage: + + ```bash + genius EmbeddingsBulk rise \ + batch \ + --bucket my_bucket \ + --s3_folder s3/input \ + batch \ + --bucket my_bucket \ + --s3_folder s3/output \ + none \ + process \ + --args model_name=bert-base-uncased tokenizer_name=bert-base-uncased use_cuda=true + ``` + + YAML Configuration: + + ```yaml + version: "1" + bolts: + generate_embeddings: + name: "EmbeddingsBulk" + method: "process" + args: + model_name: "bert-base-uncased" + tokenizer_name: "bert-base-uncased" + use_cuda: true + input: + type: "batch" + args: + bucket: "my_bucket" + s3_folder: "s3/input" + output: + type: "batch" + args: + bucket: "my_bucket" + s3_folder: "s3/output" + ``` + """ + + def __init__(self, input: BatchInput, output: BatchOutput, state: State, **kwargs) -> None: + super().__init__(input, output, state, **kwargs) + self.log = setup_logger(self.state) + + def load_models( + self, + model_name: str, + tokenizer_name: str, + model_revision: Optional[str] = None, + tokenizer_revision: Optional[str] = None, + model_class: str = "AutoModelForCausalLM", + tokenizer_class: str = "AutoTokenizer", + use_cuda: bool = False, + precision: str = "float16", + quantization: int = 0, + device_map: str | Dict | None = "auto", + max_memory={0: "24GB"}, + torchscript: bool = False, + compile: bool = False, + **model_args: Any, + ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: + """ + Loads a Hugging Face model and tokenizer optimized for inference. + + Parameters: + - model_name (str): The name of the model to load. + - model_class (str): The class name of the model to load. Default is "AutoModelForCausalLM". + - tokenizer_class (str): The class name of the tokenizer to load. Default is "AutoTokenizer". + - use_cuda (bool): Whether to use CUDA for GPU acceleration. Default is False. + - precision (str): The bit precision for model and tokenizer. Options are 'float32', 'float16', 'bfloat16'. Default is 'float16'. + - device_map (Union[str, Dict]): Device map for model placement. 
Default is "auto". + - max_memory (Dict): Maximum GPU memory to be allocated. + - model_args (Any): Additional keyword arguments for the model. + + Returns: + Tuple[AutoModelForCausalLM, AutoTokenizer]: The loaded model and tokenizer. + + Usage: + ```python + model, tokenizer = load_models("gpt-2", use_cuda=True, precision='float32', quantize=True, quantize_bits=8) + ``` + """ + self.log.info(f"Loading Hugging Face model: {model_name}") + + # Determine the torch dtype based on precision + if precision == "float16": + torch_dtype = torch.float16 + elif precision == "float32": + torch_dtype = torch.float32 + elif precision == "bfloat16": + torch_dtype = torch.bfloat16 + else: + raise ValueError("Unsupported precision. Choose from 'float32', 'float16', 'bfloat16'.") + + if use_cuda and not device_map: + device_map = "auto" + + ModelClass = getattr(transformers, model_class) + TokenizerClass = getattr(transformers, tokenizer_class) + + # Load the model and tokenizer + tokenizer = TokenizerClass.from_pretrained(tokenizer_name, revision=tokenizer_revision, torch_dtype=torch_dtype) + + self.log.info(f"Loading model from {model_name} {model_revision} with {model_args}") + if quantization == 8: + model = ModelClass.from_pretrained( + model_name, + revision=model_revision, + torchscript=torchscript, + max_memory=max_memory, + device_map=device_map, + load_in_8bit=True, + **model_args, + ) + elif quantization == 4: + model = ModelClass.from_pretrained( + model_name, + revision=model_revision, + torchscript=torchscript, + max_memory=max_memory, + device_map=device_map, + load_in_4bit=True, + **model_args, + ) + else: + model = ModelClass.from_pretrained( + model_name, + revision=model_revision, + torch_dtype=torch_dtype, + torchscript=torchscript, + max_memory=max_memory, + device_map=device_map, + **model_args, + ) + + # Set to evaluation mode for inference + model.eval() + + if tokenizer and tokenizer.eos_token and (not tokenizer.pad_token): + tokenizer.pad_token = 
tokenizer.eos_token + + self.log.debug("Hugging Face model and tokenizer loaded successfully.") + return model, tokenizer + + def generate( + self, + kind: str, + model_name: str, + model_class: str = "AutoModelForCausalLM", + tokenizer_class: str = "AutoTokenizer", + sentence_transformer_model: str = "paraphrase-MiniLM-L6-v2", + use_cuda: bool = False, + precision: str = "float16", + quantization: int = 0, + device_map: str | Dict | None = "auto", + max_memory={0: "24GB"}, + torchscript: bool = False, + compile: bool = False, + batch_size: int = 32, + **model_args: Any, + ) -> None: + """ + Generate embeddings in bulk for various types of text data. + + Args: + **kwargs: Additional keyword arguments. + + This method reads text data from the specified input path, generates embeddings, and saves them to the specified output path. + """ + self.model_name = model_name + self.model_class = model_class + self.tokenizer_class = tokenizer_class + self.use_cuda = use_cuda + self.quantization = quantization + self.precision = precision + self.device_map = device_map + self.max_memory = max_memory + self.torchscript = torchscript + self.model_args = model_args + + if ":" in model_name: + model_revision = model_name.split(":")[1] + tokenizer_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + tokenizer_name = model_name + else: + model_revision = None + tokenizer_revision = None + tokenizer_name = model_name + self.model_name = model_name + self.model_revision = model_revision + self.tokenizer_name = tokenizer_name + self.tokenizer_revision = tokenizer_revision + + if self.use_cuda and self.device_map is None: + self.device_map = "cuda:0" + + if kind == "sentence": + self.sentence_transformer_model = SentenceTransformer(model_name, device="cuda" if use_cuda else "cpu") + else: + self.model, self.tokenizer = self.load_models( + model_name=self.model_name, + tokenizer_name=self.tokenizer_name, + model_revision=self.model_revision, + 
tokenizer_revision=self.tokenizer_revision, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, + use_cuda=self.use_cuda, + precision=self.precision, + quantization=self.quantization, + device_map=self.device_map, + max_memory=self.max_memory, + torchscript=self.torchscript, + **self.model_args, + ) + + dataset_path = self.input.input_folder + output_path = self.output.output_folder + + # Load dataset + _dataset = self._load_dataset(dataset_path) + if _dataset is None: + self.log.error("Failed to load dataset.") + return + dataset = _dataset["text"] + + # Generate embeddings + embeddings: Any + if kind == "sentence": + embeddings = generate_sentence_transformer_embeddings( + sentences=dataset, + model=self.sentence_transformer_model, + use_cuda=self.use_cuda, + batch_size=batch_size, + ) + return self._save_embeddings(embeddings, output_path) + elif kind == "sentence_windows": + embeddings = [ + generate_contiguous_embeddings( + sentence=sentence, + model=self.model, + tokenizer=self.tokenizer, + output_key="last_hidden_state", + use_cuda=self.use_cuda, + ) + for sentence in dataset + ] + return self._save_embeddings(embeddings, output_path) + elif kind == "sentence_combinations": + embeddings = [ + generate_combination_embeddings( + sentence=sentence, + model=self.model, + tokenizer=self.tokenizer, + output_key="last_hidden_state", + use_cuda=self.use_cuda, + ) + for sentence in dataset + ] + return self._save_embeddings(embeddings, output_path) + elif kind == "sentence_permutations": + embeddings = [ + generate_permutation_embeddings( + sentence=sentence, + model=self.model, + tokenizer=self.tokenizer, + output_key="last_hidden_state", + use_cuda=self.use_cuda, + ) + for sentence in dataset + ] + return self._save_embeddings(embeddings, output_path) + + def _load_dataset(self, dataset_path: str) -> Optional[Dataset]: + """ + Load a text dataset from a directory. + + Args: + dataset_path (str): The path to the dataset directory. 
+ + Returns: + Dataset: The loaded dataset. + + Raises: + Exception: If there was an error loading the dataset. + """ + data = [] + for filename in os.listdir(dataset_path): + filepath = os.path.join(dataset_path, filename) + try: + if filename.endswith(".jsonl"): + with open(filepath, "r") as f: + for line in f: + example = json.loads(line) + data.append(example) + + elif filename.endswith(".csv"): + df = pd.read_csv(filepath) + data.extend(df.to_dict("records")) + + elif filename.endswith(".parquet"): + df = pq.read_table(filepath).to_pandas() + data.extend(df.to_dict("records")) + + elif filename.endswith(".json"): + with open(filepath, "r") as f: + json_data = json.load(f) + data.extend(json_data) + + elif filename.endswith(".xml"): + tree = ET.parse(filepath) + root = tree.getroot() + for record in root.findall("record"): + text = record.find("text").text # type: ignore + data.append({"text": text}) + + elif filename.endswith(".yaml") or filename.endswith(".yml"): + with open(filepath, "r") as f: + yaml_data = yaml.safe_load(f) + data.extend(yaml_data) + + elif filename.endswith(".tsv"): + df = pd.read_csv(filepath, sep="\t") + data.extend(df.to_dict("records")) + + elif filename.endswith((".xls", ".xlsx")): + df = pd.read_excel(filepath) + data.extend(df.to_dict("records")) + + elif filename.endswith(".db"): + conn = sqlite3.connect(filepath) + query = "SELECT text FROM dataset_table;" + df = pd.read_sql_query(query, conn) + data.extend(df.to_dict("records")) + + elif filename.endswith(".feather"): + df = feather.read_feather(filepath) + data.extend(df.to_dict("records")) + + except Exception as e: + self.log.exception(f"Error occurred when loading dataset from {filepath}. Error: {e}") + raise + + if not data: + self.log.error("No data found.") + return None + + return Dataset.from_pandas(pd.DataFrame(data)) + + def _save_embeddings(self, embeddings: Dict[str, Any], output_path: str) -> None: + """ + Save the generated embeddings to the specified output path. 
def generate_sentence_transformer_embeddings(
    sentences: Union[str, List[str]], model: Any, use_cuda: bool = False, batch_size: int = 32
) -> np.ndarray:
    """
    Generates embeddings for given sentences using sentence-transformers.

    Parameters:
    - sentences (Union[str, List[str]]): The sentence(s) for which to generate the embeddings.
    - model (Any): The sentence-transformer model to use. Only its
      `encode(sentences, convert_to_numpy=..., batch_size=...)` method is called.
    - use_cuda (bool, optional): When False (the default) the model is asked for NumPy output.
      When True the model's native output is returned unconverted, matching the Hugging Face
      helpers in this module, which only convert to NumPy when `use_cuda` is False.
      Defaults to False.
    - batch_size (int, optional): Batch size for the sentence-transformer model. Defaults to 32.

    Returns:
        np.ndarray: The generated embeddings. If a single sentence was passed, returns the
        single embedding (the first element of the model output) instead of a batch.

    Note:
    - The embeddings are directly from the sentence-transformer model.
    """
    # A bare string is wrapped in a list for the model and unwrapped again on return.
    is_single_sentence = isinstance(sentences, str)
    batch = [sentences] if is_single_sentence else sentences

    # NumPy conversion is wanted exactly when we are NOT keeping results on the GPU.
    # The flag used to be passed through un-negated, so the default call skipped the
    # conversion that the return annotation promises.
    embeddings = model.encode(batch, convert_to_numpy=not use_cuda, batch_size=batch_size)

    return embeddings[0] if is_single_sentence else embeddings
def generate_contiguous_embeddings(
    sentence: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    output_key: str = "last_hidden_state",
    use_cuda: bool = False,
) -> List[Tuple[np.ndarray, str]]:
    """
    Generates embeddings for all contiguous subsets of words in a given sentence using a Hugging Face model.

    The sentence is split on whitespace and every window `words[start:end]` is embedded
    with `generate_embeddings`, so extraction and averaging behave exactly as they do there.

    Parameters:
    - sentence (str): The sentence for which to generate the embeddings. Can contain multiple words separated by space.
    - model (PreTrainedModel): The Hugging Face model to use.
    - tokenizer (PreTrainedTokenizer): The tokenizer for the model.
    - output_key (str, optional): The key to use to extract embeddings from the model output. Defaults to 'last_hidden_state'.
    - use_cuda (bool, optional): Whether to use CUDA for computation. When True the embeddings are
      returned as produced by the model rather than converted to NumPy. Defaults to False.

    Returns:
        List[Tuple[np.ndarray, str]]: A list of tuples, each containing the generated embeddings and the term.
        The list is empty when the sentence contains no words.
    """
    words = sentence.split()
    embeddings_list = []

    # Windows are enumerated by end index first, then by start index.
    for end_idx in range(1, len(words) + 1):
        for start_idx in range(end_idx):
            sub_sentence = " ".join(words[start_idx:end_idx])
            embeddings = generate_embeddings(
                sentence=sub_sentence,
                model=model,
                tokenizer=tokenizer,
                output_key=output_key,
                use_cuda=use_cuda,
            )
            # Pair each embedding with the term it was computed from.
            embeddings_list.append((embeddings, sub_sentence))

    return embeddings_list


def generate_combination_embeddings(
    sentence: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    output_key: str = "last_hidden_state",
    use_cuda: bool = False,
) -> List[Tuple[np.ndarray, str]]:
    """
    Generates embeddings for all combinations of words in a given sentence using a Hugging Face model.

    Every non-empty combination of the whitespace-separated words (in original word order,
    sizes 1..len(words)) is joined with spaces and embedded with `generate_embeddings`.

    Parameters:
    - sentence (str): The sentence for which to generate the embeddings. Can contain multiple words separated by space.
    - model (PreTrainedModel): The Hugging Face model to use.
    - tokenizer (PreTrainedTokenizer): The tokenizer for the model.
    - output_key (str, optional): The key to use to extract embeddings from the model output. Defaults to 'last_hidden_state'.
    - use_cuda (bool, optional): Whether to use CUDA for computation. When True the embeddings are
      returned as produced by the model rather than converted to NumPy. Defaults to False.

    Returns:
        List[Tuple[np.ndarray, str]]: A list of tuples, each containing the generated embeddings and the term.
    """
    words = sentence.split()
    embeddings_list = []

    # Terms are produced lazily; the number of combinations grows as 2**len(words).
    for r in range(1, len(words) + 1):
        for subset in combinations(words, r):
            comb_term = " ".join(subset)
            embeddings = generate_embeddings(
                sentence=comb_term,
                model=model,
                tokenizer=tokenizer,
                output_key=output_key,
                use_cuda=use_cuda,
            )
            # Pair each embedding with the term it was computed from.
            embeddings_list.append((embeddings, comb_term))

    return embeddings_list


def generate_permutation_embeddings(
    sentence: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    output_key: str = "last_hidden_state",
    use_cuda: bool = False,
) -> List[Tuple[np.ndarray, str]]:
    """
    Generates embeddings for all permutations of words in a given sentence using a Hugging Face model.

    Every ordered arrangement of 1..len(words) of the whitespace-separated words is joined
    with spaces and embedded with `generate_embeddings`.

    Parameters:
    - sentence (str): The sentence for which to generate the embeddings. Can contain multiple words separated by space.
    - model (PreTrainedModel): The Hugging Face model to use.
    - tokenizer (PreTrainedTokenizer): The tokenizer for the model.
    - output_key (str, optional): The key to use to extract embeddings from the model output. Defaults to 'last_hidden_state'.
    - use_cuda (bool, optional): Whether to use CUDA for computation. When True the embeddings are
      returned as produced by the model rather than converted to NumPy. Defaults to False.

    Returns:
        List[Tuple[np.ndarray, str]]: A list of tuples, each containing the generated embeddings and the term.
    """
    words = sentence.split()
    embeddings_list = []

    # Terms are produced lazily; the number of permutations grows factorially with len(words).
    for r in range(1, len(words) + 1):
        for subset in permutations(words, r):
            perm_term = " ".join(subset)
            embeddings = generate_embeddings(
                sentence=perm_term,
                model=model,
                tokenizer=tokenizer,
                output_key=output_key,
                use_cuda=use_cuda,
            )
            # Pair each embedding with the term it was computed from.
            embeddings_list.append((embeddings, perm_term))

    return embeddings_list
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/embeddings/tests/test_bulk.py b/geniusrise_text/embeddings/tests/test_bulk.py new file mode 100644 index 0000000..6b907b7 --- /dev/null +++ b/geniusrise_text/embeddings/tests/test_bulk.py @@ -0,0 +1,131 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Helper function to create synthetic data in different formats
def create_dataset_in_format(directory, ext):
    """
    Write a ten-row synthetic dataset (`{"text": "text_<i>"}`) into `directory` in the given format.

    Args:
        directory (str): Target directory; created if it does not exist.
        ext (str): One of "huggingface", "csv", "jsonl", "parquet", "json", "xml", "yaml",
            "tsv", "xlsx", "db" or "feather". File-based formats are written as `data.<ext>`;
            "huggingface" saves a `Dataset` directly into the directory; "db" writes a SQLite
            file with a single table named `dataset_table`.

    Returns:
        str: The `directory` that was passed in.

    Raises:
        ValueError: If `ext` is not one of the supported formats. Previously an unknown
            format silently left the directory empty, which only surfaced later as a
            confusing "no data" failure.
    """
    os.makedirs(directory, exist_ok=True)
    data = [{"text": f"text_{i}"} for i in range(10)]
    df = pd.DataFrame(data)

    if ext == "huggingface":
        dataset = Dataset.from_pandas(df)
        dataset.save_to_disk(directory)
    elif ext == "csv":
        df.to_csv(os.path.join(directory, "data.csv"), index=False)
    elif ext == "jsonl":
        with open(os.path.join(directory, "data.jsonl"), "w") as f:
            for item in data:
                f.write(json.dumps(item) + "\n")
    elif ext == "parquet":
        # Local import: the file only imports pyarrow submodules at the top.
        # `feather.Table` happened to resolve, but `pyarrow.Table` is the public name.
        import pyarrow as pa

        pq.write_table(pa.Table.from_pandas(df), os.path.join(directory, "data.parquet"))
    elif ext == "json":
        with open(os.path.join(directory, "data.json"), "w") as f:
            json.dump(data, f)
    elif ext == "xml":
        # Layout: <root><record><text>...</text></record>...</root>
        root = ET.Element("root")
        for item in data:
            record = ET.SubElement(root, "record")
            ET.SubElement(record, "text").text = item["text"]
        tree = ET.ElementTree(root)
        tree.write(os.path.join(directory, "data.xml"))
    elif ext == "yaml":
        with open(os.path.join(directory, "data.yaml"), "w") as f:
            yaml.dump(data, f)
    elif ext == "tsv":
        df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t")
    elif ext == "xlsx":
        df.to_excel(os.path.join(directory, "data.xlsx"), index=False)
    elif ext == "db":
        conn = sqlite3.connect(os.path.join(directory, "data.db"))
        try:
            df.to_sql("dataset_table", conn, if_exists="replace", index=False)
        finally:
            # Close even if the write fails so the database file is not left locked.
            conn.close()
    elif ext == "feather":
        feather.write_feather(df, os.path.join(directory, "data.feather"))
    else:
        raise ValueError(f"Unsupported dataset format: {ext!r}")

    return directory
@pytest.fixture(
    params=[
        "csv",
        "json",
        "jsonl",
        "parquet",
        "xml",
        "yaml",
        "tsv",
        "xlsx",
        "db",
        "feather",
    ]
)
def dataset_file(request, tmpdir):
    """
    Parametrized fixture: writes the synthetic dataset into `<tmpdir>/input` once per format.

    Returns:
        Tuple[str, str]: The input directory and the format that was written into it.
    """
    ext = request.param
    directory = create_dataset_in_format(f"{tmpdir}/input", ext)
    return directory, ext


@pytest.fixture
def embeddings_bulk_bolt(tmpdir):
    """
    Build an `EmbeddingsBulk` bolt wired to `<tmpdir>/input` and `<tmpdir>/output`.

    `dataset_file` writes into the same `<tmpdir>/input`, so both directories are created
    with `exist_ok=True`; the fixture must not depend on being instantiated first.
    """
    input_dir = f"{tmpdir}/input"
    output_dir = f"{tmpdir}/output"
    os.makedirs(input_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)
    # Local names avoid shadowing the builtin `input`.
    batch_input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input")
    batch_output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output")
    state = InMemoryState()
    bolt = EmbeddingsBulk(
        input=batch_input,
        output=batch_output,
        state=state,
    )
    return bolt
@pytest.mark.parametrize("model_name", SENTENCE_TRANSFORMERS_MODELS)
def test_generate_sentence_transformer_embeddings(model_name):
    """
    Each embedding's first dimension must equal the backing model's `hidden_size`
    or its `max_position_embeddings`.
    """
    # Local import: this test module does not import torch at file level.
    import torch

    # Run on the GPU when one is present, otherwise fall back to CPU so the test
    # does not fail on machines without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = SentenceTransformer(model_name, device=device)
    # Loaded only to read the reference dimensions from its config.
    _model = AutoModel.from_pretrained(model_name)
    sentences = ["This is a test sentence.", "Another test sentence."]
    embeddings = generate_sentence_transformer_embeddings(sentences=sentences, model=model)
    assert all(
        x.shape[0] == _model.config.hidden_size or x.shape[0] == _model.config.max_position_embeddings
        for x in embeddings
    )
@pytest.mark.parametrize("model_name", MODEL_NAMES)
def test_generate_contiguous_embeddings(model_name):
    """Every contiguous-window embedding must have shape (1, hidden_size)."""
    # Local import: this test module does not import torch at file level.
    import torch

    # Prefer the first GPU, but fall back to CPU so the test runs without CUDA.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    model = AutoModel.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentence = "This is a test sentence."
    embeddings_list = generate_contiguous_embeddings(sentence=sentence, model=model, tokenizer=tokenizer)
    assert all(embeddings.shape == (1, model.config.hidden_size) for embeddings, _ in embeddings_list)


@pytest.mark.parametrize("model_name", MODEL_NAMES)
def test_generate_combination_embeddings(model_name):
    """Every word-combination embedding must have shape (1, hidden_size)."""
    # Local import: this test module does not import torch at file level.
    import torch

    # Prefer the first GPU, but fall back to CPU so the test runs without CUDA.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    model = AutoModel.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentence = "This is a test sentence."
    embeddings_list = generate_combination_embeddings(sentence=sentence, model=model, tokenizer=tokenizer)
    assert all(embeddings.shape == (1, model.config.hidden_size) for embeddings, _ in embeddings_list)
+ embeddings_list = generate_permutation_embeddings(sentence=sentence, model=model, tokenizer=tokenizer) + assert all(embeddings.shape == (1, model.config.hidden_size) for embeddings, _ in embeddings_list) diff --git a/geniusrise_text/instruction/__init__.py b/geniusrise_text/instruction/__init__.py new file mode 100644 index 0000000..7bb2c55 --- /dev/null +++ b/geniusrise_text/instruction/__init__.py @@ -0,0 +1,18 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import InstructionAPI +from .bulk import InstructionBulk +from .fine_tune import InstructionFineTuner diff --git a/geniusrise_text/instruction/api.py b/geniusrise_text/instruction/api.py new file mode 100644 index 0000000..d86df34 --- /dev/null +++ b/geniusrise_text/instruction/api.py @@ -0,0 +1,481 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, Dict, Optional, Iterator + +import asyncio +import cherrypy +import llama_cpp +from concurrent.futures import ThreadPoolExecutor +from geniusrise_text.base import TextAPI +from geniusrise import BatchInput, BatchOutput, State +from geniusrise.logging import setup_logger +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.protocol import ChatCompletionRequest + + +class InstructionAPI(TextAPI): + r""" + InstructionAPI is designed for generating text based on prompts using instruction-tuned language models. + It serves as an interface to Hugging Face's pre-trained instruction-tuned models, providing a flexible API + for various text generation tasks. It can be used in scenarios ranging from generating creative content to + providing instructions or answers based on the prompts. + + Attributes: + model (Any): The loaded instruction-tuned language model. + tokenizer (Any): The tokenizer for processing text suitable for the model. + + Methods: + complete(**kwargs: Any) -> Dict[str, Any]: + Generates text based on the given prompt and decoding strategy. + + listen(**model_args: Any) -> None: + Starts a server to listen for text generation requests. 
+ + CLI Usage Example: + ```bash + genius InstructionAPI rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + listen \ + --args \ + model_name="TheBloke/Mistral-7B-OpenOrca-AWQ" \ + model_class="AutoModelForCausalLM" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="float16" \ + quantization=0 \ + device_map="auto" \ + max_memory=None \ + torchscript=False \ + awq_enabled=True \ + flash_attention=True \ + endpoint="*" \ + port=3001 \ + cors_domain="http://localhost:3000" \ + username="user" \ + password="password" + ``` + + Or using VLLM: + ```bash + genius InstructionAPI rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id mistralai/Mistral-7B-Instruct-v0.1 \ + listen \ + --args \ + model_name="mistralai/Mistral-7B-Instruct-v0.1" \ + model_class="AutoModelForCausalLM" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="bfloat16" \ + quantization=0 \ + device_map="auto" \ + max_memory=None \ + torchscript=False \ + use_vllm=True \ + vllm_enforce_eager=True \ + vllm_max_model_len=1024 \ + concurrent_queries=False \ + endpoint="*" \ + port=3000 \ + cors_domain="http://localhost:3000" \ + username="user" \ + password="password" + ``` + + or using llama.cpp: + ```bash + genius InstructionAPI rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + listen \ + --args \ + model_name="TheBloke/Mistral-7B-Instruct-v0.2-GGUF" \ + model_class="AutoModelForCausalLM" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + use_llama_cpp=True \ + llama_cpp_filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf" \ + llama_cpp_n_gpu_layers=35 \ + llama_cpp_n_ctx=32768 \ + concurrent_queries=False \ + endpoint="*" \ + port=3000 \ + cors_domain="http://localhost:3000" \ + username="user" \ + password="password" + ``` + """ + + model: Any + tokenizer: Any + + def __init__( + self, + input: BatchInput, + output: BatchOutput, + state: 
State, + **kwargs: Any, + ): + """ + Initializes a new instance of the InstructionAPI class, setting up the necessary configurations + for input, output, and state. + + Args: + input (BatchInput): Configuration for the input data. + output (BatchOutput): Configuration for the output data. + state (State): The state of the API. + **kwargs (Any): Additional keyword arguments for extended functionality. + """ + super().__init__(input=input, output=output, state=state) + self.log = setup_logger(self) + self.hf_pipeline = None + self.vllm_server: Optional[OpenAIServingChat] = None + self.event_loop: Any = None + self.executor = ThreadPoolExecutor(max_workers=4) + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def complete(self, **kwargs: Any) -> Dict[str, Any]: + """ + Handles POST requests to generate text based on the given prompt and decoding strategy. It uses the pre-trained + model specified in the setup to generate a completion for the input prompt. + + Args: + **kwargs (Any): Arbitrary keyword arguments containing the 'prompt' and other parameters for text generation. + + Returns: + Dict[str, Any]: A dictionary containing the original prompt and the generated completion. 
+ + Example CURL Requests: + ```bash + /usr/bin/curl -X POST localhost:3001/api/v1/complete \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "<|system|>\n<|end|>\n<|user|>\nHow do I sort a list in Python?<|end|>\n<|assistant|>", + "decoding_strategy": "generate", + "max_new_tokens": 100, + "do_sample": true, + "temperature": 0.7, + "top_k": 50, + "top_p": 0.95 + }' | jq + ``` + """ + data = cherrypy.request.json + prompt = data.get("prompt") + decoding_strategy = data.get("decoding_strategy", "generate") + + generation_params = data + if "decoding_strategy" in generation_params: + del generation_params["decoding_strategy"] + if "prompt" in generation_params: + del generation_params["prompt"] + + return { + "prompt": prompt, + "args": data, + "completion": self.generate(prompt=prompt, decoding_strategy=decoding_strategy, **generation_params), + } + + def initialize_pipeline(self): + """ + Lazy initialization of the Hugging Face pipeline for chat interaction. + """ + if not self.hf_pipeline: + model = AutoModelForCausalLM.from_pretrained(self.model_name) + tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if self.use_cuda: + model.cuda() + self.hf_pipeline = pipeline("conversational", model=model, tokenizer=tokenizer) + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def chat(self, **kwargs: Any) -> Dict[str, Any]: + """ + Handles chat interaction using the Hugging Face pipeline. This method enables conversational text generation, + simulating a chat-like interaction based on user and system prompts. + + Args: + **kwargs (Any): Arbitrary keyword arguments containing 'user_prompt' and 'system_prompt'. + + Returns: + Dict[str, Any]: A dictionary containing the user prompt, system prompt, and chat interaction results. 
+ + Example CURL Request for chat interaction: + ```bash + /usr/bin/curl -X POST localhost:3001/api/v1/chat \ + -H "Content-Type: application/json" \ + -d '{ + "user_prompt": "What is the capital of France?", + "system_prompt": "The capital of France is" + }' | jq + ``` + """ + self.initialize_pipeline() # Initialize the pipeline on first API hit + + data = cherrypy.request.json + user_prompt = data.get("user_prompt") + system_prompt = data.get("system_prompt") + + result = self.hf_pipeline(user_prompt, system_prompt) # type: ignore + + return {"user_prompt": user_prompt, "system_prompt": system_prompt, "result": result} + + def initialize_vllm(self, chat_template: str, response_role: str = "assistant"): + self.vllm_server = OpenAIServingChat( + engine=self.model, served_model=self.model_name, response_role=response_role, chat_template=chat_template + ) + self.event_loop = asyncio.new_event_loop() + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def chat_vllm(self, **kwargs: Any) -> Dict[str, Any]: + """ + Handles POST requests to generate chat completions using the VLLM (Versatile Language Learning Model) engine. + This method accepts various parameters for customizing the chat completion request, including message content, + generation settings, and more. + + Args: + messages (List[Dict[str, str]]): The chat messages for generating a response. Each message should include a 'role' (either 'user' or 'system') and 'content'. + temperature (float, optional): The sampling temperature. Defaults to 0.7. Higher values generate more random completions. + top_p (float, optional): The nucleus sampling probability. Defaults to 1.0. A smaller value leads to higher diversity. + n (int, optional): The number of completions to generate. Defaults to 1. + max_tokens (int, optional): The maximum number of tokens to generate. Controls the length of the generated response. 
+ stop (Union[str, List[str]], optional): Sequence(s) where the generation should stop. Can be a single string or a list of strings. + stream (bool, optional): Whether to stream the response. Streaming may be useful for long completions. + presence_penalty (float, optional): Adjusts the likelihood of tokens based on their presence in the conversation so far. Defaults to 0.0. + frequency_penalty (float, optional): Adjusts the likelihood of tokens based on their frequency in the conversation so far. Defaults to 0.0. + logit_bias (Dict[str, float], optional): Adjustments to the logits of specified tokens, identified by token IDs as keys and adjustment values as values. + user (str, optional): An identifier for the user making the request. Can be used for logging or customization. + best_of (int, optional): Generates 'n' completions server-side and returns the best one. Higher values incur more computation cost. + top_k (int, optional): Filters the generated tokens to the top-k tokens with the highest probabilities. Defaults to -1, which disables top-k filtering. + ignore_eos (bool, optional): Whether to ignore the end-of-sentence token in generation. Useful for more fluid continuations. + use_beam_search (bool, optional): Whether to use beam search instead of sampling for generation. Beam search can produce more coherent results. + stop_token_ids (List[int], optional): List of token IDs that should cause generation to stop. + skip_special_tokens (bool, optional): Whether to skip special tokens (like padding or end-of-sequence tokens) in the output. + spaces_between_special_tokens (bool, optional): Whether to insert spaces between special tokens in the output. + add_generation_prompt (bool, optional): Whether to prepend the generation prompt to the output. + echo (bool, optional): Whether to include the input prompt in the output. + repetition_penalty (float, optional): Penalty applied to tokens that have been generated previously. 
Defaults to 1.0, which applies no penalty. + min_p (float, optional): Sets a minimum threshold for token probabilities. Tokens with probabilities below this threshold are filtered out. + include_stop_str_in_output (bool, optional): Whether to include the stop string(s) in the output. + length_penalty (float, optional): Exponential penalty to the length for beam search. Only relevant if use_beam_search is True. + + Returns: + Dict[str, Any]: A dictionary with the chat completion response or an error message. + + Example CURL Request: + ```bash + curl -X POST "http://localhost:3000/api/v1/chat_vllm" \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + {"role": "user", "content": "Whats the weather like in London?"} + ], + "temperature": 0.7, + "top_p": 1.0, + "n": 1, + "max_tokens": 50, + "stream": false, + "presence_penalty": 0.0, + "frequency_penalty": 0.0, + "logit_bias": {}, + "user": "example_user" + }' + ``` + This request asks the VLLM engine to generate a completion for the provided chat context, with specified generation settings. 
+ """ + # Extract data from the POST request + data = cherrypy.request.json + response_role = data.get("response_role", "assistant") + chat_template = data.get("chat_template", None) + + # Initialize VLLM server with chat template and response role if not already initialized + if not hasattr(self, "vllm_server") or self.vllm_server is None: + self.initialize_vllm(chat_template=chat_template, response_role=response_role) + + # Prepare the chat completion request + chat_request = ChatCompletionRequest( + model=self.model_name, + messages=data.get("messages"), + temperature=data.get("temperature", 0.7), + top_p=data.get("top_p", 1.0), + n=data.get("n", 1), + max_tokens=data.get("max_tokens"), + stop=data.get("stop", []), + stream=data.get("stream", False), + presence_penalty=data.get("presence_penalty", 0.0), + frequency_penalty=data.get("frequency_penalty", 0.0), + logit_bias=data.get("logit_bias", {}), + user=data.get("user"), + best_of=data.get("best_of"), + top_k=data.get("top_k", -1), + ignore_eos=data.get("ignore_eos", False), + use_beam_search=data.get("use_beam_search", False), + stop_token_ids=data.get("stop_token_ids", []), + skip_special_tokens=data.get("skip_special_tokens", True), + spaces_between_special_tokens=data.get("spaces_between_special_tokens", True), + add_generation_prompt=data.get("add_generation_prompt", True), + echo=data.get("echo", False), + repetition_penalty=data.get("repetition_penalty", 1.0), + min_p=data.get("min_p", 0.0), + include_stop_str_in_output=data.get("include_stop_str_in_output", False), + length_penalty=data.get("length_penalty", 1.0), + ) + + # Generate chat completion using the VLLM engine + try: + + class DummyObject: + async def is_disconnected(self): + return False + + async def async_call(): + response = await self.vllm_server.create_chat_completion( + request=chat_request, raw_request=DummyObject() + ) + return response + + chat_completion = asyncio.run(async_call()) + + return chat_completion.model_dump() if 
@cherrypy.expose
@cherrypy.tools.json_in()
@cherrypy.tools.json_out()
@cherrypy.tools.allow(methods=["POST"])
def chat_llama_cpp(self, **kwargs: Any) -> Dict[str, Any]:
    """
    Handles POST requests to generate chat completions using the llama.cpp engine.

    The parameters below are read from the JSON request body (``cherrypy.request.json``), not from
    ``**kwargs``. Any field that is omitted falls back to the default shown.

    Args:
        messages (List[Dict[str, str]]): The chat messages for generating a response. Defaults to [].
        functions (Optional[List[Dict]]): A list of functions to use for the chat completion (advanced usage).
        function_call (Optional[Dict]): A function call to use for the chat completion (advanced usage).
        tools (Optional[List[Dict]]): A list of tools to use for the chat completion (advanced usage).
        tool_choice (Optional[Dict]): A tool choice option for the chat completion (advanced usage).
        temperature (float): The temperature to use for sampling. Defaults to 0.2.
        top_p (float): The nucleus sampling top-p parameter. Defaults to 0.95.
        top_k (int): The top-k sampling parameter. Defaults to 40.
        min_p (float): The minimum probability threshold for sampling. Defaults to 0.05.
        typical_p (float): The typical-p parameter for locally typical sampling. Defaults to 1.0.
        stream (bool): Flag to stream the results. Defaults to False.
        stop (Optional[Union[str, List[str]]]): Sequences where generation should stop. Defaults to [].
        seed (Optional[int]): Seed for random number generation.
        response_format (Optional[Dict]): Specifies the format of the generated response.
        max_tokens (Optional[int]): Maximum number of tokens to generate.
        presence_penalty (float): Penalty for token presence. Defaults to 0.0.
        frequency_penalty (float): Penalty for token frequency. Defaults to 0.0.
        repeat_penalty (float): Penalty applied to tokens that are repeated. Defaults to 1.1.
        tfs_z (float): Tail-free sampling parameter. Defaults to 1.0.
        mirostat_mode (int): Mirostat sampling mode. Defaults to 0.
        mirostat_tau (float): Tau parameter for mirostat sampling. Defaults to 5.0.
        mirostat_eta (float): Eta parameter for mirostat sampling. Defaults to 0.1.
        model (Optional[str]): Specifies the model to use for generation.
        logits_processor (Optional[List]): List of logits processors for advanced generation control.
        grammar (Optional[Dict]): Specifies grammar rules for the generated text.
        logit_bias (Optional[Dict[str, float]]): Adjustments to the logits of specified tokens.
        logprobs (Optional[bool]): Whether to include log probabilities in the output.
        top_logprobs (Optional[int]): Number of top log probabilities to include.

    Returns:
        Dict[str, Any]: The chat completion response, or ``{"error": "<message>"}`` when generation fails.
        When the engine returns an iterator (streaming), the chunks are collected and returned as a list.

    Raises:
        ValueError: If ``self.model`` is not an initialized ``llama_cpp.Llama`` instance.

    Example CURL Request:
    ```bash
    curl -X POST "http://localhost:3000/api/v1/chat_llama_cpp" \
        -H "Content-Type: application/json" \
        -d '{
            "messages": [
                {"role": "user", "content": "What is the capital of France?"},
                {"role": "system", "content": "The capital of France is"}
            ],
            "temperature": 0.2,
            "top_p": 0.95,
            "top_k": 40,
            "max_tokens": 50
        }'
    ```
    """
    # Ensure llama.cpp model and necessary configurations are loaded and initialized
    if not self.model or not isinstance(self.model, llama_cpp.Llama):
        raise ValueError(
            "llama.cpp model is not initialized. Please initialize the model before using chat_llama_cpp."
        )

    # Extract data from the POST request
    data = cherrypy.request.json

    # Request field -> default applied when the client omits it. Built per call so the mutable
    # defaults (the lists) are never shared between requests.
    defaults: Dict[str, Any] = {
        "messages": [],
        "functions": None,
        "function_call": None,
        "tools": None,
        "tool_choice": None,
        "temperature": 0.2,
        "top_p": 0.95,
        "top_k": 40,
        "min_p": 0.05,
        "typical_p": 1.0,
        "stream": False,
        "stop": [],
        "seed": None,
        "response_format": None,
        "max_tokens": None,
        "presence_penalty": 0.0,
        "frequency_penalty": 0.0,
        "repeat_penalty": 1.1,
        "tfs_z": 1.0,
        "mirostat_mode": 0,
        "mirostat_tau": 5.0,
        "mirostat_eta": 0.1,
        "model": None,
        "logits_processor": None,
        "grammar": None,
        "logit_bias": None,
        "logprobs": None,
        "top_logprobs": None,
    }
    completion_args = {name: data.get(name, default) for name, default in defaults.items()}

    try:
        response = self.model.create_chat_completion(**completion_args)
        # A streaming response is a lazy iterator: drain it inside the ``try`` so that failures
        # raised while chunks are being produced go through the same logged error path.
        if isinstance(response, Iterator):
            response = list(response)
    except Exception as e:
        self.log.exception("Error generating chat completion using llama.cpp: %s", str(e))
        return {"error": str(e)}

    # Return the generated chat completion or the collected stream of completions
    return response
+ version: "1.0" +servers: + - url: http://localhost:3001/api/v1 + description: Development server for Instruction-based APIs +paths: + /complete: + post: + summary: Generate text based on a given prompt and optional parameters + operationId: generateTextComplete + tags: + - Text Generation + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + prompt: + type: string + description: The prompt to generate text for. + decoding_strategy: + type: string + description: The decoding strategy to use, e.g., "generate". + default: "generate" + max_new_tokens: + type: integer + description: Maximum number of new tokens to generate. + do_sample: + type: boolean + description: Whether to use sampling for generation. + temperature: + type: number + description: Temperature for sampling. + top_k: + type: integer + description: Number of highest probability tokens to keep for top-k-filtering. + top_p: + type: number + description: Cumulative probability for top-p-filtering. + required: + - prompt + responses: + 200: + description: Successfully generated text + content: + application/json: + schema: + type: object + properties: + prompt: + type: string + args: + type: object + additionalProperties: true + completion: + type: string + 400: + description: Invalid request + 500: + description: Error during text generation + + /chat: + post: + summary: Handles chat interaction using conversational text generation + operationId: chatInteraction + tags: + - Chat Interaction + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + user_prompt: + type: string + description: The user's part of the conversation. + system_prompt: + type: string + description: The system's part of the conversation to respond to user's prompt. 
+ required: + - user_prompt + - system_prompt + responses: + 200: + description: Successfully handled chat interaction + content: + application/json: + schema: + type: object + properties: + user_prompt: + type: string + system_prompt: + type: string + result: + type: array + items: + type: object + properties: + text: + type: string + score: + type: number + 400: + description: Invalid request + 500: + description: Error during chat interaction diff --git a/geniusrise_text/instruction/bulk.py b/geniusrise_text/instruction/bulk.py new file mode 100644 index 0000000..dde4b80 --- /dev/null +++ b/geniusrise_text/instruction/bulk.py @@ -0,0 +1,793 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import json +import os +import sqlite3 +import uuid +import xml.etree.ElementTree as ET +from typing import Any, Dict, List, Optional, Union + +import pandas as pd +import llama_cpp +from transformers.tokenization_utils_base import PreTrainedTokenizerBase +import yaml # type: ignore +from datasets import Dataset, load_from_disk +from geniusrise import BatchInput, BatchOutput, State +from pyarrow import feather +from pyarrow import parquet as pq +from vllm import LLM, SamplingParams + +from geniusrise_text.base import TextBulk + + +class InstructionBulk(TextBulk): + r""" + InstructionBulk is a class designed to perform bulk text generation tasks using Hugging Face's instruction-tuned language models. 
+ It is optimized for large-scale text generation, providing an efficient interface to use state-of-the-art machine learning + models for generating text based on a set of instructions or prompts. + + Attributes: + model (Any): The loaded, pre-trained instruction-tuned language model. + tokenizer (Any): The tokenizer for processing text compatible with the model. + + Methods: + load_dataset(dataset_path: str, max_length: int = 1024, **kwargs) -> Optional[Dataset]: + Loads a dataset for text generation tasks from the specified directory. + + perform(model_name: str, **kwargs: Any) -> None: + Performs bulk text generation using the specified model and tokenizer. + + Example CLI Usage: + ```bash + genius InstructionBulk rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/chat \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/chat \ + postgres \ + --postgres_host 127.0.0.1 \ + --postgres_port 5432 \ + --postgres_user postgres \ + --postgres_password postgres \ + --postgres_database geniusrise\ + --postgres_table state \ + --id mistralai/Mistral-7B-Instruct-v0.1-lol \ + perform \ + --args \ + model_name="mistralai/Mistral-7B-Instruct-v0.1" \ + model_class="AutoModelForCausalLM" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="bfloat16" \ + quantization=0 \ + device_map="auto" \ + max_memory=None \ + torchscript=False \ + decoding_strategy="generate" \ + generation_max_new_tokens=100 \ + generation_do_sample=true + ``` + + or using VLLM: + ```bash + genius InstructionBulk rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/chat \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/chat \ + none \ + --id mistralai/Mistral-7B-Instruct-v0.1 \ + perform_vllm \ + --args \ + model_name="mistralai/Mistral-7B-Instruct-v0.1" \ + use_cuda=True \ + precision="bfloat16" \ + quantization=0 \ + device_map="auto" \ + generation_temperature=0.7 \ + 
def load_dataset(self, dataset_path: str, max_length: int = 1024, **kwargs) -> Optional[Dataset]:
    r"""
    Loads a dataset of instructions from the specified directory.

    If the directory contains a `dataset_info.json` file it is loaded with `load_from_disk`. Otherwise every
    file found recursively under `dataset_path` is read according to its extension, the records of all files
    are concatenated (files are visited in sorted path order) and converted into a `Dataset`.

    Args:
        dataset_path (str): Path to the directory containing the dataset files.
        max_length (int): Stored on `self.max_length` (default is 1024).
        **kwargs: Accepted for interface compatibility; not used by this method.

    Returns:
        Optional[Dataset]: The loaded dataset.

    Raises:
        Exception: Any error raised while reading or parsing the files is logged and re-raised.

    ## Supported Data Formats and Structures:

    ### JSONL
    Each non-blank line is a JSON object representing an example.
    ```json
    {"instruction": "The instruction"}
    ```

    ### CSV
    Should contain 'instruction' columns.
    ```csv
    instruction
    "The instruction"
    ```

    ### Parquet
    Should contain 'instruction' columns.

    ### JSON
    An array of dictionaries with 'instruction' keys.
    ```json
    [{"instruction": "The instruction"}]
    ```

    ### XML
    Each 'record' element directly under the root should contain an 'instruction' child element.
    Records without one are skipped with a warning.
    ```xml
    <records>
        <record>
            <instruction>The instruction</instruction>
        </record>
    </records>
    ```

    ### YAML
    A list of dictionaries with 'instruction' keys.
    ```yaml
    - instruction: "The instruction"
    ```

    ### TSV
    Should contain 'instruction' columns separated by tabs.

    ### Excel (.xls, .xlsx)
    Should contain 'instruction' columns.

    ### SQLite (.db)
    Should contain a table named 'dataset_table' with an 'instruction' column.

    ### Feather
    Should contain 'instruction' columns.

    If the instance has a truthy `map_data` attribute, it is evaluated as a Python expression yielding a
    callable, which is applied to every record before the dataset is built.
    """
    try:
        self.log.info(f"Loading dataset from {dataset_path}")
        self.max_length = max_length

        if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")):
            # Load dataset saved by Hugging Face datasets library
            return load_from_disk(dataset_path)

        data: List[Dict[str, Any]] = []
        # glob already returns paths that include ``dataset_path``; joining them with
        # ``dataset_path`` a second time would break relative dataset directories.
        pattern = os.path.join(dataset_path, "**", "*")
        for filepath in sorted(glob.glob(pattern, recursive=True)):
            if not os.path.isfile(filepath):
                continue  # the recursive pattern also matches sub-directories

            if filepath.endswith(".jsonl"):
                with open(filepath, "r", encoding="utf-8") as f:
                    # Tolerate blank lines (e.g. a trailing newline at end of file)
                    data.extend(json.loads(line) for line in f if line.strip())

            elif filepath.endswith(".csv"):
                df = pd.read_csv(filepath)
                data.extend(df.to_dict("records"))

            elif filepath.endswith(".parquet"):
                df = pq.read_table(filepath).to_pandas()
                data.extend(df.to_dict("records"))

            elif filepath.endswith(".json"):
                with open(filepath, "r", encoding="utf-8") as f:
                    data.extend(json.load(f))

            elif filepath.endswith(".xml"):
                root = ET.parse(filepath).getroot()
                for record in root.findall("record"):
                    node = record.find("instruction")
                    if node is None:
                        self.log.warning(f"Skipping <record> without <instruction> in {filepath}")
                        continue
                    data.append({"instruction": node.text})

            elif filepath.endswith((".yaml", ".yml")):
                with open(filepath, "r", encoding="utf-8") as f:
                    # safe_load returns None for an empty document
                    data.extend(yaml.safe_load(f) or [])

            elif filepath.endswith(".tsv"):
                df = pd.read_csv(filepath, sep="\t")
                data.extend(df.to_dict("records"))

            elif filepath.endswith((".xls", ".xlsx")):
                df = pd.read_excel(filepath)
                data.extend(df.to_dict("records"))

            elif filepath.endswith(".db"):
                conn = sqlite3.connect(filepath)
                try:
                    df = pd.read_sql_query("SELECT instruction FROM dataset_table;", conn)
                finally:
                    conn.close()  # do not leak one connection per database file
                data.extend(df.to_dict("records"))

            elif filepath.endswith(".feather"):
                df = feather.read_feather(filepath)
                data.extend(df.to_dict("records"))

        map_data = getattr(self, "map_data", None)
        if map_data:
            # NOTE(security): ``map_data`` is executed as Python source via eval(); it must only
            # ever come from trusted configuration, never from dataset contents or user input.
            fn = eval(map_data)  # type: ignore
            data = [fn(d) for d in data]

        return Dataset.from_pandas(pd.DataFrame(data))
    except Exception as e:
        self.log.error(f"Error occurred when loading dataset from {dataset_path}. Error: {e}")
        raise
Dataset.from_pandas(pd.DataFrame(data)) + return dataset + except Exception as e: + self.log.error(f"Error occurred when loading dataset from {dataset_path}. Error: {e}") + raise + + def perform( + self, + model_name: str, + model_class: str = "AutoModelForCausalLM", + tokenizer_class: str = "AutoTokenizer", + use_cuda: bool = False, + precision: str = "float16", + quantization: int = 0, + device_map: str | Dict | None = "auto", + max_memory={0: "24GB"}, + torchscript: bool = False, + compile: bool = False, + awq_enabled: bool = False, + flash_attention: bool = False, + decoding_strategy: str = "generate", + notification_email: Optional[str] = None, + **kwargs: Any, + ) -> None: + """ + Performs text generation in bulk using a specified instruction-tuned model. This method handles the entire + process, including model loading, prompt processing, text generation, and saving the results. + + Args: + model_name (str): The name or path of the instruction-tuned model. + model_class (str, optional): The class of the language model. Defaults to "AutoModelForCausalLM". + tokenizer_class (str, optional): The class of the tokenizer. Defaults to "AutoTokenizer". + use_cuda (bool, optional): Whether to use CUDA for model inference. Defaults to False. + precision (str, optional): Precision for model computation. Defaults to "float16". + quantization (int, optional): Level of quantization for optimizing model size and speed. Defaults to 0. + device_map (str | Dict | None, optional): Specific device to use for computation. Defaults to "auto". + max_memory (Dict, optional): Maximum memory configuration for devices. Defaults to {0: "24GB"}. + torchscript (bool, optional): Whether to use a TorchScript-optimized version of the pre-trained language model. Defaults to False. + compile (bool, optional): Whether to compile the model before fine-tuning. Defaults to True. + awq_enabled (bool, optional): Whether to enable AWQ optimization. Defaults to False. 
+ flash_attention (bool, optional): Whether to use flash attention optimization. Defaults to False. + decoding_strategy (str, optional): Strategy for decoding the completion. Defaults to "generate". + **kwargs: Configuration and additional arguments for text generation such as model class, tokenizer class, + precision, device map, and other generation-related parameters. + + Note: + Additional arguments are passed directly to the model and tokenizer initialization and the generation method. + """ + if ":" in model_name: + model_revision = model_name.split(":")[1] + tokenizer_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + tokenizer_name = model_name + else: + model_revision = None + tokenizer_revision = None + tokenizer_name = model_name + + self.model_name = model_name + self.tokenizer_name = tokenizer_name + self.model_revision = model_revision + self.tokenizer_revision = tokenizer_revision + self.model_class = model_class + self.tokenizer_class = tokenizer_class + self.use_cuda = use_cuda + self.precision = precision + self.quantization = quantization + self.device_map = device_map + self.max_memory = max_memory + self.torchscript = torchscript + self.awq_enabled = awq_enabled + self.flash_attention = flash_attention + self.notification_email = notification_email + self.compile = compile + + model_args = {k.replace("model_", ""): v for k, v in kwargs.items() if "model_" in k} + self.model_args = model_args + + generation_args = {k.replace("generation_", ""): v for k, v in kwargs.items() if "generation_" in k} + self.generation_args = generation_args + + self.model, self.tokenizer = self.load_models( + model_name=self.model_name, + tokenizer_name=self.tokenizer_name, + model_revision=self.model_revision, + tokenizer_revision=self.tokenizer_revision, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, + use_cuda=self.use_cuda, + precision=self.precision, + quantization=self.quantization, + 
device_map=self.device_map, + max_memory=self.max_memory, + torchscript=self.torchscript, + awq_enabled=self.awq_enabled, + flash_attention=self.flash_attention, + compile=self.compile, + **self.model_args, + ) + + dataset_path = self.input.input_folder + output_path = self.output.output_folder + + # Load dataset + _dataset = self.load_dataset(dataset_path) + if _dataset is None: + self.log.error("Failed to load dataset.") + return + dataset = _dataset["instruction"] + + prompts = [] + completions = [] + + for _, prompt in enumerate(dataset): + completion = self.generate( + prompt=prompt, + decoding_strategy=decoding_strategy, + **generation_args, + ) + completions.append(completion) + prompts.append(prompt) + + self._save_completions(completions, prompts, output_path) + self.done() + + def perform_vllm( + self, + model_name: str, + use_cuda: bool = False, + precision: str = "float16", + quantization: int = 0, + device_map: str | Dict | None = "auto", + # VLLM params + vllm_tokenizer_mode: str = "auto", + vllm_download_dir: Optional[str] = None, + vllm_load_format: str = "auto", + vllm_seed: int = 42, + vllm_max_model_len: int = 1024, + vllm_enforce_eager: bool = False, + vllm_max_context_len_to_capture: int = 8192, + vllm_block_size: int = 16, + vllm_gpu_memory_utilization: float = 0.90, + vllm_swap_space: int = 4, + vllm_sliding_window: Optional[int] = None, + vllm_pipeline_parallel_size: int = 1, + vllm_tensor_parallel_size: int = 1, + vllm_worker_use_ray: bool = False, + vllm_max_parallel_loading_workers: Optional[int] = None, + vllm_disable_custom_all_reduce: bool = False, + vllm_max_num_batched_tokens: Optional[int] = None, + vllm_max_num_seqs: int = 64, + vllm_max_paddings: int = 512, + vllm_max_lora_rank: Optional[int] = None, + vllm_max_loras: Optional[int] = None, + vllm_max_cpu_loras: Optional[int] = None, + vllm_lora_extra_vocab_size: int = 0, + vllm_placement_group: Optional[dict] = None, + vllm_log_stats: bool = False, + # Generate params + 
notification_email: Optional[str] = None, + batch_size: int = 32, + **kwargs: Any, + ) -> None: + """ + Performs bulk text generation using the Versatile Language Learning Model (VLLM) with specified parameters + for fine-tuning model behavior, including quantization and parallel processing settings. This method is designed + to process large datasets efficiently by leveraging VLLM capabilities for generating high-quality text completions + based on provided prompts. + + Args: + model_name (str): The name or path of the VLLM model to use for text generation. + use_cuda (bool): Flag indicating whether to use CUDA for GPU acceleration. + precision (str): Precision of computations, can be "float16", "bfloat16", etc. + quantization (int): Level of quantization for model weights, 0 for none. + device_map (str | Dict | None): Specific device(s) to use for model inference. + vllm_tokenizer_mode (str): Mode of the tokenizer ("auto", "fast", or "slow"). + vllm_download_dir (Optional[str]): Directory to download and load the model and tokenizer. + vllm_load_format (str): Format to load the model, e.g., "auto", "pt". + vllm_seed (int): Seed for random number generation. + vllm_max_model_len (int): Maximum sequence length the model can handle. + vllm_enforce_eager (bool): Enforce eager execution instead of using optimization techniques. + vllm_max_context_len_to_capture (int): Maximum context length for CUDA graph capture. + vllm_block_size (int): Block size for caching mechanism. + vllm_gpu_memory_utilization (float): Fraction of GPU memory to use. + vllm_swap_space (int): Amount of swap space to use in GiB. + vllm_sliding_window (Optional[int]): Size of the sliding window for processing. + vllm_pipeline_parallel_size (int): Number of pipeline parallel groups. + vllm_tensor_parallel_size (int): Number of tensor parallel groups. + vllm_worker_use_ray (bool): Whether to use Ray for model workers. 
+ vllm_max_parallel_loading_workers (Optional[int]): Maximum number of workers for parallel loading. + vllm_disable_custom_all_reduce (bool): Disable custom all-reduce kernel and fall back to NCCL. + vllm_max_num_batched_tokens (Optional[int]): Maximum number of tokens to be processed in a single iteration. + vllm_max_num_seqs (int): Maximum number of sequences to be processed in a single iteration. + vllm_max_paddings (int): Maximum number of paddings to be added to a batch. + vllm_max_lora_rank (Optional[int]): Maximum rank for LoRA adjustments. + vllm_max_loras (Optional[int]): Maximum number of LoRA adjustments. + vllm_max_cpu_loras (Optional[int]): Maximum number of LoRA adjustments stored on CPU. + vllm_lora_extra_vocab_size (int): Additional vocabulary size for LoRA. + vllm_placement_group (Optional[dict]): Ray placement group for distributed execution. + vllm_log_stats (bool): Whether to log statistics during model operation. + notification_email (Optional[str]): Email to send notifications upon completion. + batch_size (int): Number of prompts to process in each batch for efficient memory usage. + **kwargs: Additional keyword arguments for generation settings like temperature, top_p, etc. + + This method automates the loading of large datasets, generation of text completions, and saving results, + facilitating efficient and scalable text generation tasks. 
+ """ + if ":" in model_name: + model_revision = model_name.split(":")[1] + tokenizer_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + tokenizer_name = model_name + else: + model_revision = None + tokenizer_revision = None + tokenizer_name = model_name + + self.model_name = model_name + self.tokenizer_name = tokenizer_name + self.model_revision = model_revision + self.tokenizer_revision = tokenizer_revision + self.use_cuda = use_cuda + self.precision = precision + self.quantization = quantization + self.device_map = device_map + self.notification_email = notification_email + + self.model: LLM = self.load_models_vllm( + model=model_name, + tokenizer=tokenizer_name, + tokenizer_mode=vllm_tokenizer_mode, + trust_remote_code=True, + download_dir=vllm_download_dir, + load_format=vllm_load_format, + dtype=self._get_torch_dtype(precision), + seed=vllm_seed, + revision=model_revision, + tokenizer_revision=tokenizer_revision, + max_model_len=vllm_max_model_len, + quantization=(None if quantization == 0 else f"{quantization}-bit"), + enforce_eager=vllm_enforce_eager, + max_context_len_to_capture=vllm_max_context_len_to_capture, + block_size=vllm_block_size, + gpu_memory_utilization=vllm_gpu_memory_utilization, + swap_space=vllm_swap_space, + cache_dtype="auto", + sliding_window=vllm_sliding_window, + pipeline_parallel_size=vllm_pipeline_parallel_size, + tensor_parallel_size=vllm_tensor_parallel_size, + worker_use_ray=vllm_worker_use_ray, + max_parallel_loading_workers=vllm_max_parallel_loading_workers, + disable_custom_all_reduce=vllm_disable_custom_all_reduce, + max_num_batched_tokens=vllm_max_num_batched_tokens, + max_num_seqs=vllm_max_num_seqs, + max_paddings=vllm_max_paddings, + device="cuda" if use_cuda else "cpu", + max_lora_rank=vllm_max_lora_rank, + max_loras=vllm_max_loras, + max_cpu_loras=vllm_max_cpu_loras, + lora_dtype=self._get_torch_dtype(precision), + lora_extra_vocab_size=vllm_lora_extra_vocab_size, + 
placement_group=vllm_placement_group, # type: ignore + log_stats=vllm_log_stats, + batched_inference=True, + ) + + generation_args = {k.replace("generation_", ""): v for k, v in kwargs.items() if "generation_" in k} + self.generation_args = generation_args + + dataset_path = self.input.input_folder + output_path = self.output.output_folder + + # Load dataset + _dataset = self.load_dataset(dataset_path) + if _dataset is None: + self.log.error("Failed to load dataset.") + return + dataset = _dataset["instruction"] + + for i in range(0, len(dataset), batch_size): + batch = dataset[i : i + batch_size] + + outputs = self.model.generate( + prompts=batch, + sampling_params=SamplingParams( + n=generation_args.get("n", 1), + best_of=generation_args.get("best_of", None), + presence_penalty=generation_args.get("presence_penalty", 0.0), + frequency_penalty=generation_args.get("frequency_penalty", 0.0), + repetition_penalty=generation_args.get("repetition_penalty", 1.0), + temperature=generation_args.get("temperature", 1.0), + top_p=generation_args.get("top_p", 1.0), + top_k=generation_args.get("top_k", -1), + min_p=generation_args.get("min_p", 0.0), + use_beam_search=generation_args.get("use_beam_search", False), + length_penalty=generation_args.get("length_penalty", 1.0), + early_stopping=generation_args.get("early_stopping", False), + stop=generation_args.get("stop", None), + stop_token_ids=generation_args.get("stop_token_ids", None), + include_stop_str_in_output=generation_args.get("include_stop_str_in_output", False), + ignore_eos=generation_args.get("ignore_eos", False), + max_tokens=generation_args.get("max_tokens", 16), + logprobs=generation_args.get("logprobs", None), + prompt_logprobs=generation_args.get("prompt_logprobs", None), + skip_special_tokens=generation_args.get("skip_special_tokens", True), + spaces_between_special_tokens=generation_args.get("spaces_between_special_tokens", True), + logits_processors=generation_args.get("logits_processors", None), + ), + ) + 
completions = [" ".join(t.text for t in o.outputs) for o in outputs] + self._save_completions(completions, batch, output_path) + self.done() + + def perform_llama_cpp( + self, + model: str, + filename: Optional[str] = None, + local_dir: Optional[Union[str, os.PathLike[str]]] = None, + n_gpu_layers: int = 0, + split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER, + main_gpu: int = 0, + tensor_split: Optional[List[float]] = None, + vocab_only: bool = False, + use_mmap: bool = True, + use_mlock: bool = False, + kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None, + seed: int = llama_cpp.LLAMA_DEFAULT_SEED, + n_ctx: int = 512, + n_batch: int = 512, + n_threads: Optional[int] = None, + n_threads_batch: Optional[int] = None, + rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED, + rope_freq_base: float = 0.0, + rope_freq_scale: float = 0.0, + yarn_ext_factor: float = -1.0, + yarn_attn_factor: float = 1.0, + yarn_beta_fast: float = 32.0, + yarn_beta_slow: float = 1.0, + yarn_orig_ctx: int = 0, + mul_mat_q: bool = True, + logits_all: bool = False, + embedding: bool = False, + offload_kqv: bool = True, + last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, + lora_scale: float = 1.0, + lora_path: Optional[str] = None, + numa: Union[bool, int] = False, + chat_format: Optional[str] = None, + chat_handler: Optional[llama_cpp.llama_chat_format.LlamaChatCompletionHandler] = None, + draft_model: Optional[llama_cpp.LlamaDraftModel] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + verbose: bool = True, + notification_email: Optional[str] = None, + **kwargs, + ) -> None: + """ + Performs bulk text generation using the LLaMA model with llama.cpp backend. This method handles the entire + process, including model loading, prompt processing, text generation, and saving the results. + + Args: + model: Path or identifier for the LLaMA model. + filename: Optional filename or glob pattern to match the model file. 
+ local_dir: Local directory to save the model files. + n_gpu_layers: Number of layers to offload to GPU. + split_mode: Split mode for distributing model across GPUs. + main_gpu: Main GPU index. + tensor_split: Configuration for tensor splitting across GPUs. + vocab_only: Whether to load only the vocabulary. + use_mmap: Use memory-mapped files for model loading. + use_mlock: Lock model data in RAM to prevent swapping. + kv_overrides: Key-value pairs for overriding model config. + seed: Seed for random number generation. + n_ctx: Number of context tokens for generation. + n_batch: Batch size for processing. + n_threads: Number of threads for generation. + n_threads_batch: Number of threads for batch processing. + rope_scaling_type: Scaling type for RoPE. + rope_freq_base: Base frequency for RoPE. + rope_freq_scale: Frequency scaling for RoPE. + yarn_ext_factor: YaRN extrapolation factor. + yarn_attn_factor: YaRN attention factor. + yarn_beta_fast: YaRN beta fast parameter. + yarn_beta_slow: YaRN beta slow parameter. + yarn_orig_ctx: Original context size for YaRN. + mul_mat_q: Multiply matrices for queries. + logits_all: Return logits for all tokens. + embedding: Enable embedding mode. + offload_kqv: Offload K, Q, V matrices to GPU. + last_n_tokens_size: Size for the last_n_tokens buffer. + lora_base: Base model path for LoRA. + lora_scale: Scale factor for LoRA adjustments. + lora_path: Path for LoRA adjustments. + numa: NUMA configuration. + chat_format: Chat format configuration. + chat_handler: Handler for chat completions. + draft_model: Draft model for speculative decoding. + tokenizer: Custom tokenizer instance. + verbose: Enable verbose logging. + notification_email (Optional[str]): Email to send notifications upon completion. + **kwargs: Additional arguments for model loading and text generation. 
+ """ + self.notification_email = notification_email + + # Loading the LLaMA model with llama.cpp + llama_model, custom_tokenizer = self.load_models_llama_cpp( + model=model, + filename=filename, + local_dir=local_dir, + n_gpu_layers=n_gpu_layers, + split_mode=split_mode, + main_gpu=main_gpu, + tensor_split=tensor_split, + vocab_only=vocab_only, + use_mmap=use_mmap, + use_mlock=use_mlock, + kv_overrides=kv_overrides, + seed=seed, + n_ctx=n_ctx, + n_batch=n_batch, + n_threads=n_threads, + n_threads_batch=n_threads_batch, + rope_scaling_type=rope_scaling_type, + rope_freq_base=rope_freq_base, + rope_freq_scale=rope_freq_scale, + yarn_ext_factor=yarn_ext_factor, + yarn_attn_factor=yarn_attn_factor, + yarn_beta_fast=yarn_beta_fast, + yarn_beta_slow=yarn_beta_slow, + yarn_orig_ctx=yarn_orig_ctx, + mul_mat_q=mul_mat_q, + logits_all=logits_all, + embedding=embedding, + offload_kqv=offload_kqv, + last_n_tokens_size=last_n_tokens_size, + lora_base=lora_base, + lora_scale=lora_scale, + lora_path=lora_path, + numa=numa, + chat_format=chat_format, + chat_handler=chat_handler, + draft_model=draft_model, + tokenizer=tokenizer, + verbose=verbose, + **kwargs, + ) + + generation_args = {k.replace("generation_", ""): v for k, v in kwargs.items() if "generation_" in k} + self.generation_args = generation_args + + dataset_path = self.input.input_folder + output_path = self.output.output_folder + + # Load dataset + _dataset = self.load_dataset(dataset_path) + if _dataset is None: + self.log.error("Failed to load dataset.") + return + dataset = _dataset["instruction"] + + for i in range(0, len(dataset), n_batch): + batch = dataset[i : i + n_batch] + + # Generate completion for each prompt using llama_model + completions = llama_model.create_chat_completion( + messages=[{"content": prompt, "role": "user"} for prompt in batch], + functions=generation_args.get("functions"), + function_call=generation_args.get("function_call"), + tools=generation_args.get("tools"), + 
tool_choice=generation_args.get("tool_choice"), + temperature=generation_args.get("temperature", 0.2), + top_p=generation_args.get("top_p", 0.95), + top_k=generation_args.get("top_k", 40), + min_p=generation_args.get("min_p", 0.05), + typical_p=generation_args.get("typical_p", 1.0), + stream=generation_args.get("stream", False), + stop=generation_args.get("stop", []), + seed=generation_args.get("seed"), + response_format=generation_args.get("response_format"), + max_tokens=generation_args.get("max_tokens"), + presence_penalty=generation_args.get("presence_penalty", 0.0), + frequency_penalty=generation_args.get("frequency_penalty", 0.0), + repeat_penalty=generation_args.get("repeat_penalty", 1.1), + tfs_z=generation_args.get("tfs_z", 1.0), + mirostat_mode=generation_args.get("mirostat_mode", 0), + mirostat_tau=generation_args.get("mirostat_tau", 5.0), + mirostat_eta=generation_args.get("mirostat_eta", 0.1), + model=generation_args.get("model"), + logits_processor=generation_args.get("logits_processor"), + grammar=generation_args.get("grammar"), + logit_bias=generation_args.get("logit_bias"), + logprobs=generation_args.get("logprobs"), + top_logprobs=generation_args.get("top_logprobs"), + ) + + self._save_completions(completions["choices"][0]["message"]["content"], batch, output_path) # type: ignore + self.done() + + def _save_completions(self, completions: List[str], prompts: List[str], output_path: str) -> None: + """ + Saves the generated texts alongside their prompts to the specified output path. This method ensures the results + of text generation are persisted for later use or analysis. + + Args: + completions (List[str]): The list of generated texts. + prompts (List[str]): The list of prompts corresponding to the generated texts. + output_path (str): The directory path to save the results. 
+ """ + data_to_save = [ + {"prompt": prompt, "completion": completion} for prompt, completion in zip(prompts, completions) + ] + with open(os.path.join(output_path, f"completions-{str(uuid.uuid4())}.json"), "w") as f: + json.dump(data_to_save, f) diff --git a/huggingface/instruction_tuning.py b/geniusrise_text/instruction/fine_tune.py similarity index 67% rename from huggingface/instruction_tuning.py rename to geniusrise_text/instruction/fine_tune.py index 8106696..b4bdbcb 100644 --- a/huggingface/instruction_tuning.py +++ b/geniusrise_text/instruction/fine_tune.py @@ -1,22 +1,20 @@ # 🧠 Geniusrise # Copyright (C) 2023 geniusrise.ai # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. +# http://www.apache.org/licenses/LICENSE-2.0 # -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import glob import json -import logging import os import sqlite3 import xml.etree.ElementTree as ET @@ -25,92 +23,52 @@ import numpy as np import pandas as pd import pyarrow.parquet as pq +import torch import yaml # type: ignore -from datasets import Dataset as HFDataset -from datasets import load_from_disk +from datasets import Dataset, load_dataset, load_from_disk from nltk.translate.bleu_score import corpus_bleu from pyarrow import feather from transformers import EvalPrediction -from .base import HuggingFaceFineTuner +from geniusrise_text.base import TextFineTuner -class HuggingFaceInstructionTuningFineTuner(HuggingFaceFineTuner): +class InstructionFineTuner(TextFineTuner): r""" A bolt for fine-tuning Hugging Face models on instruction tuning tasks. + This class inherits from `TextFineTuner` and specializes in fine-tuning models for instruction-based tasks. + It provides additional methods for loading and preparing datasets in various formats, as well as computing custom metrics. + Args: input (BatchInput): The batch input data. output (OutputConfig): The output data. state (State): The state manager. - ## Using geniusrise to invoke via command line - ```bash - genius HuggingFaceInstructionTuningFineTuner rise \ - streaming \ - --input_kafka_topic webhook_test \ - --input_kafka_cluster_connection_string localhost:9094 \ - --input_kafka_consumer_group_id geniusrise \ - streaming \ - --output_kafka_topic webhook_test \ - --output_kafka_cluster_connection_string localhost:9094 \ - postgres \ - --postgres_host 127.0.0.1 \ - --postgres_port 5432 \ - --postgres_user postgres \ - --postgres_password postgres \ - --postgres_database geniusrise \ - --postgres_table state \ - listen \ - --args various=30 arguments=40 that=50 this=70 bolt=63 may=lol have='{"lol": "lel"}' - ``` + Attributes: + max_length (int): The maximum length for tokenization. 
+ + CLI Usage: - ## Using geniusrise to invoke via YAML file - ```yaml - version: "1" - bolts: - my_instruction_bolt: - name: "HuggingFaceInstructionTuningFineTuner" - method: "listen" - args: - various: 30 - arguments: 40 - that: 50 - this: 70 - bolt: 63 - may: "lol" - have: '{"lol": "lel"}' - input: - type: "streaming" - args: - input_topic: "webhook_test" - kafka_servers: "localhost:9094" - group_id: "geniusrise" - output: - type: "streaming" - args: - output_topic: "webhook_test" - kafka_servers: "localhost:9094" - state: - type: "postgres" - args: - postgres_host: "127.0.0.1" - postgres_port: 5432 - postgres_user: "postgres" - postgres_password: "postgres" - postgres_database: "geniusrise" - postgres_table: "state" - deploy: - type: "k8s" - args: - name: "my_instruction_bolt" - namespace: "default" - image: "my_instruction_bolt_image" - replicas: 1 + ```bash + genius InstructionFineTuner rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id mistralai/Mistral-7B-Instruct-v0.1-lol \ + fine_tune \ + --args \ + model_name=my_model \ + tokenizer_name=my_tokenizer \ + num_train_epochs=3 \ + per_device_train_batch_size=8 \ + data_max_length=512 ``` """ - def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs: Any) -> Union[HFDataset, Dict]: + def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs: Any) -> Union[Dataset, Dict]: r""" Load an instruction tuning dataset from a directory. 
@@ -178,12 +136,13 @@ def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs: Any) """ try: - logging.info(f"Loading dataset from {dataset_path}") + self.log.info(f"Loading dataset from {dataset_path}") self.max_length = max_length - if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): + if self.use_huggingface_dataset: + dataset = load_dataset(self.huggingface_dataset) + elif os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): # Load dataset saved by Hugging Face datasets library dataset = load_from_disk(dataset_path) - return dataset.map(self.prepare_train_features, batched=True) else: data = [] for filename in glob.glob(f"{dataset_path}/*"): @@ -238,10 +197,17 @@ def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs: Any) df = feather.read_feather(filepath) data.extend(df.to_dict("records")) - dataset = HFDataset.from_pandas(pd.DataFrame(data)) - return dataset.map(self.prepare_train_features, batched=True) + dataset = Dataset.from_pandas(pd.DataFrame(data)) + + if hasattr(self, "map_data") and self.map_data: + fn = eval(self.map_data) # type: ignore + dataset = dataset.map(fn) + else: + dataset = dataset + + return dataset.map(self.prepare_train_features, batched=True) except Exception as e: - logging.error(f"Error occurred when loading dataset from {dataset_path}. Error: {e}") + self.log.error(f"Error occurred when loading dataset from {dataset_path}. 
Error: {e}") raise def prepare_train_features(self, examples: Dict) -> Dict: @@ -258,21 +224,45 @@ def prepare_train_features(self, examples: Dict) -> Dict: if not self.tokenizer: raise Exception("Tokenizer not initialized") - # Tokenize the examples - encoding = self.tokenizer( + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Tokenize the instructions and outputs separately + input_encoding = self.tokenizer( examples["instruction"], + truncation=True, + padding="max_length", + max_length=self.max_length // 2, + return_tensors="pt", + ) + output_encoding = self.tokenizer( examples["output"], truncation=True, padding="max_length", - max_length=self.max_length, + max_length=self.max_length // 2, return_tensors="pt", ) - encoding["labels"] = encoding["input_ids"].clone() # Assuming that 'output' is the labels + sep_token = self.tokenizer.sep_token_id if self.tokenizer.sep_token_id else self.tokenizer.eos_token_id + # Convert sep_token to a tensor, then expand dimensions + sep_token_tensor = torch.tensor([sep_token], dtype=torch.long).unsqueeze(0) + + input_ids = torch.cat([input_encoding["input_ids"], sep_token_tensor, output_encoding["input_ids"]], dim=1) + attention_mask = torch.cat( + [ + input_encoding["attention_mask"], + torch.tensor([[1]], dtype=torch.long), + output_encoding["attention_mask"], + ], + dim=1, + ) + + # Use the tokenized output as the labels + labels = output_encoding["input_ids"] - return encoding + return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} except Exception as e: - print(f"Error preparing train features: {e}") + self.log.exception(f"Error preparing train features: {e}") raise def compute_metrics(self, eval_pred: EvalPrediction) -> Optional[Dict[str, float]]: diff --git a/geniusrise_text/instruction/tests/__init__.py b/geniusrise_text/instruction/tests/__init__.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ 
b/geniusrise_text/instruction/tests/__init__.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/instruction/tests/test_bulk.py b/geniusrise_text/instruction/tests/test_bulk.py new file mode 100644 index 0000000..3d4e9a7 --- /dev/null +++ b/geniusrise_text/instruction/tests/test_bulk.py @@ -0,0 +1,254 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import glob +import itertools +import json +import os +import sqlite3 +import tempfile +import xml.etree.ElementTree as ET + +import pandas as pd +import pytest +import torch +import yaml # type: ignore +from datasets import Dataset +from geniusrise.core import BatchInput, BatchOutput, InMemoryState +from pyarrow import feather +from pyarrow import parquet as pq + +from geniusrise_text.instruction.bulk import InstructionBulk + + +@pytest.fixture( + params=[ + # model_name, model_class, tokenizer_class, use_cuda, precision, quantization, device_map, max_memory, torchscript + # fmt: off + ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", 0, None, None, False), + ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", False, "float32", 0, None, None, False), + ("bigscience/bloom-560m", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, None, None, False), + ("meta-llama/Llama-2-7b-hf", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, None, None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, "cuda:0", None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, "cuda:0", None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 8, "cuda:0", None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, "auto", None, True), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, "auto", None, True), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 8, "auto", None, True), + # mistral + ("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", None, "cuda:0", None, False), + ("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ:gptq-8bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, 
"bfloat16", None, "cuda:0", None, False), + ("TheBloke/Mistral-7B-Code-16K-qlora-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", None, "cuda:0", None, False), + ("TheBloke/Mistral-7B-Phibrarian-32K-GPTQ", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", None, "cuda:0", None, False), + # zephyr + ("TheBloke/zephyr-7B-beta-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", None, "cuda:0", None, False), + ("TheBloke/zephyr-7B-beta-GPTQ:gptq-8bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", None, "cuda:0", None, False), + # falcon + # ("TheBloke/falcon-7b-instruct-GPTQ", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", None, "auto", None, False), + # llama-based + ("TheBloke/Llama-2-7b-Chat-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", None, "cuda:0", None, False), + ("TheBloke/Llama-2-7B-32K-Instruct-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", None, "cuda:0", None, False), + ("TheBloke/WizardLM-7B-uncensored-GPTQ", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", None, "cuda:0", None, False), + ("TheBloke/vicuna-7B-v1.5-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", None, "cuda:0", None, False), + ("TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", None, "cuda:0", None, False), + ("TheBloke/Yarn-Llama-2-7B-64K-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", None, "cuda:0", None, False), + ("TheBloke/Yarn-Llama-2-7B-128K-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", None, "cuda:0", None, False), + # fmt: on + ] +) +def model_config(request): + return request.param + + +def create_dataset_in_format(directory, ext): + 
os.makedirs(directory, exist_ok=True) + data = [{"text": f"text_{i}"} for i in range(10)] + df = pd.DataFrame(data) + + if ext == "huggingface": + dataset = Dataset.from_pandas(df) + dataset.save_to_disk(directory) + elif ext == "csv": + df.to_csv(os.path.join(directory, "data.csv"), index=False) + elif ext == "jsonl": + with open(os.path.join(directory, "data.jsonl"), "w") as f: + for item in data: + f.write(json.dumps(item) + "\n") + elif ext == "parquet": + pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) + elif ext == "json": + with open(os.path.join(directory, "data.json"), "w") as f: + json.dump(data, f) + elif ext == "xml": + root = ET.Element("root") + for item in data: + record = ET.SubElement(root, "record") + ET.SubElement(record, "text").text = item["text"] + tree = ET.ElementTree(root) + tree.write(os.path.join(directory, "data.xml")) + elif ext == "yaml": + with open(os.path.join(directory, "data.yaml"), "w") as f: + yaml.dump(data, f) + elif ext == "tsv": + df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") + elif ext == "xlsx": + df.to_excel(os.path.join(directory, "data.xlsx"), index=False) + elif ext == "db": + conn = sqlite3.connect(os.path.join(directory, "data.db")) + df.to_sql("dataset_table", conn, if_exists="replace", index=False) + conn.close() + elif ext == "feather": + feather.write_feather(df, os.path.join(directory, "data.feather")) + + +# Fixtures for each file type +@pytest.fixture( + params=[ + "huggingface", + "csv", + "jsonl", + "parquet", + "json", + "xml", + "yaml", + "tsv", + "xlsx", + "db", + "feather", + ] +) +def dataset_file(request, tmpdir): + ext = request.param + create_dataset_in_format(tmpdir, ext) + return tmpdir, ext + + +# Fixtures to initialize InstructionBulk instance +@pytest.fixture +def chatbot(): + input_dir = tempfile.mkdtemp() + output_dir = tempfile.mkdtemp() + + input = BatchInput(input_dir, "geniusrise-test", "api_input") + output = 
BatchOutput(output_dir, "geniusrise-test", "api_output") + state = InMemoryState() + + chatbot = InstructionBulk( + input=input, + output=output, + state=state, + ) + yield chatbot + + +# Define strategies and associated parameters +strategies = { + "generate": {}, + "greedy_search": {}, + "beam_search": {"num_beams": 4}, + "beam_sample": {"num_beams": 4, "temperature": 0.7, "top_k": 20}, + "group_beam_search": {"num_beams": 4, "num_beam_groups": 2}, +} + +# Define other parameters +length_params = { + "max_length": [20, 30], + "min_length": [0, 10], + "early_stopping": [False, True], +} +gen_strategy_params = { + "do_sample": [False, True], +} +logit_params = { + "temperature": [1.0, 0.7], + "top_k": [50, 20], + "top_p": [1.0, 0.9], + "repetition_penalty": [1.0, 1.5], + "length_penalty": [1.0, 0.5], + "no_repeat_ngram_size": [0, 2], +} +# Merge all the parameters into one dictionary for itertools.product +all_params = {**length_params, **gen_strategy_params, **logit_params} + + +@pytest.mark.parametrize("strategy", list(strategies.keys())) +def test_generate_strategies(chatbot, model_config, dataset_file, strategy): + ( + model_name, + model_class, + tokenizer_class, + use_cuda, + precision, + quantization, + device_map, + max_memory, + torchscript, + ) = model_config + + tmpdir, ext = dataset_file + chatbot.input.input_folder = tmpdir + + if ":" in model_name: + _model_name = model_name + model_revision = _model_name.split(":")[1] + model_name = _model_name.split(":")[0] + tokenizer_revision = _model_name.split(":")[1] + tokenizer_name = _model_name.split(":")[0] + else: + model_revision = None + tokenizer_revision = None + + # Strategy-specific params + strategy_params = strategies[strategy] + + # All possible combinations for the current strategy + param_combinations = [ + {**dict(zip(all_params.keys(), values)), **strategy_params} + for values in itertools.product(*all_params.values()) + ] + + if strategy != "generate" and ( + "32k" in model_name.lower() or 
"64k" in model_name.lower() or "128k" in model_name.lower() + ): + return + + for param_set in param_combinations: + param_set = {f"generation_{k}": v for k, v in param_set.items()} + + generated_text = chatbot.perform( + model_name=model_name, + model_revision=model_revision, + tokenizer_name=model_name, + tokenizer_revision=tokenizer_revision, + model_class=model_class, + tokenizer_class=tokenizer_class, + use_cuda=use_cuda, + precision=precision, + quantization=quantization, + device_map=device_map, + max_memory=max_memory, + torchscript=torchscript, + decoding_strategy=strategy, + model_trust_remote_code=True, + **param_set, # Unpack params into function arguments + ) + files = glob.glob(f"{chatbot.output.output_folder}/completions-*.json") + assert len(files) > 0 + break + + # Cleanup + del chatbot.model + del chatbot.tokenizer + torch.cuda.empty_cache() diff --git a/geniusrise_text/instruction/tests/test_fine_tune.py b/geniusrise_text/instruction/tests/test_fine_tune.py new file mode 100644 index 0000000..81a812d --- /dev/null +++ b/geniusrise_text/instruction/tests/test_fine_tune.py @@ -0,0 +1,330 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import sqlite3 +import tempfile +import xml.etree.ElementTree as ET + +import numpy as np +import pandas as pd +import pytest +import torch +import yaml # type: ignore +from datasets import Dataset +from geniusrise.core import BatchInput, BatchOutput, InMemoryState +from pyarrow import feather +from pyarrow import parquet as pq +from transformers import EvalPrediction + +from geniusrise_text.instruction.fine_tune import InstructionFineTuner + +lora_config = { + "r": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "bias": "none", + "task_type": "CAUSAL_LM", +} + + +# Helper function to create synthetic data in different formats +def create_dataset_in_format(directory, ext): + os.makedirs(directory, exist_ok=True) + data = [{"instruction": f"instruction_{i}", "output": f"output_{i}"} for i in range(10)] + df = pd.DataFrame(data) + + if ext == "huggingface": + dataset = Dataset.from_pandas(df) + dataset.save_to_disk(directory) + elif ext == "csv": + df.to_csv(os.path.join(directory, "data.csv"), index=False) + elif ext == "jsonl": + with open(os.path.join(directory, "data.jsonl"), "w") as f: + for item in data: + f.write(json.dumps(item) + "\n") + elif ext == "parquet": + pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) + elif ext == "json": + with open(os.path.join(directory, "data.json"), "w") as f: + json.dump(data, f) + elif ext == "xml": + root = ET.Element("root") + for item in data: + record = ET.SubElement(root, "record") + ET.SubElement(record, "instruction").text = item["instruction"] + ET.SubElement(record, "output").text = item["output"] + tree = ET.ElementTree(root) + tree.write(os.path.join(directory, "data.xml")) + elif ext == "yaml": + with open(os.path.join(directory, "data.yaml"), "w") as f: + yaml.dump(data, f) + elif ext == "tsv": + df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") + elif ext == "xlsx": + df.to_excel(os.path.join(directory, "data.xlsx"), 
index=False) + elif ext == "db": + conn = sqlite3.connect(os.path.join(directory, "data.db")) + df.to_sql("dataset_table", conn, if_exists="replace", index=False) + conn.close() + elif ext == "feather": + feather.write_feather(df, os.path.join(directory, "data.feather")) + + +# Fixtures for each file type +@pytest.fixture( + params=[ + "huggingface", + "csv", + "jsonl", + "parquet", + "json", + "xml", + "yaml", + "tsv", + "xlsx", + "db", + "feather", + ] +) +def dataset_file(request, tmpdir): + ext = request.param + create_dataset_in_format(tmpdir + "/train", ext) + create_dataset_in_format(tmpdir + "/eval", ext) + return tmpdir, ext + + +MODELS_TO_TEST = { + # fmt: off + "small": "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ", + # fmt: on +} + + +# Fixture for models +@pytest.fixture(params=MODELS_TO_TEST.items()) +def model(request): + return request.param + + +@pytest.fixture +def instruction_tuning_bolt(): + input_dir = tempfile.mkdtemp() + output_dir = tempfile.mkdtemp() + input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input") + output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output") + state = InMemoryState() + klass = InstructionFineTuner( + input=input, + output=output, + state=state, + ) + return klass + + +def test_instruction_tuning_bolt_init(instruction_tuning_bolt, model): + name, model_name = model + tokenizer_name = model_name + model_class = "AutoModelForCausalLM" + tokenizer_class = "AutoTokenizer" + + instruction_tuning_bolt.load_models( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map="cuda:0", + ) + + assert instruction_tuning_bolt.model is not None + assert instruction_tuning_bolt.tokenizer is not None + assert instruction_tuning_bolt.input is not None + assert instruction_tuning_bolt.output is not None + assert instruction_tuning_bolt.state is not None + + +def test_load_dataset_all_formats(instruction_tuning_bolt, dataset_file, model): + name, 
# Models to test
models = {
    # fmt: off
    "small": "PY007/TinyLlama-1.1B-Chat-v0.3",
    "medium": "HuggingFaceH4/zephyr-7b-beta",
    "large": "mistralai/Mistral-7B-Instruct-v0.1",
    # mistral
    "4-bit-mistral": "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ:gptq-4bit-32g-actorder_True",
    "8-bit-mistral": "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ:gptq-8bit-32g-actorder_True",
    "4-bit-mistral-code": "TheBloke/Mistral-7B-Code-16K-qlora-GPTQ:gptq-4bit-32g-actorder_True",
    "4-bit-mistral-32k": "TheBloke/Mistral-7B-Phibrarian-32K-GPTQ",
    # zephyr
    "4-bit-zephyr": "TheBloke/zephyr-7B-beta-GPTQ:gptq-4bit-32g-actorder_True",
    "8-bit-zephyr": "TheBloke/zephyr-7B-beta-GPTQ:gptq-8bit-32g-actorder_True",
    # falcon
    "4-bit-falcon": "TheBloke/falcon-7b-instruct-GPTQ",
    # llama-based
    "4-bit-llama-2": "TheBloke/Llama-2-7b-Chat-GPTQ:gptq-4bit-32g-actorder_True",
    "4-bit-llama-2-32k": "TheBloke/Llama-2-7B-32K-Instruct-GPTQ:gptq-4bit-32g-actorder_True",
    "4-bit-wizard": "TheBloke/WizardLM-7B-uncensored-GPTQ",
    "4-bit-vicuna": "TheBloke/vicuna-7B-v1.5-GPTQ:gptq-4bit-32g-actorder_True",
    "4-bit-wizard-vicuna": "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ",
    "4-bit-yarn-64k": "TheBloke/Yarn-Llama-2-7B-64K-GPTQ:gptq-4bit-32g-actorder_True",
    "4-bit-yarn-128k": "TheBloke/Yarn-Llama-2-7B-128K-GPTQ:gptq-4bit-32g-actorder_True",
    # fmt: on
}


def _release_bolt_models(bolt):
    """Drop the bolt's model/tokenizer references and release cached CUDA memory."""
    for attr in ("model", "tokenizer"):
        try:
            delattr(bolt, attr)
        except AttributeError:
            # fine_tune() may have failed before the attribute was ever set; a
            # second error raised from cleanup would hide the real failure.
            pass
    torch.cuda.empty_cache()


def _remove_fine_tune_artifacts(model_dir):
    """Delete whichever known fine-tuning artifacts exist in ``model_dir``."""
    artifacts = (
        "pytorch_model.bin",
        "adapter_model.bin",
        "config.json",
        "adapter_config.json",
        "training_args.bin",
    )
    for artifact in artifacts:
        path = os.path.join(model_dir, artifact)
        # Full fine-tuning and LoRA runs write different subsets of these files,
        # so each one is checked on its own: a missing file must not stop the
        # remaining ones from being removed.
        if os.path.exists(path):
            os.remove(path)


# Test for fine-tuning
@pytest.mark.parametrize(
    "model_name, precision, quantization, lora_config, use_accelerate",
    [
        # small
        (models["small"], "float16", None, None, False),
        (models["small"], "float16", None, None, True),
        (models["small"], "float16", None, lora_config, False),
        (models["small"], "float16", None, lora_config, True),
        (models["small"], "bfloat16", None, None, False),
        (models["small"], "bfloat16", None, None, True),
        (models["small"], "bfloat16", None, lora_config, False),
        (models["small"], "bfloat16", None, lora_config, True),
        # small - 4bit
        (models["small"], "float16", 4, lora_config, False),
        (models["small"], "float16", 4, lora_config, True),
        (models["small"], "bfloat16", 4, lora_config, False),
        (models["small"], "bfloat16", 4, lora_config, True),
        # small - 8 bit
        (models["small"], "float16", 8, lora_config, False),
        (models["small"], "float16", 8, lora_config, True),
        (models["small"], "float32", 8, lora_config, False),
        (models["small"], "float32", 8, lora_config, True),
        (models["small"], "bfloat16", 8, lora_config, False),
        (models["small"], "bfloat16", 8, lora_config, True),
        # large
        (models["large"], "bfloat16", 4, lora_config, False),
        (models["large"], "bfloat16", 4, lora_config, True),
        (models["large"], "float16", 4, lora_config, False),
        (models["large"], "float16", 4, lora_config, True),
        (models["large"], "float32", 4, lora_config, False),
        (models["large"], "float32", 4, lora_config, True),
        # # 4 bit
        (models["4-bit-mistral"], "float16", None, lora_config, False),
        (models["4-bit-mistral-code"], "float16", None, lora_config, False),
        (models["4-bit-mistral-32k"], "float16", None, lora_config, False),
        (models["4-bit-zephyr"], "float16", None, lora_config, False),
        (models["4-bit-falcon"], "float16", None, lora_config, False),
        (models["4-bit-llama-2"], "float16", None, lora_config, False),
        (models["4-bit-llama-2-32k"], "float16", None, lora_config, False),
        (models["4-bit-wizard"], "float16", None, lora_config, False),
        (models["4-bit-vicuna"], "float16", None, lora_config, False),
        (models["4-bit-wizard-vicuna"], "float16", None, lora_config, False),
        (models["4-bit-yarn-64k"], "float16", None, lora_config, False),
        (models["4-bit-yarn-128k"], "float16", None, lora_config, False),
        # # 8 bit
        (models["8-bit-mistral"], "float16", None, lora_config, False),
        (models["8-bit-zephyr"], "float16", None, lora_config, False),
    ],
)
def test_instruction_tuning_bolt_fine_tune(
    instruction_tuning_bolt, dataset_file, model_name, precision, quantization, lora_config, use_accelerate
):
    """
    Fine-tune one (model, precision, quantization, LoRA, accelerate) combination for a
    single epoch and check that the expected artifacts were written to
    ``<output_folder>/model``.

    Either the full-model files (``pytorch_model.bin`` / ``config.json``) or the adapter
    files (``adapter_model.bin`` / ``adapter_config.json``) must exist, plus
    ``training_args.bin``.

    Cleanup runs in ``finally`` so that the model references are dropped and the artifacts
    are removed whether the test passes or fails.
    """
    tmpdir, _ext = dataset_file
    instruction_tuning_bolt.input.input_folder = tmpdir
    model_dir = os.path.join(instruction_tuning_bolt.output.output_folder, "model")

    try:
        instruction_tuning_bolt.fine_tune(
            model_name=model_name,
            tokenizer_name=model_name,
            model_class="AutoModelForCausalLM",
            tokenizer_class="AutoTokenizer",
            num_train_epochs=1,
            per_device_batch_size=2,
            precision=precision,
            quantization=quantization,
            lora_config=lora_config,
            use_accelerate=use_accelerate,
            device_map="auto" if "GPTQ" in model_name else None,
            data_masked=False,
        )

        assert os.path.exists(os.path.join(model_dir, "pytorch_model.bin")) or os.path.exists(
            os.path.join(model_dir, "adapter_model.bin")
        )
        assert os.path.exists(os.path.join(model_dir, "config.json")) or os.path.exists(
            os.path.join(model_dir, "adapter_config.json")
        )
        assert os.path.exists(os.path.join(model_dir, "training_args.bin"))
    finally:
        _release_bolt_models(instruction_tuning_bolt)
        # NOTE(review): if the output folder is shared between parametrized runs,
        # leftover artifacts would let a later run pass the existence checks without
        # writing anything - confirm the fixture scope.
        _remove_fine_tune_artifacts(model_dir)


# Test for computing metrics
def test_instruction_tuning_bolt_compute_metrics(instruction_tuning_bolt, model):
    """Load a model, then check that ``compute_metrics`` reports a ``bleu`` entry."""
    _name, model_name = model
    tokenizer_name = model_name
    model_class = "AutoModelForCausalLM"
    tokenizer_class = "AutoTokenizer"

    instruction_tuning_bolt.load_models(
        model_name=model_name,
        tokenizer_name=tokenizer_name,
        model_class=model_class,
        tokenizer_class=tokenizer_class,
        device_map="cuda:0",
    )

    logits = np.array([[0.6, 0.4], [0.4, 0.6]])
    labels = np.array([0, 1])
    eval_pred = EvalPrediction(predictions=logits, label_ids=labels)
    # NOTE(review): this second, argument-less load_models() call follows a fully
    # parameterized one above; confirm it is intentional and that load_models()
    # accepts being called without arguments.
    instruction_tuning_bolt.load_models()
    metrics = instruction_tuning_bolt.compute_metrics(eval_pred)
    assert "bleu" in metrics
class LanguageModelAPI(TextAPI):
    r"""
    LanguageModelAPI is a class for interacting with pre-trained language models to generate text. It allows for
    customizable text generation via a CherryPy web server, handling requests and generating responses using
    a specified language model. This class is part of the GeniusRise ecosystem for facilitating NLP tasks.

    Attributes:
        model (Any): The loaded language model used for text generation.
        tokenizer (Any): The tokenizer corresponding to the language model, used for processing input text.

    Methods:
        complete(**kwargs: Any) -> Dict[str, Any]: Generates text based on provided prompts and model parameters.
        complete_vllm(**kwargs: Any) -> Dict[str, Any]: Generates a completion through the vLLM engine.
        complete_llama_cpp(**kwargs: Any) -> Dict[str, Any]: Generates a completion through llama.cpp.

    CLI Usage Example:
    ```bash
    genius LanguageModelAPI rise \
        batch \
            --input_folder ./input \
        batch \
            --output_folder ./output \
        none \
        --id mistralai/Mistral-7B-v0.1-lol \
        listen \
            --args \
                model_name="mistralai/Mistral-7B-v0.1" \
                model_class="AutoModelForCausalLM" \
                tokenizer_class="AutoTokenizer" \
                use_cuda=True \
                precision="float16" \
                quantization=0 \
                device_map="auto" \
                max_memory=None \
                torchscript=False \
                endpoint="*" \
                port=3000 \
                cors_domain="http://localhost:3000" \
                username="user" \
                password="password"
    ```

    or using VLLM:
    ```bash
    genius LanguageModelAPI rise \
        batch \
            --input_folder ./input \
        batch \
            --output_folder ./output \
        none \
        --id mistralai/Mistral-7B-v0.1 \
        listen \
            --args \
                model_name="mistralai/Mistral-7B-v0.1" \
                model_class="AutoModelForCausalLM" \
                tokenizer_class="AutoTokenizer" \
                use_cuda=True \
                precision="bfloat16" \
                use_vllm=True \
                vllm_enforce_eager=True \
                vllm_max_model_len=2048 \
                concurrent_queries=False \
                endpoint="*" \
                port=3000 \
                cors_domain="http://localhost:3000" \
                username="user" \
                password="password"
    ```

    or using llama.cpp:
    ```bash
    genius LanguageModelAPI rise \
        batch \
            --input_folder ./input \
        batch \
            --output_folder ./output \
        none \
        listen \
            --args \
                model_name="TheBloke/Mistral-7B-v0.1-GGUF" \
                model_class="AutoModelForCausalLM" \
                tokenizer_class="AutoTokenizer" \
                use_cuda=True \
                use_llama_cpp=True \
                llama_cpp_filename="mistral-7b-v0.1.Q4_K_M.gguf" \
                llama_cpp_n_gpu_layers=35 \
                llama_cpp_n_ctx=32768 \
                concurrent_queries=False \
                endpoint="*" \
                port=3000 \
                cors_domain="http://localhost:3000" \
                username="user" \
                password="password"
    ```
    """

    model: Any
    tokenizer: Any

    def __init__(
        self,
        input: BatchInput,
        output: BatchOutput,
        state: State,
        **kwargs: Any,
    ):
        """
        Initializes the LanguageModelAPI with configurations for the input, output, and state management,
        along with any additional model-specific parameters.

        Args:
            input (BatchInput): The configuration for input data handling.
            output (BatchOutput): The configuration for output data handling.
            state (State): The state management for the API.
            **kwargs (Any): Additional keyword arguments for model configuration and API setup.
                They are accepted but not forwarded to the base class.
        """
        super().__init__(input=input, output=output, state=state)
        self.log = setup_logger(self)
        # Created lazily by initialize_vllm() on the first complete_vllm request.
        self.vllm_server: Optional[OpenAIServingCompletion] = None
        self.event_loop: Any = None
        self.executor = ThreadPoolExecutor(max_workers=4)

    @cherrypy.expose
    @cherrypy.tools.json_in()
    @cherrypy.tools.json_out()
    @cherrypy.tools.allow(methods=["POST"])
    def complete(self, **kwargs: Any) -> Dict[str, Any]:
        r"""
        Handles POST requests to generate text based on a given prompt and model-specific parameters. This method
        is exposed as a web endpoint through CherryPy and returns a JSON response containing the original prompt,
        the generation arguments, and the generated text.

        The JSON body must contain `prompt`. `decoding_strategy` is optional and defaults to "generate".
        Every other key is forwarded unchanged to `self.generate` as a generation argument.

        Args:
            **kwargs (Any): Unused; the request parameters are read from the JSON request body.

        Returns:
            Dict[str, Any]: A dictionary with keys `prompt`, `args` (the forwarded generation arguments)
            and `completion`.

        Raises:
            cherrypy.HTTPError: 400 if the body is not a JSON object or has no `prompt`.

        Example CURL Request:
        ```bash
        /usr/bin/curl -X POST localhost:3000/api/v1/complete \
            -H "Content-Type: application/json" \
            -d '{
                "prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWrite a PRD for Oauth auth using keycloak\n\n### Response:",
                "decoding_strategy": "generate",
                "max_new_tokens": 1024,
                "do_sample": true
            }' | jq
        ```
        """
        data = cherrypy.request.json
        if not isinstance(data, dict) or data.get("prompt") is None:
            # The API contract marks `prompt` as required; answer with 400 instead of
            # passing None down to the model and failing with an opaque 500.
            raise cherrypy.HTTPError(400, "Request body must be a JSON object with a 'prompt' field.")

        prompt = data["prompt"]
        decoding_strategy = data.get("decoding_strategy", "generate")
        # Build a separate dict rather than deleting keys from the request payload.
        generation_args = {
            key: value for key, value in data.items() if key not in ("prompt", "decoding_strategy")
        }

        return {
            "prompt": prompt,
            "args": generation_args,
            "completion": self.generate(prompt=prompt, decoding_strategy=decoding_strategy, **generation_args),
        }

    def initialize_vllm(self) -> None:
        """
        Creates the vLLM OpenAI-compatible completion server around the loaded engine
        (`self.model`, served as `self.model_name`) and allocates an asyncio event loop
        stored on `self.event_loop`.
        """
        self.vllm_server = OpenAIServingCompletion(engine=self.model, served_model=self.model_name)
        self.event_loop = asyncio.new_event_loop()

    @cherrypy.expose
    @cherrypy.tools.json_in()
    @cherrypy.tools.json_out()
    @cherrypy.tools.allow(methods=["POST"])
    def complete_vllm(self, **kwargs: Any) -> Dict[str, Any]:
        r"""
        Handles POST requests to generate text completions using the vLLM engine. The JSON body is converted
        into a vLLM `CompletionRequest` and executed through `OpenAIServingCompletion.create_completion`.

        Parameters:
        - **kwargs (Any): Unused; the request is read from the JSON body, which may contain any of the following keys:
            - messages (Union[str, List[str]]): The prompt(s) to complete. This value is passed to the engine as
              `prompt`. For consistency with the other endpoints, `prompt` is accepted as a fallback when
              `messages` is absent.
            - temperature (float, optional): The sampling temperature. Defaults to 0.7.
            - top_p (float, optional): The nucleus sampling probability. Defaults to 1.0.
            - n (int, optional): The number of completions to generate. Defaults to 1.
            - max_tokens (int, optional): The maximum number of tokens to generate.
            - stop (Union[str, List[str]], optional): Stop sequence to end generation.
            - stream (bool, optional): Whether to stream the response. Defaults to False.
            - presence_penalty (float, optional): The presence penalty. Defaults to 0.0.
            - frequency_penalty (float, optional): The frequency penalty. Defaults to 0.0.
            - logit_bias (Dict[str, float], optional): Adjustments to the logits of specified tokens.
            - user (str, optional): An identifier for the user making the request.
            - (Additional model-specific parameters: logprobs, best_of, top_k, ignore_eos, use_beam_search,
              stop_token_ids, skip_special_tokens, spaces_between_special_tokens, echo, repetition_penalty,
              min_p, include_stop_str_in_output, length_penalty)

        Returns:
            Dict[str, Any]: The completion response dumped to a dictionary, or `{"error": ...}` when the
            engine returns nothing.

        Raises:
            Exception: Any error raised while generating is logged and re-raised.

        Example CURL Request:
        ```bash
        curl -v -X POST "http://localhost:3000/api/v1/complete_vllm" \
            -H "Content-Type: application/json" \
            -u "user:password" \
            -d '{
                "messages": ["Whats the weather like in London?"],
                "temperature": 0.7,
                "top_p": 1.0,
                "n": 1,
                "max_tokens": 50,
                "stream": false,
                "presence_penalty": 0.0,
                "frequency_penalty": 0.0,
                "logit_bias": {},
                "user": "example_user"
            }'
        ```
        This request asks the vLLM engine to generate a completion for the provided prompt, with specified generation settings.
        """
        # Extract data from the POST request
        data = cherrypy.request.json

        # Initialize the vLLM completion server if not already initialized
        if getattr(self, "vllm_server", None) is None:
            self.initialize_vllm()

        # Prepare the completion request
        completion_request = CompletionRequest(
            model=self.model_name,
            # `messages` is the historical key; fall back to `prompt` like the sibling endpoints.
            prompt=data.get("messages", data.get("prompt")),
            temperature=data.get("temperature", 0.7),
            top_p=data.get("top_p", 1.0),
            n=data.get("n", 1),
            max_tokens=data.get("max_tokens"),
            stop=data.get("stop", []),
            stream=data.get("stream", False),
            logprobs=data.get("logprobs", None),
            presence_penalty=data.get("presence_penalty", 0.0),
            frequency_penalty=data.get("frequency_penalty", 0.0),
            logit_bias=data.get("logit_bias", {}),
            user=data.get("user"),
            best_of=data.get("best_of"),
            top_k=data.get("top_k", -1),
            ignore_eos=data.get("ignore_eos", False),
            use_beam_search=data.get("use_beam_search", False),
            stop_token_ids=data.get("stop_token_ids", []),
            skip_special_tokens=data.get("skip_special_tokens", True),
            spaces_between_special_tokens=data.get("spaces_between_special_tokens", True),
            echo=data.get("echo", False),
            repetition_penalty=data.get("repetition_penalty", 1.0),
            min_p=data.get("min_p", 0.0),
            include_stop_str_in_output=data.get("include_stop_str_in_output", False),
            length_penalty=data.get("length_penalty", 1.0),
        )

        # Generate the completion using the vLLM engine
        try:

            class DummyObject:
                # Stand-in for the raw HTTP request object that create_completion
                # polls for client disconnects; this client never reports one.
                async def is_disconnected(self):
                    return False

            async def async_call():
                response = await self.vllm_server.create_completion(
                    request=completion_request, raw_request=DummyObject()
                )
                return response

            # asyncio.run drives the coroutine on a fresh event loop for this request.
            completion = asyncio.run(async_call())

            return completion.model_dump() if completion else {"error": "Failed to generate lm completion"}
        except Exception as e:
            self.log.exception("Error generating chat completion: %s", str(e))
            raise

    @cherrypy.expose
    @cherrypy.tools.json_in()
    @cherrypy.tools.json_out()
    @cherrypy.tools.allow(methods=["POST"])
    def complete_llama_cpp(self, **kwargs: Any) -> Dict[str, Any]:
        r"""
        Handles POST requests to generate text completions using the llama.cpp engine. The JSON body is mapped
        onto the keyword arguments of `llama_cpp.Llama.create_completion`.

        Args:
            prompt: The prompt to generate text from.
            suffix: A suffix to append to the generated text. If None, no suffix is appended.
            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
            temperature: The temperature to use for sampling.
            top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
            typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
            logprobs: The number of logprobs to return. If None, no logprobs are returned.
            echo: Whether to echo the prompt.
            stop: A list of strings to stop generation when encountered.
            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
            repeat_penalty: The penalty to apply to repeated tokens.
            top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
            stream: Whether to stream the results. (Currently not forwarded to llama.cpp.)
            seed: The seed to use for sampling.
            tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
            mirostat_mode: The mirostat sampling mode.
            mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
            mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
            model: The name to use for the model in the completion object.
            stopping_criteria: A list of stopping criteria to use.
            logits_processor: A list of logits processors to use.
            grammar: A grammar to use for constrained sampling.
            logit_bias: A logit bias to use.

        Returns:
            Dict[str, Any]: The completion response, or `{"error": <message>}` if generation failed.

        Raises:
            ValueError: If no `llama_cpp.Llama` model has been loaded.

        Example CURL Request:
        ```bash
        curl -X POST "http://localhost:3001/api/v1/complete_llama_cpp" \
            -H "Content-Type: application/json" \
            -d '{
                "prompt": "Whats the weather like in London?",
                "temperature": 0.7,
                "top_p": 0.95,
                "top_k": 40,
                "max_tokens": 50,
                "repeat_penalty": 1.1
            }'
        ```
        """
        # Ensure a llama.cpp model is loaded. getattr() is used because `model` is only
        # annotated on the class and may not have been assigned on this instance yet.
        model = getattr(self, "model", None)
        if not model or not isinstance(model, llama_cpp.Llama):
            raise ValueError(
                "llama.cpp model is not initialized. Please initialize the model before using complete_llama_cpp."
            )

        # Extract data from the POST request
        data = cherrypy.request.json

        # Convert the request data to the keyword arguments expected by llama.cpp's create_completion method
        try:
            response = model.create_completion(
                prompt=data.get("prompt"),
                suffix=data.get("suffix", None),
                max_tokens=data.get("max_tokens", 16),
                temperature=data.get("temperature", 0.8),
                top_p=data.get("top_p", 0.95),
                min_p=data.get("min_p", 0.05),
                typical_p=data.get("typical_p", 1.0),
                logprobs=data.get("logprobs", None),
                echo=data.get("echo", False),
                stop=data.get("stop", []),
                frequency_penalty=data.get("frequency_penalty", 0.0),
                presence_penalty=data.get("presence_penalty", 0.0),
                repeat_penalty=data.get("repeat_penalty", 1.1),
                top_k=data.get("top_k", 40),
                # stream=data.get("stream", False),
                seed=data.get("seed", None),
                tfs_z=data.get("tfs_z", 1.0),
                mirostat_mode=data.get("mirostat_mode", 0),
                mirostat_tau=data.get("mirostat_tau", 5.0),
                mirostat_eta=data.get("mirostat_eta", 0.1),
                model=data.get("model", None),
                stopping_criteria=data.get("stopping_criteria", None),
                logits_processor=data.get("logits_processor", None),
                grammar=data.get("grammar", None),
                logit_bias=data.get("logit_bias", None),
            )
        except Exception as e:
            self.log.exception("Error generating chat completion using llama.cpp: %s", str(e))
            return {"error": str(e)}

        # Return the completion; an iterator of chunks is materialized into a list
        return response if not isinstance(response, Iterator) else list(response)
+ required: + - prompt + responses: + 200: + description: Successful response with generated text + content: + application/json: + schema: + type: object + properties: + prompt: + type: string + description: The original prompt that was provided. + completion: + type: string + description: The generated text based on the prompt. + args: + type: object + description: The arguments that were used for the generation. + 400: + description: Bad request, e.g., missing required fields + 500: + description: Internal server error, e.g., model failed to generate text diff --git a/geniusrise_text/language_model/bulk.py b/geniusrise_text/language_model/bulk.py new file mode 100644 index 0000000..cc648e9 --- /dev/null +++ b/geniusrise_text/language_model/bulk.py @@ -0,0 +1,793 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import glob +import json +import os +import sqlite3 +import uuid +import xml.etree.ElementTree as ET +from typing import Any, Dict, List, Optional, Union +import llama_cpp +import pandas as pd +import yaml # type: ignore +from datasets import Dataset, load_from_disk +from geniusrise import BatchInput, BatchOutput, State +from pyarrow import feather +from pyarrow import parquet as pq +from vllm import LLM, SamplingParams +from transformers.tokenization_utils_base import PreTrainedTokenizerBase + +from geniusrise_text.base import TextBulk + + +class LanguageModelBulk(TextBulk): + r""" + LanguageModelBulk is designed for large-scale text generation using Hugging Face language models in a bulk processing + manner. It's particularly useful for tasks such as bulk content creation, summarization, or any other scenario where + large datasets need to be processed with a language model. + + Attributes: + model (Any): The loaded language model used for text generation. + tokenizer (Any): The tokenizer corresponding to the language model, used for processing input text. + + Args: + input (BatchInput): Configuration for the input data. + output (BatchOutput): Configuration for the output data. + state (State): State management for the API. + **kwargs (Any): Arbitrary keyword arguments for extended functionality. 
+ + CLI Usage Example: + ```bash + genius LanguageModelBulk rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/lm \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/lm \ + postgres \ + --postgres_host 127.0.0.1 \ + --postgres_port 5432 \ + --postgres_user postgres \ + --postgres_password postgres \ + --postgres_database geniusrise\ + --postgres_table state \ + --id mistralai/Mistral-7B-Instruct-v0.1-lol \ + complete \ + --args \ + model_name="mistralai/Mistral-7B-Instruct-v0.1" \ + model_class="AutoModelForCausalLM" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="bfloat16" \ + quantization=0 \ + device_map="auto" \ + max_memory=None \ + torchscript=False \ + decoding_strategy="generate" \ + generation_max_new_tokens=100 \ + generation_do_sample=true + ``` + + or using VLLM: + ```bash + genius LanguageModelBulk rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/lm \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/lm \ + none \ + --id mistralai/Mistral-7B-v0.1 \ + complete_vllm \ + --args \ + model_name="mistralai/Mistral-7B-v0.1" \ + use_cuda=True \ + precision="bfloat16" \ + quantization=0 \ + device_map="auto" \ + vllm_enforce_eager=True \ + generation_temperature=0.7 \ + generation_top_p=1.0 \ + generation_n=1 \ + generation_max_tokens=50 \ + generation_stream=false \ + generation_presence_penalty=0.0 \ + generation_frequency_penalty=0.0 + ``` + + or using llama.cpp: + ```bash + genius LanguageModelBulk rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/chat \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/chat \ + none \ + complete_llama_cpp \ + --args \ + model="TheBloke/Mistral-7B-v0.1-GGUF" \ + filename="mistral-7b-v0.1.Q4_K_M.gguf" \ + n_gpu_layers=35 \ + n_ctx=32768 \ + generation_temperature=0.7 \ + generation_top_p=0.95 \ + generation_top_k=40 \ + 
generation_max_tokens=50 \ + generation_repeat_penalty=0.1 + ``` + """ + + def __init__(self, input: BatchInput, output: BatchOutput, state: State, **kwargs) -> None: + """ + Initializes the LanguageModelBulk object with the specified configurations for input, output, and state. + + Args: + input (BatchInput): Configuration and data inputs for the bulk process. + output (BatchOutput): Configurations for output data handling. + state (State): State management for the bulk process. + **kwargs (Any): Additional keyword arguments for extended configurations. + """ + super().__init__(input, output, state, **kwargs) + + def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs) -> Optional[Dataset]: + r""" + Load a completion dataset from a directory. + + Args: + dataset_path (str): The path to the dataset directory. + max_length (int, optional): The maximum length for tokenization. Defaults to 512. + **kwargs: Additional keyword arguments to pass to the underlying dataset loading functions. + + Returns: + Dataset: The loaded dataset. + + Raises: + Exception: If there was an error loading the dataset. + + ## Supported Data Formats and Structures: + + ### Dataset files saved by Hugging Face datasets library + The directory should contain 'dataset_info.json' and other related files. + + ### JSONL + Each line is a JSON object representing an example. + ```json + {"text": "The text content"} + ``` + + ### CSV + Should contain 'text' column. + ```csv + text + "The text content" + ``` + + ### Parquet + Should contain 'text' column. + + ### JSON + An array of dictionaries with 'text' key. + ```json + [{"text": "The text content"}] + ``` + + ### XML + Each 'record' element should contain 'text' child element. + ```xml + + The text content + + ``` + + ### YAML + Each document should be a dictionary with 'text' key. + ```yaml + - text: "The text content" + ``` + + ### TSV + Should contain 'text' column separated by tabs. 
+ + ### Excel (.xls, .xlsx) + Should contain 'text' column. + + ### SQLite (.db) + Should contain a table with 'text' column. + + ### Feather + Should contain 'text' column. + """ + + self.max_length = max_length + + if hasattr(self, "tokenizer") and self.tokenizer is not None: + self.label_to_id = self.model.config.label2id if self.model and self.model.config.label2id else {} # type: ignore + + try: + self.log.info(f"Loading dataset from {dataset_path}") + if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): + # Load dataset saved by Hugging Face datasets library + return load_from_disk(dataset_path) + else: + data = [] + for filename in glob.glob(f"{dataset_path}/**/*", recursive=True): + filepath = os.path.join(dataset_path, filename) + if filename.endswith(".jsonl"): + with open(filepath, "r") as f: + for line in f: + example = json.loads(line) + data.append(example) + + elif filename.endswith(".csv"): + df = pd.read_csv(filepath) + data.extend(df.to_dict("records")) + + elif filename.endswith(".parquet"): + df = pq.read_table(filepath).to_pandas() + data.extend(df.to_dict("records")) + + elif filename.endswith(".json"): + with open(filepath, "r") as f: + json_data = json.load(f) + data.extend(json_data) + + elif filename.endswith(".xml"): + tree = ET.parse(filepath) + root = tree.getroot() + for record in root.findall("record"): + text = record.find("text").text # type: ignore + data.append({"text": text}) + + elif filename.endswith(".yaml") or filename.endswith(".yml"): + with open(filepath, "r") as f: + yaml_data = yaml.safe_load(f) + data.extend(yaml_data) + + elif filename.endswith(".tsv"): + df = pd.read_csv(filepath, sep="\t") + data.extend(df.to_dict("records")) + + elif filename.endswith((".xls", ".xlsx")): + df = pd.read_excel(filepath) + data.extend(df.to_dict("records")) + + elif filename.endswith(".db"): + conn = sqlite3.connect(filepath) + query = "SELECT text FROM dataset_table;" + df = pd.read_sql_query(query, conn) + 
data.extend(df.to_dict("records")) + + elif filename.endswith(".feather"): + df = feather.read_feather(filepath) + data.extend(df.to_dict("records")) + + if hasattr(self, "map_data") and self.map_data: + fn = eval(self.map_data) # type: ignore + data = [fn(d) for d in data] + else: + data = data + + return Dataset.from_pandas(pd.DataFrame(data)) + except Exception as e: + self.log.exception(f"Error occurred when loading dataset from {dataset_path}. Error: {e}") + raise + + def complete( + self, + model_name: str, + model_class: str = "AutoModelForCausalLM", + tokenizer_class: str = "AutoTokenizer", + use_cuda: bool = False, + precision: str = "float16", + quantization: int = 0, + device_map: str | Dict | None = "auto", + max_memory={0: "24GB"}, + torchscript: bool = False, + compile: bool = False, + awq_enabled: bool = False, + flash_attention: bool = False, + decoding_strategy: str = "generate", + notification_email: Optional[str] = None, + **kwargs: Any, + ) -> None: + """ + Performs text completion on the loaded dataset using the specified model and tokenizer. The method handles the + entire process, including model loading, text generation, and saving the results. + + Args: + model_name (str): The name of the language model to use for text completion. + model_class (str, optional): The class of the language model. Defaults to "AutoModelForCausalLM". + tokenizer_class (str, optional): The class of the tokenizer. Defaults to "AutoTokenizer". + use_cuda (bool, optional): Whether to use CUDA for model inference. Defaults to False. + precision (str, optional): Precision for model computation. Defaults to "float16". + quantization (int, optional): Level of quantization for optimizing model size and speed. Defaults to 0. + device_map (str | Dict | None, optional): Specific device to use for computation. Defaults to "auto". + max_memory (Dict, optional): Maximum memory configuration for devices. Defaults to {0: "24GB"}. 
+ torchscript (bool, optional): Whether to use a TorchScript-optimized version of the pre-trained language model. Defaults to False. + compile (bool, optional): Whether to compile the model before fine-tuning. Defaults to True. + awq_enabled (bool, optional): Whether to enable AWQ optimization. Defaults to False. + flash_attention (bool, optional): Whether to use flash attention optimization. Defaults to False. + decoding_strategy (str, optional): Strategy for decoding the completion. Defaults to "generate". + **kwargs: Additional keyword arguments for text generation. + """ + if ":" in model_name: + model_revision = model_name.split(":")[1] + tokenizer_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + tokenizer_name = model_name + else: + model_revision = None + tokenizer_revision = None + tokenizer_name = model_name + + self.model_name = model_name + self.tokenizer_name = tokenizer_name + self.model_revision = model_revision + self.tokenizer_revision = tokenizer_revision + self.model_class = model_class + self.tokenizer_class = tokenizer_class + self.use_cuda = use_cuda + self.precision = precision + self.quantization = quantization + self.device_map = device_map + self.max_memory = max_memory + self.torchscript = torchscript + self.awq_enabled = awq_enabled + self.flash_attention = flash_attention + self.notification_email = notification_email + self.compile = compile + + model_args = {k.replace("model_", ""): v for k, v in kwargs.items() if "model_" in k} + self.model_args = model_args + + generation_args = {k.replace("generation_", ""): v for k, v in kwargs.items() if "generation_" in k} + self.generation_args = generation_args + + self.model, self.tokenizer = self.load_models( + model_name=self.model_name, + tokenizer_name=self.tokenizer_name, + model_revision=self.model_revision, + tokenizer_revision=self.tokenizer_revision, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, + use_cuda=self.use_cuda, + 
precision=self.precision, + quantization=self.quantization, + device_map=self.device_map, + max_memory=self.max_memory, + torchscript=self.torchscript, + awq_enabled=self.awq_enabled, + flash_attention=self.flash_attention, + compile=self.compile, + **self.model_args, + ) + + dataset_path = self.input.input_folder + output_path = self.output.output_folder + + # Load dataset + _dataset = self.load_dataset(dataset_path) + if _dataset is None: + self.log.error("Failed to load dataset.") + return + dataset = _dataset["text"] + + prompts = [] + completions = [] + for _, prompt in enumerate(dataset): + completion = self.generate( + prompt=prompt, + decoding_strategy=decoding_strategy, + **generation_args, + ) + completions.append(completion) + prompts.append(prompt) + + self._save_completions(completions, prompts, output_path) + self.done() + + def complete_vllm( + self, + model_name: str, + use_cuda: bool = False, + precision: str = "float16", + quantization: int = 0, + device_map: str | Dict | None = "auto", + # VLLM params + vllm_tokenizer_mode: str = "auto", + vllm_download_dir: Optional[str] = None, + vllm_load_format: str = "auto", + vllm_seed: int = 42, + vllm_max_model_len: int = 1024, + vllm_enforce_eager: bool = False, + vllm_max_context_len_to_capture: int = 8192, + vllm_block_size: int = 16, + vllm_gpu_memory_utilization: float = 0.90, + vllm_swap_space: int = 4, + vllm_sliding_window: Optional[int] = None, + vllm_pipeline_parallel_size: int = 1, + vllm_tensor_parallel_size: int = 1, + vllm_worker_use_ray: bool = False, + vllm_max_parallel_loading_workers: Optional[int] = None, + vllm_disable_custom_all_reduce: bool = False, + vllm_max_num_batched_tokens: Optional[int] = None, + vllm_max_num_seqs: int = 64, + vllm_max_paddings: int = 512, + vllm_max_lora_rank: Optional[int] = None, + vllm_max_loras: Optional[int] = None, + vllm_max_cpu_loras: Optional[int] = None, + vllm_lora_extra_vocab_size: int = 0, + vllm_placement_group: Optional[dict] = None, + 
vllm_log_stats: bool = False, + # Generate params + notification_email: Optional[str] = None, + batch_size: int = 32, + **kwargs: Any, + ) -> None: + """ + Performs bulk text generation using the Versatile Language Learning Model (VLLM) with specified parameters + for fine-tuning model behavior, including quantization and parallel processing settings. This method is designed + to process large datasets efficiently by leveraging VLLM capabilities for generating high-quality text completions + based on provided prompts. + + Args: + model_name (str): The name or path of the VLLM model to use for text generation. + use_cuda (bool): Flag indicating whether to use CUDA for GPU acceleration. + precision (str): Precision of computations, can be "float16", "bfloat16", etc. + quantization (int): Level of quantization for model weights, 0 for none. + device_map (str | Dict | None): Specific device(s) to use for model inference. + vllm_tokenizer_mode (str): Mode of the tokenizer ("auto", "fast", or "slow"). + vllm_download_dir (Optional[str]): Directory to download and load the model and tokenizer. + vllm_load_format (str): Format to load the model, e.g., "auto", "pt". + vllm_seed (int): Seed for random number generation. + vllm_max_model_len (int): Maximum sequence length the model can handle. + vllm_enforce_eager (bool): Enforce eager execution instead of using optimization techniques. + vllm_max_context_len_to_capture (int): Maximum context length for CUDA graph capture. + vllm_block_size (int): Block size for caching mechanism. + vllm_gpu_memory_utilization (float): Fraction of GPU memory to use. + vllm_swap_space (int): Amount of swap space to use in GiB. + vllm_sliding_window (Optional[int]): Size of the sliding window for processing. + vllm_pipeline_parallel_size (int): Number of pipeline parallel groups. + vllm_tensor_parallel_size (int): Number of tensor parallel groups. + vllm_worker_use_ray (bool): Whether to use Ray for model workers. 
+ vllm_max_parallel_loading_workers (Optional[int]): Maximum number of workers for parallel loading. + vllm_disable_custom_all_reduce (bool): Disable custom all-reduce kernel and fall back to NCCL. + vllm_max_num_batched_tokens (Optional[int]): Maximum number of tokens to be processed in a single iteration. + vllm_max_num_seqs (int): Maximum number of sequences to be processed in a single iteration. + vllm_max_paddings (int): Maximum number of paddings to be added to a batch. + vllm_max_lora_rank (Optional[int]): Maximum rank for LoRA adjustments. + vllm_max_loras (Optional[int]): Maximum number of LoRA adjustments. + vllm_max_cpu_loras (Optional[int]): Maximum number of LoRA adjustments stored on CPU. + vllm_lora_extra_vocab_size (int): Additional vocabulary size for LoRA. + vllm_placement_group (Optional[dict]): Ray placement group for distributed execution. + vllm_log_stats (bool): Whether to log statistics during model operation. + notification_email (Optional[str]): Email to send notifications upon completion. + batch_size (int): Number of prompts to process in each batch for efficient memory usage. + **kwargs: Additional keyword arguments for generation settings like temperature, top_p, etc. + + This method automates the loading of large datasets, generation of text completions, and saving results, + facilitating efficient and scalable text generation tasks. 
+ """ + if ":" in model_name: + model_revision = model_name.split(":")[1] + tokenizer_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + tokenizer_name = model_name + else: + model_revision = None + tokenizer_revision = None + tokenizer_name = model_name + + self.model_name = model_name + self.tokenizer_name = tokenizer_name + self.model_revision = model_revision + self.tokenizer_revision = tokenizer_revision + self.use_cuda = use_cuda + self.precision = precision + self.quantization = quantization + self.device_map = device_map + self.notification_email = notification_email + + self.model: LLM = self.load_models_vllm( + model=model_name, + tokenizer=tokenizer_name, + tokenizer_mode=vllm_tokenizer_mode, + trust_remote_code=True, + download_dir=vllm_download_dir, + load_format=vllm_load_format, + dtype=self._get_torch_dtype(precision), + seed=vllm_seed, + revision=model_revision, + tokenizer_revision=tokenizer_revision, + max_model_len=vllm_max_model_len, + quantization=(None if quantization == 0 else f"{quantization}-bit"), + enforce_eager=vllm_enforce_eager, + max_context_len_to_capture=vllm_max_context_len_to_capture, + block_size=vllm_block_size, + gpu_memory_utilization=vllm_gpu_memory_utilization, + swap_space=vllm_swap_space, + cache_dtype="auto", + sliding_window=vllm_sliding_window, + pipeline_parallel_size=vllm_pipeline_parallel_size, + tensor_parallel_size=vllm_tensor_parallel_size, + worker_use_ray=vllm_worker_use_ray, + max_parallel_loading_workers=vllm_max_parallel_loading_workers, + disable_custom_all_reduce=vllm_disable_custom_all_reduce, + max_num_batched_tokens=vllm_max_num_batched_tokens, + max_num_seqs=vllm_max_num_seqs, + max_paddings=vllm_max_paddings, + device="cuda" if use_cuda else "cpu", + max_lora_rank=vllm_max_lora_rank, + max_loras=vllm_max_loras, + max_cpu_loras=vllm_max_cpu_loras, + lora_dtype=self._get_torch_dtype(precision), + lora_extra_vocab_size=vllm_lora_extra_vocab_size, + 
placement_group=vllm_placement_group, # type: ignore + log_stats=vllm_log_stats, + batched_inference=True, + ) + + generation_args = {k.replace("generation_", ""): v for k, v in kwargs.items() if "generation_" in k} + self.generation_args = generation_args + + dataset_path = self.input.input_folder + output_path = self.output.output_folder + + # Load dataset + _dataset = self.load_dataset(dataset_path) + if _dataset is None: + self.log.error("Failed to load dataset.") + return + dataset = _dataset["text"] + + for i in range(0, len(dataset), batch_size): + batch = dataset[i : i + batch_size] + + outputs = self.model.generate( + prompts=batch, + sampling_params=SamplingParams( + n=generation_args.get("n", 1), + best_of=generation_args.get("best_of", None), + presence_penalty=generation_args.get("presence_penalty", 0.0), + frequency_penalty=generation_args.get("frequency_penalty", 0.0), + repetition_penalty=generation_args.get("repetition_penalty", 1.0), + temperature=generation_args.get("temperature", 1.0), + top_p=generation_args.get("top_p", 1.0), + top_k=generation_args.get("top_k", -1), + min_p=generation_args.get("min_p", 0.0), + use_beam_search=generation_args.get("use_beam_search", False), + length_penalty=generation_args.get("length_penalty", 1.0), + early_stopping=generation_args.get("early_stopping", False), + stop=generation_args.get("stop", None), + stop_token_ids=generation_args.get("stop_token_ids", None), + include_stop_str_in_output=generation_args.get("include_stop_str_in_output", False), + ignore_eos=generation_args.get("ignore_eos", False), + max_tokens=generation_args.get("max_tokens", 16), + logprobs=generation_args.get("logprobs", None), + prompt_logprobs=generation_args.get("prompt_logprobs", None), + skip_special_tokens=generation_args.get("skip_special_tokens", True), + spaces_between_special_tokens=generation_args.get("spaces_between_special_tokens", True), + logits_processors=generation_args.get("logits_processors", None), + ), + ) + 
completions = [" ".join(t.text for t in o.outputs) for o in outputs] + self._save_completions(completions, batch, output_path) + self.done() + + def complete_llama_cpp( + self, + model: str, + filename: Optional[str] = None, + local_dir: Optional[Union[str, os.PathLike[str]]] = None, + n_gpu_layers: int = 0, + split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER, + main_gpu: int = 0, + tensor_split: Optional[List[float]] = None, + vocab_only: bool = False, + use_mmap: bool = True, + use_mlock: bool = False, + kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None, + seed: int = llama_cpp.LLAMA_DEFAULT_SEED, + n_ctx: int = 512, + n_batch: int = 512, + n_threads: Optional[int] = None, + n_threads_batch: Optional[int] = None, + rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED, + rope_freq_base: float = 0.0, + rope_freq_scale: float = 0.0, + yarn_ext_factor: float = -1.0, + yarn_attn_factor: float = 1.0, + yarn_beta_fast: float = 32.0, + yarn_beta_slow: float = 1.0, + yarn_orig_ctx: int = 0, + mul_mat_q: bool = True, + logits_all: bool = False, + embedding: bool = False, + offload_kqv: bool = True, + last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, + lora_scale: float = 1.0, + lora_path: Optional[str] = None, + numa: Union[bool, int] = False, + chat_format: Optional[str] = None, + chat_handler: Optional[llama_cpp.llama_chat_format.LlamaChatCompletionHandler] = None, + draft_model: Optional[llama_cpp.LlamaDraftModel] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + verbose: bool = True, + notification_email: Optional[str] = None, + **kwargs, + ) -> None: + """ + Performs bulk text generation using the LLaMA model with llama.cpp backend. This method handles the entire + process, including model loading, prompt processing, text generation, and saving the results. + + Args: + model: Path or identifier for the LLaMA model. + filename: Optional filename or glob pattern to match the model file. 
+ local_dir: Local directory to save the model files. + n_gpu_layers: Number of layers to offload to GPU. + split_mode: Split mode for distributing model across GPUs. + main_gpu: Main GPU index. + tensor_split: Configuration for tensor splitting across GPUs. + vocab_only: Whether to load only the vocabulary. + use_mmap: Use memory-mapped files for model loading. + use_mlock: Lock model data in RAM to prevent swapping. + kv_overrides: Key-value pairs for overriding model config. + seed: Seed for random number generation. + n_ctx: Number of context tokens for generation. + n_batch: Batch size for processing. + n_threads: Number of threads for generation. + n_threads_batch: Number of threads for batch processing. + rope_scaling_type: Scaling type for RoPE. + rope_freq_base: Base frequency for RoPE. + rope_freq_scale: Frequency scaling for RoPE. + yarn_ext_factor: YaRN extrapolation factor. + yarn_attn_factor: YaRN attention factor. + yarn_beta_fast: YaRN beta fast parameter. + yarn_beta_slow: YaRN beta slow parameter. + yarn_orig_ctx: Original context size for YaRN. + mul_mat_q: Multiply matrices for queries. + logits_all: Return logits for all tokens. + embedding: Enable embedding mode. + offload_kqv: Offload K, Q, V matrices to GPU. + last_n_tokens_size: Size for the last_n_tokens buffer. + lora_base: Base model path for LoRA. + lora_scale: Scale factor for LoRA adjustments. + lora_path: Path for LoRA adjustments. + numa: NUMA configuration. + chat_format: Chat format configuration. + chat_handler: Handler for chat completions. + draft_model: Draft model for speculative decoding. + tokenizer: Custom tokenizer instance. + verbose: Enable verbose logging. + notification_email (Optional[str]): Email to send notifications upon completion. + **kwargs: Additional arguments for model loading and text generation. 
+ """ + self.notification_email = notification_email + + # Loading the LLaMA model with llama.cpp + llama_model, custom_tokenizer = self.load_models_llama_cpp( + model=model, + filename=filename, + local_dir=local_dir, + n_gpu_layers=n_gpu_layers, + split_mode=split_mode, + main_gpu=main_gpu, + tensor_split=tensor_split, + vocab_only=vocab_only, + use_mmap=use_mmap, + use_mlock=use_mlock, + kv_overrides=kv_overrides, + seed=seed, + n_ctx=n_ctx, + n_batch=n_batch, + n_threads=n_threads, + n_threads_batch=n_threads_batch, + rope_scaling_type=rope_scaling_type, + rope_freq_base=rope_freq_base, + rope_freq_scale=rope_freq_scale, + yarn_ext_factor=yarn_ext_factor, + yarn_attn_factor=yarn_attn_factor, + yarn_beta_fast=yarn_beta_fast, + yarn_beta_slow=yarn_beta_slow, + yarn_orig_ctx=yarn_orig_ctx, + mul_mat_q=mul_mat_q, + logits_all=logits_all, + embedding=embedding, + offload_kqv=offload_kqv, + last_n_tokens_size=last_n_tokens_size, + lora_base=lora_base, + lora_scale=lora_scale, + lora_path=lora_path, + numa=numa, + chat_format=chat_format, + chat_handler=chat_handler, + draft_model=draft_model, + tokenizer=tokenizer, + verbose=verbose, + **kwargs, + ) + + generation_args = {k.replace("generation_", ""): v for k, v in kwargs.items() if "generation_" in k} + self.generation_args = generation_args + + dataset_path = self.input.input_folder + output_path = self.output.output_folder + + # Load dataset + _dataset = self.load_dataset(dataset_path) + if _dataset is None: + self.log.error("Failed to load dataset.") + return + dataset = _dataset["instruction"] + + for i in range(0, len(dataset), n_batch): + batch = dataset[i : i + n_batch] + completions = [] + + for prompt in batch: + # Generate completion for each prompt using llama_model + completion = llama_model.create_completion( + prompt=prompt, + suffix=generation_args.get("suffix", None), + max_tokens=generation_args.get("max_tokens", 16), + temperature=generation_args.get("temperature", 0.8), + 
top_p=generation_args.get("top_p", 0.95), + min_p=generation_args.get("min_p", 0.05), + typical_p=generation_args.get("typical_p", 1.0), + logprobs=generation_args.get("logprobs", None), + echo=generation_args.get("echo", False), + stop=generation_args.get("stop", []), + frequency_penalty=generation_args.get("frequency_penalty", 0.0), + presence_penalty=generation_args.get("presence_penalty", 0.0), + repeat_penalty=generation_args.get("repeat_penalty", 1.1), + top_k=generation_args.get("top_k", 40), + seed=generation_args.get("seed", None), + tfs_z=generation_args.get("tfs_z", 1.0), + mirostat_mode=generation_args.get("mirostat_mode", 0), + mirostat_tau=generation_args.get("mirostat_tau", 5.0), + mirostat_eta=generation_args.get("mirostat_eta", 0.1), + model=generation_args.get("model", None), + stopping_criteria=generation_args.get("stopping_criteria", None), + logits_processor=generation_args.get("logits_processor", None), + grammar=generation_args.get("grammar", None), + logit_bias=generation_args.get("logit_bias", None), + ) + completions.append(completion) + + self._save_completions([c["choices"][0]["text"] for c in completions], batch, output_path) # type: ignore + self.done() + + def _save_completions(self, completions: List[str], prompts: List[str], output_path: str) -> None: + """ + Saves the generated completions to the specified output path. + + Args: + completions (List[str]): The list of generated text completions. + prompts (List[str]): The list of prompts corresponding to the completions. + output_path (str): The path to save the completion results. + + This method is called internally by the complete method to persist the completion results. 
+ """ + data_to_save = [ + {"prompt": prompt, "completion": completion} for prompt, completion in zip(prompts, completions) + ] + with open(os.path.join(output_path, f"completions-{str(uuid.uuid4())}.json"), "w") as f: + json.dump(data_to_save, f) diff --git a/huggingface/language_model.py b/geniusrise_text/language_model/fine_tune.py similarity index 73% rename from huggingface/language_model.py rename to geniusrise_text/language_model/fine_tune.py index 105f7ff..c81862b 100644 --- a/huggingface/language_model.py +++ b/geniusrise_text/language_model/fine_tune.py @@ -1,19 +1,17 @@ # 🧠 Geniusrise # Copyright (C) 2023 geniusrise.ai # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. +# http://www.apache.org/licenses/LICENSE-2.0 # -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json import os @@ -26,14 +24,14 @@ import pyarrow.feather as feather import pyarrow.parquet as pq import yaml # type: ignore -from datasets import Dataset, load_from_disk, load_metric +from datasets import Dataset, load_from_disk, load_metric, load_dataset from nltk.translate.bleu_score import corpus_bleu from transformers import DataCollatorForLanguageModeling, EvalPrediction -from .base import HuggingFaceFineTuner +from geniusrise_text.base import TextFineTuner -class HuggingFaceLanguageModelingFineTuner(HuggingFaceFineTuner): +class LanguageModelFineTuner(TextFineTuner): r""" A bolt for fine-tuning Hugging Face models on language modeling tasks. @@ -42,67 +40,35 @@ class HuggingFaceLanguageModelingFineTuner(HuggingFaceFineTuner): output (OutputConfig): The output data. state (State): The state manager. - ## Using geniusrise to invoke via command line - ```bash - genius HuggingFaceLanguageModelingFineTuner rise \ - batch \ - --input_bucket my_bucket \ - --input_folder my_folder \ - streaming \ - --output_kafka_topic kafka_test \ - --output_kafka_cluster_connection_string localhost:9094 \ - postgres \ - --postgres_host 127.0.0.1 \ - --postgres_port 5432 \ - --postgres_user postgres \ - --postgres_password postgres \ - --postgres_database geniusrise \ - --postgres_table state \ - load_dataset \ - --args dataset_path=my_dataset_path masked=True max_length=512 - ``` + CLI Usage: - ## Using geniusrise to invoke via YAML file - ```yaml - version: "1" - bolts: - my_fine_tuner: - name: "HuggingFaceLanguageModelingFineTuner" - method: "load_dataset" - args: - dataset_path: "my_dataset_path" - masked: True - max_length: 512 - input: - type: "batch" - args: - bucket: "my_bucket" - folder: "my_folder" - output: - type: "streaming" - args: - output_topic: "kafka_test" - kafka_servers: "localhost:9094" - state: - type: "postgres" - args: - postgres_host: "127.0.0.1" - postgres_port: 5432 - postgres_user: "postgres" - postgres_password: "postgres" - postgres_database: 
"geniusrise" - postgres_table: "state" - deploy: - type: "k8s" - args: - name: "my_fine_tuner" - namespace: "default" - image: "my_fine_tuner_image" - replicas: 1 + ```bash + genius LanguageModelFineTuner rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/lm \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/lm \ + postgres \ + --postgres_host 127.0.0.1 \ + --postgres_port 5432 \ + --postgres_user postgres \ + --postgres_password postgres \ + --postgres_database geniusrise\ + --postgres_table state \ + --id mistralai/Mistral-7B-Instruct-v0.1-lol \ + fine_tune \ + --args \ + model_name=my_model \ + tokenizer_name=my_tokenizer \ + num_train_epochs=3 \ + per_device_train_batch_size=8 \ + data_max_length=512 ``` """ - def load_dataset(self, dataset_path, masked: bool = True, max_length: int = 512, **kwargs): + def load_dataset(self, dataset_path, masked: bool = False, max_length: int = 512, **kwargs): r""" Load a language modeling dataset from a directory. 
@@ -175,7 +141,9 @@ def load_dataset(self, dataset_path, masked: bool = True, max_length: int = 512, self.max_length = max_length try: - if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): + if self.use_huggingface_dataset: + dataset = load_dataset(self.huggingface_dataset) + elif os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): # Load dataset saved by Hugging Face datasets library dataset = load_from_disk(dataset_path) else: @@ -233,9 +201,15 @@ def load_dataset(self, dataset_path, masked: bool = True, max_length: int = 512, dataset = Dataset.from_pandas(pd.DataFrame(data)) + if hasattr(self, "map_data") and self.map_data: + fn = eval(self.map_data) # type: ignore + dataset = dataset.map(fn) + else: + dataset = dataset + # Preprocess the dataset if self.tokenizer and self.tokenizer.pad_token_id is None: - self.tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + self.tokenizer.pad_token = self.tokenizer.eos_token tokenized_dataset = dataset.map( self.prepare_train_features, batched=True, @@ -261,13 +235,10 @@ def prepare_train_features(self, examples): tokenized_inputs = self.tokenizer( examples["text"], truncation=True, - padding=False, + padding="max_length", max_length=self.max_length, ) - # Include the labels in the returned dictionary - tokenized_inputs["labels"] = tokenized_inputs["input_ids"] - return tokenized_inputs def data_collator(self, examples): diff --git a/geniusrise_text/language_model/tests/__init__.py b/geniusrise_text/language_model/tests/__init__.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ b/geniusrise_text/language_model/tests/__init__.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/language_model/tests/test_bulk.py b/geniusrise_text/language_model/tests/test_bulk.py new file mode 100644 index 0000000..b29e3d2 --- /dev/null +++ b/geniusrise_text/language_model/tests/test_bulk.py @@ -0,0 +1,280 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import glob +import itertools +import json +import os +import sqlite3 +import tempfile +import xml.etree.ElementTree as ET + +import pandas as pd +import pytest +import torch +import yaml # type: ignore +from datasets import Dataset +from geniusrise.core import BatchInput, BatchOutput, InMemoryState +from pyarrow import feather +from pyarrow import parquet as pq + +from geniusrise_text.language_model.bulk import LanguageModelBulk + + +@pytest.fixture( + params=[ + # model_name, model_class, tokenizer_class, use_cuda, precision, quantization, device_map, max_memory, torchscript + # fmt: off + ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", 0, None, None, False), + ("gpt2", "AutoModelForCausalLM", "AutoTokenizer", False, "float32", 0, None, None, False), + ("bigscience/bloom-560m", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, None, None, False), + ("meta-llama/Llama-2-7b-hf", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, None, None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, "cuda:0", None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, "cuda:0", None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 8, "cuda:0", None, False), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 0, "auto", None, True), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 4, "auto", None, True), + ("mistralai/Mistral-7B-v0.1", "AutoModelForCausalLM", "AutoTokenizer", True, "bfloat16", 8, "auto", None, True), + ("TheBloke/Mistral-7B-v0.1-GPTQ:gptq-4bit-32g-actorder_True", "AutoModelForCausalLM", "AutoTokenizer", True, "float16", None, "cuda:0", None, False), + # fmt: on + ] +) +def model_config(request): + return request.param + + +def create_dataset_in_format(directory, ext): + 
os.makedirs(directory, exist_ok=True) + data = [{"text": f"text_{i}"} for i in range(10)] + df = pd.DataFrame(data) + + if ext == "huggingface": + dataset = Dataset.from_pandas(df) + dataset.save_to_disk(directory) + elif ext == "csv": + df.to_csv(os.path.join(directory, "data.csv"), index=False) + elif ext == "jsonl": + with open(os.path.join(directory, "data.jsonl"), "w") as f: + for item in data: + f.write(json.dumps(item) + "\n") + elif ext == "parquet": + pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) + elif ext == "json": + with open(os.path.join(directory, "data.json"), "w") as f: + json.dump(data, f) + elif ext == "xml": + root = ET.Element("root") + for item in data: + record = ET.SubElement(root, "record") + ET.SubElement(record, "text").text = item["text"] + tree = ET.ElementTree(root) + tree.write(os.path.join(directory, "data.xml")) + elif ext == "yaml": + with open(os.path.join(directory, "data.yaml"), "w") as f: + yaml.dump(data, f) + elif ext == "tsv": + df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") + elif ext == "xlsx": + df.to_excel(os.path.join(directory, "data.xlsx"), index=False) + elif ext == "db": + conn = sqlite3.connect(os.path.join(directory, "data.db")) + df.to_sql("dataset_table", conn, if_exists="replace", index=False) + conn.close() + elif ext == "feather": + feather.write_feather(df, os.path.join(directory, "data.feather")) + + +# Fixtures for each file type +@pytest.fixture( + params=[ + "huggingface", + "csv", + "jsonl", + "parquet", + "json", + "xml", + "yaml", + "tsv", + "xlsx", + "db", + "feather", + ] +) +def dataset_file(request, tmpdir): + ext = request.param + create_dataset_in_format(tmpdir, ext) + return tmpdir, ext + + +# Fixtures to initialize LanguageModelBulk instance +@pytest.fixture +def lm_bolt(): + input_dir = tempfile.mkdtemp() + output_dir = tempfile.mkdtemp() + + input = BatchInput(input_dir, "geniusrise-test", "api_input") + output = 
BatchOutput(output_dir, "geniusrise-test", "api_output") + state = InMemoryState() + + lm_bolt = LanguageModelBulk( + input=input, + output=output, + state=state, + ) + yield lm_bolt + + +# Define strategies and associated parameters +strategies = { + "generate": {}, + "greedy_search": {}, + "beam_search": {"num_beams": 4}, + "beam_sample": {"num_beams": 4, "temperature": 0.7, "top_k": 20}, + "group_beam_search": {"num_beams": 4, "num_beam_groups": 2}, +} + +# Define other parameters +length_params = { + "max_length": [20, 30], + "min_length": [0, 10], + "early_stopping": [False, True], +} +gen_strategy_params = { + "do_sample": [False, True], +} +logit_params = { + "temperature": [1.0, 0.7], + "top_k": [50, 20], + "top_p": [1.0, 0.9], + "repetition_penalty": [1.0, 1.5], + "length_penalty": [1.0, 0.5], + "no_repeat_ngram_size": [0, 2], +} +# Merge all the parameters into one dictionary for itertools.product +all_params = {**length_params, **gen_strategy_params, **logit_params} + + +@pytest.mark.parametrize("strategy", list(strategies.keys())) +def test_generate_strategies(lm_bolt, model_config, dataset_file, strategy): + ( + model_name, + model_class, + tokenizer_class, + use_cuda, + precision, + quantization, + device_map, + max_memory, + torchscript, + ) = model_config + + tmpdir, ext = dataset_file + lm_bolt.input.input_folder = tmpdir + + if ":" in model_name: + _model_name = model_name + model_revision = _model_name.split(":")[1] + model_name = _model_name.split(":")[0] + tokenizer_revision = _model_name.split(":")[1] + tokenizer_name = _model_name.split(":")[0] + else: + model_revision = None + tokenizer_revision = None + + # Strategy-specific params + strategy_params = strategies[strategy] + + # All possible combinations for the current strategy + param_combinations = [ + {**dict(zip(all_params.keys(), values)), **strategy_params} + for values in itertools.product(*all_params.values()) + ] + + for param_set in param_combinations: + param_set = 
{f"generation_{k}": v for k, v in param_set.items()} + + generated_text = lm_bolt.complete( + model_name=model_name, + model_revision=model_revision, + tokenizer_name=model_name, + tokenizer_revision=tokenizer_revision, + model_class=model_class, + tokenizer_class=tokenizer_class, + use_cuda=use_cuda, + precision=precision, + quantization=quantization, + device_map=device_map, + max_memory=max_memory, + torchscript=torchscript, + decoding_strategy=strategy, + **param_set, # Unpack params into function arguments + ) + files = glob.glob(f"{lm_bolt.output.output_folder}/completions-*.json") + assert len(files) > 0 + break + + # Cleanup + del lm_bolt.model + del lm_bolt.tokenizer + torch.cuda.empty_cache() + + +# HuggingFaceH4/zephyr-7b-beta +# openchat/openchat_3.5 +# mistralai/Mistral-7B-v0.1 +# amazon/MistralLite +# codellama/CodeLlama-7b-hf +# codellama/CodeLlama-7b-Python-hf +# codellama/CodeLlama-13b-hf +# codellama/CodeLlama-13b-Python-hf +# codellama/CodeLlama-34b-hf +# codellama/CodeLlama-34b-Python-hf +# meta-llama/Llama-2-7b-hf +# meta-llama/Llama-2-13b-hf +# meta-llama/Llama-2-70b-hf +# TheBloke/Mistral-7B-v0.1-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/Mistral-7B-v0.1-GPTQ:gptq-8bit-32g-actorder_True +# TheBloke/openchat_3.5-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/openchat_3.5-GPTQ:gptq-8bit-32g-actorder_True +# TheBloke/zephyr-7b-beta-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/zephyr-7b-beta-GPTQ:gptq-8bit-32g-actorder_True +# TheBloke/CodeLlama-7b-hf-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/CodeLlama-7b-hf-GPTQ:gptq-8bit-32g-actorder_True +# TheBloke/CodeLlama-7b-Python-hf-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/CodeLlama-7b-Python-hf-GPTQ:gptq-8bit-32g-actorder_True +# TheBloke/CodeLlama-13b-hf-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/CodeLlama-13b-hf-GPTQ:gptq-8bit-32g-actorder_True +# TheBloke/CodeLlama-13b-Python-hf-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/CodeLlama-13b-Python-hf-GPTQ:gptq-8bit-32g-actorder_True +# 
TheBloke/CodeLlama-34b-hf-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/CodeLlama-34b-hf-GPTQ:gptq-8bit-32g-actorder_True +# TheBloke/CodeLlama-34b-Python-hf-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/CodeLlama-34b-Python-hf-GPTQ:gptq-8bit-32g-actorder_True +# TheBloke/Llama-2-7b-hf-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/Llama-2-7b-hf-GPTQ:gptq-8bit-32g-actorder_True +# TheBloke/Llama-2-13b-hf-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/Llama-2-13b-hf-GPTQ:gptq-8bit-32g-actorder_True +# TheBloke/Llama-2-70b-hf-GPTQ:gptq-4bit-32g-actorder_True +# TheBloke/Llama-2-70b-hf-GPTQ:gptq-8bit-32g-actorder_True +# WizardLM/WizardCoder-Python-7B-V1.0 +# WizardLM/WizardCoder-Python-13B-V1.0 +# WizardLM/WizardCoder-Python-34B-V1.0 +# WizardLMTeam/WizardLM-13B-V1.0 +# WizardLM/WizardLM-70B-V1.0 +# TheBloke/WizardCoder-Python-7B-V1.0-GPTQ +# TheBloke/WizardCoder-Python-13B-V1.0-GPTQ +# TheBloke/WizardCoder-Python-34B-V1.0-GPTQ +# TheBloke/WizardLM-13B-V1.0-GPTQ +# TheBloke/WizardLM-70B-V1.0-GPTQ diff --git a/geniusrise_text/language_model/tests/test_fine_tune.py b/geniusrise_text/language_model/tests/test_fine_tune.py new file mode 100644 index 0000000..f9ee84b --- /dev/null +++ b/geniusrise_text/language_model/tests/test_fine_tune.py @@ -0,0 +1,311 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json
import os
import sqlite3
import tempfile
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import pytest
import torch
import yaml  # type: ignore
from datasets import Dataset
from geniusrise.core import BatchInput, BatchOutput, EvalPrediction, InMemoryState
from pyarrow import feather
from pyarrow import parquet as pq

from geniusrise_text.language_model import LanguageModelFineTuner

lora_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}

# Files the fine-tune test may find in "<output_folder>/model"; each one is
# removed independently during cleanup.
_MODEL_ARTIFACTS = (
    "pytorch_model.bin",
    "adapter_model.bin",
    "config.json",
    "adapter_config.json",
    "training_args.bin",
)


def create_dataset_in_format(directory, ext):
    """Write a 10-row synthetic ``text`` dataset into *directory* in format *ext*.

    Args:
        directory: Target directory; created if it does not exist.
        ext: One of "huggingface", "csv", "jsonl", "parquet", "json", "xml",
            "yaml", "tsv", "xlsx", "db" or "feather". Any other value only
            creates the directory.
    """
    os.makedirs(directory, exist_ok=True)
    data = [{"text": f"text_{i}"} for i in range(10)]
    df = pd.DataFrame(data)

    if ext == "huggingface":
        Dataset.from_pandas(df).save_to_disk(directory)
    elif ext == "csv":
        df.to_csv(os.path.join(directory, "data.csv"), index=False)
    elif ext == "jsonl":
        with open(os.path.join(directory, "data.jsonl"), "w") as f:
            for item in data:
                f.write(json.dumps(item) + "\n")
    elif ext == "parquet":
        pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet"))
    elif ext == "json":
        with open(os.path.join(directory, "data.json"), "w") as f:
            json.dump(data, f)
    elif ext == "xml":
        root = ET.Element("root")
        for item in data:
            record = ET.SubElement(root, "record")
            ET.SubElement(record, "text").text = item["text"]
        ET.ElementTree(root).write(os.path.join(directory, "data.xml"))
    elif ext == "yaml":
        with open(os.path.join(directory, "data.yaml"), "w") as f:
            yaml.dump(data, f)
    elif ext == "tsv":
        df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t")
    elif ext == "xlsx":
        df.to_excel(os.path.join(directory, "data.xlsx"), index=False)
    elif ext == "db":
        conn = sqlite3.connect(os.path.join(directory, "data.db"))
        try:
            df.to_sql("dataset_table", conn, if_exists="replace", index=False)
        finally:
            # Close the handle even when the write fails.
            conn.close()
    elif ext == "feather":
        feather.write_feather(df, os.path.join(directory, "data.feather"))


MODELS_TO_TEST = {
    # fmt: off
    "small": "bigscience/bloom-560m",
    # fmt: on
}


# Fixture for models
@pytest.fixture(params=MODELS_TO_TEST.items())
def model(request):
    """Yield ``(label, hub_id)`` pairs from ``MODELS_TO_TEST``."""
    return request.param


# Fixtures for each file type
@pytest.fixture(
    params=[
        "huggingface",
        "csv",
        "json",
        "jsonl",
        "parquet",
        "xml",
        "yaml",
        "tsv",
        "xlsx",
        "db",
        "feather",
    ]
)
def dataset_file(request, tmpdir):
    """Create ``train`` and ``test`` datasets in the requested format under *tmpdir*."""
    ext = request.param
    for split in ("train", "test"):
        create_dataset_in_format(os.path.join(str(tmpdir), split), ext)
    return tmpdir, ext


@pytest.fixture
def language_modeling_bolt():
    """Build a ``LanguageModelFineTuner`` wired to fresh temporary folders."""
    input_dir = tempfile.mkdtemp()
    output_dir = tempfile.mkdtemp()
    return LanguageModelFineTuner(
        input=BatchInput(input_dir, "geniusrise-test", "test-🤗-input"),
        output=BatchOutput(output_dir, "geniusrise-test", "test-🤗-output"),
        state=InMemoryState(),
    )


def _load_default_models(bolt, model_name):
    """Load *model_name* as a causal LM (tokenizer of the same name) on ``cuda:0``."""
    bolt.load_models(
        model_name=model_name,
        tokenizer_name=model_name,
        model_class="AutoModelForCausalLM",
        tokenizer_class="AutoTokenizer",
        device_map="cuda:0",
    )


def _release_bolt(bolt):
    """Drop model/tokenizer references (if any were set) and free cached CUDA memory."""
    for attr in ("model", "tokenizer"):
        # fine_tune may fail before these attributes exist; a blind `del`
        # would raise AttributeError and hide the original failure.
        if hasattr(bolt, attr):
            delattr(bolt, attr)
    torch.cuda.empty_cache()


def test_language_modeling_bolt_init(model, language_modeling_bolt):
    _name, model_name = model
    _load_default_models(language_modeling_bolt, model_name)

    assert language_modeling_bolt.model is not None
    assert language_modeling_bolt.tokenizer is not None
    assert language_modeling_bolt.input is not None
    assert language_modeling_bolt.output is not None
    assert language_modeling_bolt.state is not None


def test_load_dataset_all_formats(language_modeling_bolt, dataset_file, model):
    _name, model_name = model
    _load_default_models(language_modeling_bolt, model_name)

    tmpdir, _ext = dataset_file
    dataset = language_modeling_bolt.load_dataset(os.path.join(tmpdir, "train"))
    assert dataset is not None
    assert len(dataset) == 10


# Models to test
models = {
    # fmt: off
    "small": "bigscience/bloom-560m",
    "medium": "meta-llama/Llama-2-7b-hf",
    "large": "mistralai/Mistral-7B-v0.1",
    "4-bit": "TheBloke/Mistral-7B-v0.1-GPTQ:gptq-4bit-32g-actorder_True",
    "8-bit": "TheBloke/OpenHermes-2-Mistral-7B-GPTQ:gptq-8bit-128g-actorder_True",
    "4-bit-mistral": "TheBloke/Mistral-7B-v0.1-GPTQ:gptq-4bit-32g-actorder_True",
    "4-bit-openhermes": "TheBloke/OpenHermes-2-Mistral-7B-GPTQ:gptq-8bit-128g-actorder_True",
    "4-bit-zephyr": "TheBloke/zephyr-7B-beta-GPTQ:gptq-4bit-32g-actorder_True",
    "4-bit-wizard": "TheBloke/WizardLM-7B-uncensored-GPTQ",
    "4-bit-wizard-vicuna": "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ",
    "4-bit-mistral-code": "TheBloke/Mistral-7B-Code-16K-qlora-GPTQ",
    # fmt: on
}


def _fine_tune_cases():
    """Return the (model, precision, quantization, lora, accelerate) matrix.

    The order is significant only for stable pytest ids; it matches the
    hand-written list this helper replaces.
    """
    on_off = (False, True)
    cases = []
    # small, unquantized: every precision with and without LoRA
    cases += [
        (models["small"], precision, None, lora, accelerate)
        for precision in ("float16", "float32", "bfloat16")
        for lora in (None, lora_config)
        for accelerate in on_off
    ]
    # small, 4-bit then 8-bit, always with LoRA
    cases += [
        (models["small"], precision, bits, lora_config, accelerate)
        for bits in (4, 8)
        for precision in ("float16", "float32", "bfloat16")
        for accelerate in on_off
    ]
    # large, 4-bit with LoRA
    cases += [
        (models["large"], precision, 4, lora_config, accelerate)
        for precision in ("bfloat16", "float16", "float32")
        for accelerate in on_off
    ]
    # pre-quantized GPTQ checkpoint (the "8-bit" GPTQ variants are disabled)
    cases.append((models["4-bit"], "float16", None, lora_config, False))
    return cases


@pytest.mark.parametrize(
    "model_name, precision, quantization, lora_config, use_accelerate",
    _fine_tune_cases(),
)
def test_language_modeling_bolt_fine_tune(
    language_modeling_bolt, dataset_file, model_name, precision, quantization, lora_config, use_accelerate
):
    tmpdir, _ext = dataset_file
    language_modeling_bolt.input.input_folder = tmpdir

    def artifact(name):
        return os.path.join(language_modeling_bolt.output.output_folder, "model", name)

    try:
        language_modeling_bolt.fine_tune(
            model_name=model_name,
            tokenizer_name=model_name,
            model_class="AutoModelForCausalLM",
            tokenizer_class="AutoTokenizer",
            num_train_epochs=1,
            per_device_batch_size=2,
            precision=precision,
            quantization=quantization,
            lora_config=lora_config,
            use_accelerate=use_accelerate,
            device_map="auto" if "GPTQ" in model_name else None,
            data_masked=False,
        )

        # Full fine-tunes write model files, LoRA runs write adapter files.
        assert os.path.exists(artifact("pytorch_model.bin")) or os.path.exists(artifact("adapter_model.bin"))
        assert os.path.exists(artifact("config.json")) or os.path.exists(artifact("adapter_config.json"))
        assert os.path.exists(artifact("training_args.bin"))
    finally:
        _release_bolt(language_modeling_bolt)
        # Remove each artefact on its own: a missing file must not stop the
        # remaining ones from being deleted.
        for name in _MODEL_ARTIFACTS:
            path = artifact(name)
            if os.path.isfile(path):
                os.remove(path)


def test_language_modeling_bolt_compute_metrics(language_modeling_bolt, model):
    _name, model_name = model
    _load_default_models(language_modeling_bolt, model_name)

    logits = np.array([[0.6, 0.4], [0.4, 0.6]])
    labels = np.array([[0, 1], [1, 0]])
    eval_pred = EvalPrediction(predictions=logits, label_ids=labels)
    metrics = language_modeling_bolt.compute_metrics(eval_pred)
    assert "bleu" in metrics
    assert "sacrebleu" in metrics
b/geniusrise_text/ner/__init__.py new file mode 100644 index 0000000..beb06b5 --- /dev/null +++ b/geniusrise_text/ner/__init__.py @@ -0,0 +1,18 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import NamedEntityRecognitionAPI +from .bulk import NamedEntityRecognitionBulk +from .fine_tune import NamedEntityRecognitionFineTuner diff --git a/geniusrise_text/ner/api.py b/geniusrise_text/ner/api.py new file mode 100644 index 0000000..631b043 --- /dev/null +++ b/geniusrise_text/ner/api.py @@ -0,0 +1,176 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Any, Dict

import cherrypy
import torch
from geniusrise import BatchInput, BatchOutput, State
from geniusrise.logging import setup_logger
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

from geniusrise_text.base import TextAPI


def _require_text(payload: Any) -> Any:
    """Return ``payload["text"]`` or raise HTTP 400 when it is absent or empty."""
    text = payload.get("text") if isinstance(payload, dict) else None
    if not text:
        raise cherrypy.HTTPError(400, "Request body must be a JSON object with a non-empty 'text' field.")
    return text


def _to_builtin(value: Any) -> Any:
    """Recursively turn array-library scalars into plain Python values.

    NOTE(review): guards against pipeline scores arriving as NumPy scalars,
    which the JSON encoder rejects; values without ``.item()`` pass through.
    """
    if isinstance(value, dict):
        return {key: _to_builtin(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_builtin(item) for item in value]
    if hasattr(value, "item"):
        return value.item()
    return value


class NamedEntityRecognitionAPI(TextAPI):
    r"""
    NamedEntityRecognitionAPI serves a Named Entity Recognition (NER) model using the Hugging Face transformers library.
    It is designed to recognize and classify named entities in text into predefined categories such as the names of persons,
    organizations, locations, expressions of times, quantities, monetary values, percentages, etc.

    Attributes:
        model (Any): The loaded NER model, typically a Hugging Face transformer model specialized for token classification.
        tokenizer (Any): The tokenizer for preprocessing text compatible with the loaded model.

    Example CLI Usage:
    ```bash
    genius NamedEntityRecognitionAPI rise \
        batch \
            --input_folder ./input \
        batch \
            --output_folder ./output \
        none \
        --id dslim/bert-large-NER-lol \
        listen \
            --args \
                model_name="dslim/bert-large-NER" \
                model_class="AutoModelForTokenClassification" \
                tokenizer_class="AutoTokenizer" \
                use_cuda=True \
                precision="float" \
                quantization=0 \
                device_map="cuda:0" \
                max_memory=None \
                torchscript=False \
                endpoint="0.0.0.0" \
                port=3000 \
                cors_domain="http://localhost:3000" \
                username="user" \
                password="password"
    ```
    """

    def __init__(
        self,
        input: BatchInput,
        output: BatchOutput,
        state: State,
        **kwargs: Any,
    ) -> None:
        """
        Initializes the NamedEntityRecognitionAPI class.

        Args:
            input (BatchInput): The input data.
            output (BatchOutput): The output data.
            state (State): The state data.
            **kwargs: Additional keyword arguments (accepted but not forwarded).
        """
        super().__init__(input=input, output=output, state=state)
        self.log = setup_logger(self)
        # Built lazily by initialize_pipeline() on the first /ner_pipeline hit.
        self.hf_pipeline = None

    @cherrypy.expose
    @cherrypy.tools.json_in()
    @cherrypy.tools.json_out()
    @cherrypy.tools.allow(methods=["POST"])
    def recognize_entities(self, **kwargs: Any) -> Dict[str, Any]:
        r"""
        Endpoint for recognizing named entities in the input text using the loaded NER model.

        The JSON body must contain 'text'; every other key is forwarded to the
        model's forward call as a keyword argument.

        Args:
            **kwargs (Any): Unused; the payload is read from the request JSON.

        Returns:
            Dict[str, Any]: A dictionary containing the original input text and a list of
            ``{"token", "class"}`` entries, one per input token.

        Raises:
            cherrypy.HTTPError: 400 when 'text' is missing or empty.

        Example CURL Requests:
        ```bash
        curl -X POST localhost:3000/api/v1/recognize_entities \
            -H "Content-Type: application/json" \
            -d '{"text": "John Doe works at OpenAI in San Francisco."}' | jq
        ```

        ```bash
        curl -X POST localhost:3000/api/v1/recognize_entities \
            -H "Content-Type: application/json" \
            -d '{"text": "Alice is going to visit the Eiffel Tower in Paris next summer."}' | jq
        ```
        """
        data = cherrypy.request.json
        text = _require_text(data)
        # Copy instead of deleting from the request payload in place.
        model_args = {key: value for key, value in data.items() if key != "text"}

        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        if next(self.model.parameters()).is_cuda:
            inputs = {k: v.cuda() for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs, **model_args)
            # squeeze(0) drops only the batch axis; a bare squeeze() would also
            # collapse a one-token sequence into a scalar and break the zip below.
            predictions = outputs.logits.argmax(dim=-1).squeeze(0).tolist()

        token_ids = inputs["input_ids"].squeeze(0).tolist()
        entities = [
            {"token": self.tokenizer.convert_ids_to_tokens(token_id), "class": self.model.config.id2label[label_id]}
            for label_id, token_id in zip(predictions, token_ids)
        ]

        return {"input": text, "entities": entities}

    def initialize_pipeline(self):
        """
        Lazy initialization of the NER Hugging Face pipeline.

        Loads a model and tokenizer from ``self.model_name`` on first use and
        caches the resulting pipeline on the instance.
        """
        if self.hf_pipeline is None:
            model = AutoModelForTokenClassification.from_pretrained(self.model_name)
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.hf_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

    @cherrypy.expose
    @cherrypy.tools.json_in()
    @cherrypy.tools.json_out()
    @cherrypy.tools.allow(methods=["POST"])
    def ner_pipeline(self, **kwargs: Any) -> Dict[str, Any]:
        """
        Recognizes named entities in the input text using the Hugging Face pipeline.

        Args:
            **kwargs (Any): Unused; the payload is read from the request JSON.

        Returns:
            Dict[str, Any]: A dictionary containing the original input text and the
            pipeline's entity list, converted to JSON-serialisable values.

        Raises:
            cherrypy.HTTPError: 400 when 'text' is missing or empty.

        Example CURL Request for NER:
        ```bash
        curl -X POST localhost:3000/api/v1/ner_pipeline \
            -H "Content-Type: application/json" \
            -d '{"text": "John Doe works at OpenAI in San Francisco."}' | jq
        ```
        """
        text = _require_text(cherrypy.request.json)

        self.initialize_pipeline()  # Initialize the pipeline on first API hit

        result = self.hf_pipeline(text)  # type: ignore

        return {"input": text, "entities": _to_builtin(result)}
+ version: "1.0" +servers: + - url: http://localhost:3000/api/v1 + description: API server for NER model text processing +paths: + /recognize_entities: + post: + summary: Recognize named entities in the provided text + operationId: recognizeEntities + tags: + - Named Entity Recognition + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: Input text for entity recognition. + required: + - text + responses: + 200: + description: Successful response with recognized entities + content: + application/json: + schema: + type: object + properties: + input: + type: string + description: The original text that was provided for entity recognition. + entities: + type: array + items: + type: object + properties: + token: + type: string + description: Tokenized part of the text. + class: + type: string + description: Entity class identified for the token. + 400: + description: Bad request, e.g., missing required fields + 500: + description: Internal server error, e.g., model failure + /ner_pipeline: + post: + summary: Recognize named entities using Hugging Face NER pipeline + operationId: nerPipeline + tags: + - Named Entity Recognition + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: Input text for entity recognition using the Hugging Face pipeline. + required: + - text + responses: + 200: + description: Successful response with recognized entities + content: + application/json: + schema: + type: object + properties: + input: + type: string + description: The original text that was provided for entity recognition. + entities: + type: array + items: + type: object + properties: + entity: + type: string + description: Entity class identified. + score: + type: number + format: float + description: Confidence score for the entity classification. 
# 🧠 Geniusrise
# Copyright (C) 2023 geniusrise.ai
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import os
import sqlite3
import uuid
import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Optional

import pandas as pd
import yaml  # type: ignore
from datasets import Dataset, load_from_disk
from geniusrise import BatchInput, BatchOutput, State
from pyarrow import feather
from pyarrow import parquet as pq

from geniusrise_text.base import TextBulk


class NamedEntityRecognitionBulk(TextBulk):
    r"""
    NamedEntityRecognitionBulk is a class designed for bulk processing of Named Entity Recognition (NER) tasks.
    It leverages state-of-the-art NER models from Hugging Face's transformers library to identify and classify entities
    such as person names, locations, organizations, and other types of entities from a large corpus of text.

    This class provides functionalities to load large datasets, configure NER models, and perform entity recognition
    in bulk, making it suitable for processing large volumes of text data efficiently.

    Attributes:
        model (Any): The NER model loaded for entity recognition tasks.
        tokenizer (Any): The tokenizer used for text pre-processing in alignment with the model.

    Example CLI Usage:
    ```bash
    genius NamedEntityRecognitionBulk rise \
        batch \
            --input_folder ./input \
        batch \
            --output_folder ./output \
        none \
        --id dslim/bert-large-NER-lol \
        recognize_entities \
            --args \
                model_name="dslim/bert-large-NER" \
                model_class="AutoModelForTokenClassification" \
                tokenizer_class="AutoTokenizer" \
                use_cuda=True \
                precision="float" \
                quantization=0 \
                device_map="cuda:0" \
                max_memory=None \
                torchscript=False
    ```
    """

    def __init__(self, input: BatchInput, output: BatchOutput, state: State, **kwargs: Any) -> None:
        """
        Initializes the NamedEntityRecognitionBulk class with specified input, output, and state configurations.

        Args:
            input (BatchInput): The input data configuration.
            output (BatchOutput): The output data configuration.
            state (State): The state management for the API.
            **kwargs (Any): Additional keyword arguments forwarded to the base class.
        """
        super().__init__(input, output, state, **kwargs)

    def load_dataset(self, dataset_path: str, **kwargs: Any) -> Optional[Dataset]:
        r"""
        Loads a dataset from the specified directory path.

        A directory holding ``dataset_info.json`` is loaded with ``load_from_disk``.
        Otherwise every file found recursively under ``dataset_path`` is parsed
        according to its extension and all records are merged into one dataset.

        Args:
            dataset_path (str): The path to the dataset directory.
            **kwargs: Additional keyword arguments (currently unused).

        Returns:
            Optional[Dataset]: The loaded dataset.

        Raises:
            Exception: Any loading error is logged and re-raised.

        ## Supported Data Formats and Structures:

        `recognize_entities` consumes the `text` column, so every format must provide it.

        ### Hugging Face Dataset
        Dataset files saved by the Hugging Face datasets library.

        ### JSONL
        Each line is a JSON object representing an example.
        ```json
        {"text": "some text"}
        ```

        ### CSV
        Should contain a 'text' column.
        ```csv
        text
        "some text"
        ```

        ### Parquet
        Should contain a 'text' column.

        ### JSON
        An array of dictionaries with 'text' keys.
        ```json
        [{"text": "some text"}]
        ```

        ### XML
        Each 'record' element should contain a 'text' child element; its content is split on whitespace.
        ```xml
        <record>
            <text>token1 token2 ...</text>
        </record>
        ```

        ### YAML
        A list of dictionaries with 'text' keys.
        ```yaml
        - text: "some text"
        ```

        ### TSV
        Should contain a 'text' column, separated by tabs.

        ### Excel (.xls, .xlsx)
        Should contain a 'text' column.

        ### SQLite (.db)
        Should contain a table named 'dataset_table' with a 'text' column.

        ### Feather
        Should contain a 'text' column.
        """
        self.log.info(f"Loading dataset from {dataset_path}")
        try:
            if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")):
                return load_from_disk(dataset_path)

            data: List[Any] = []
            # glob already returns paths prefixed with dataset_path; joining
            # dataset_path again would duplicate it for relative folders.
            for filepath in glob.glob(f"{dataset_path}/**/*", recursive=True):
                if filepath.endswith(".jsonl"):
                    with open(filepath, "r") as f:
                        data.extend(json.loads(line) for line in f)
                elif filepath.endswith(".csv"):
                    data.extend(pd.read_csv(filepath).to_dict("records"))
                elif filepath.endswith(".parquet"):
                    data.extend(pq.read_table(filepath).to_pandas().to_dict("records"))
                elif filepath.endswith(".json"):
                    with open(filepath, "r") as f:
                        data.extend(json.load(f))
                elif filepath.endswith(".xml"):
                    root = ET.parse(filepath).getroot()
                    for record in root.findall("record"):
                        text = record.find("text").text.split()  # type: ignore
                        data.append({"text": text})
                elif filepath.endswith((".yaml", ".yml")):
                    with open(filepath, "r") as f:
                        data.extend(yaml.safe_load(f))
                elif filepath.endswith(".tsv"):
                    data.extend(pd.read_csv(filepath, sep="\t").to_dict("records"))
                elif filepath.endswith((".xls", ".xlsx")):
                    data.extend(pd.read_excel(filepath).to_dict("records"))
                elif filepath.endswith(".db"):
                    conn = sqlite3.connect(filepath)
                    try:
                        df = pd.read_sql_query("SELECT text FROM dataset_table;", conn)
                    finally:
                        # Always release the database handle.
                        conn.close()
                    data.extend(df.to_dict("records"))
                elif filepath.endswith(".feather"):
                    data.extend(feather.read_feather(filepath).to_dict("records"))

            return Dataset.from_pandas(pd.DataFrame(data))
        except Exception as e:
            self.log.exception(f"Error occurred when loading dataset from {dataset_path}. Error: {e}")
            raise

    def recognize_entities(
        self,
        model_name: str,
        max_length: int = 512,
        model_class: str = "AutoModelForSeq2SeqLM",
        tokenizer_class: str = "AutoTokenizer",
        use_cuda: bool = False,
        precision: str = "float16",
        quantization: int = 0,
        device_map: str | Dict | None = "auto",
        max_memory={0: "24GB"},
        torchscript: bool = False,
        compile: bool = False,
        awq_enabled: bool = False,
        flash_attention: bool = False,
        batch_size: int = 32,
        notification_email: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Performs bulk named entity recognition on the dataset in the input folder.

        The `text` column is processed in batches; each batch's predictions are
        written to the output folder as a JSONL file.

        Args:
            model_name (str): The name or path of the NER model; an optional revision may follow after ":".
            max_length (int): The maximum sequence length (currently not forwarded to the tokenizer).
            model_class (str): The class of the model, defaults to "AutoModelForSeq2SeqLM". NER checkpoints
                normally need "AutoModelForTokenClassification" passed explicitly.
            tokenizer_class (str): The class of the tokenizer, defaults to "AutoTokenizer".
            use_cuda (bool): Whether to use CUDA for model inference, defaults to False.
            precision (str): Model computation precision, defaults to "float16".
            quantization (int): Level of quantization for model size and speed optimization, defaults to 0.
            device_map (str | Dict | None): Specific device configuration for computation, defaults to "auto".
            max_memory (Dict): Maximum memory configuration for the devices (read-only; never mutated here).
            torchscript (bool, optional): Whether to use a TorchScript-optimized version of the model. Defaults to False.
            compile (bool, optional): Whether to compile the model. Defaults to False.
            awq_enabled (bool): Whether to enable AWQ optimization, defaults to False.
            flash_attention (bool): Whether to use flash attention optimization, defaults to False.
            batch_size (int): Number of documents to process simultaneously, defaults to 32.
            notification_email (Optional[str]): Stored on the instance; not used in this method.
            **kwargs: Keys prefixed with "model_" are passed to `load_models`, keys prefixed with
                "generation_" are passed to the model call (prefix stripped in both cases).

        Returns:
            None: The method processes the dataset and saves the predictions without returning any value.
        """
        import torch  # local import: this module does not import torch at file level

        # "name:revision" selects a specific revision for both model and tokenizer.
        parts = model_name.split(":")
        model_name = parts[0]
        revision = parts[1] if len(parts) > 1 else None

        self.model_name = model_name
        self.tokenizer_name = model_name
        self.model_revision = revision
        self.tokenizer_revision = revision
        self.model_class = model_class
        self.tokenizer_class = tokenizer_class
        self.use_cuda = use_cuda
        self.precision = precision
        self.quantization = quantization
        self.device_map = device_map
        self.max_memory = max_memory
        self.torchscript = torchscript
        self.awq_enabled = awq_enabled
        self.flash_attention = flash_attention
        self.batch_size = batch_size
        self.notification_email = notification_email
        self.compile = compile

        # Match on the prefix only, so keys that merely contain the marker
        # somewhere in the middle are neither selected nor mangled.
        self.model_args = {k.removeprefix("model_"): v for k, v in kwargs.items() if k.startswith("model_")}
        generation_args = {
            k.removeprefix("generation_"): v for k, v in kwargs.items() if k.startswith("generation_")
        }
        self.generation_args = generation_args

        self.model, self.tokenizer = self.load_models(
            model_name=self.model_name,
            tokenizer_name=self.tokenizer_name,
            model_revision=self.model_revision,
            tokenizer_revision=self.tokenizer_revision,
            model_class=self.model_class,
            tokenizer_class=self.tokenizer_class,
            use_cuda=self.use_cuda,
            precision=self.precision,
            quantization=self.quantization,
            device_map=self.device_map,
            max_memory=self.max_memory,
            torchscript=self.torchscript,
            awq_enabled=self.awq_enabled,
            flash_attention=self.flash_attention,
            compile=self.compile,
            **self.model_args,
        )

        dataset_path = self.input.input_folder
        output_path = self.output.output_folder

        # Load dataset
        dataset = self.load_dataset(dataset_path)
        if dataset is None:
            self.log.error("Failed to load dataset.")
            return

        # An empty dataset has no "text" column to read; nothing to predict.
        texts = dataset["text"] if len(dataset) > 0 else []

        on_cuda = next(self.model.parameters()).is_cuda

        # Process data in batches
        for start in range(0, len(texts), batch_size):
            batch = texts[start : start + batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

            if on_cuda:
                inputs = {k: v.cuda() for k, v in inputs.items()}

            # Inference only: skip autograd bookkeeping.
            with torch.no_grad():
                outputs = self.model(**inputs, **generation_args)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits
            # Keep the batch axis: squeezing would flatten a single-item batch
            # and break the per-example pairing in _save_predictions.
            predictions = logits.argmax(dim=-1).tolist()

            self._save_predictions(inputs["input_ids"].tolist(), predictions, batch, output_path, start)
        self.done()

    def _save_predictions(
        self, inputs: list, predictions: list, input_batch: List[str], output_path: str, batch_idx: int
    ) -> None:
        """
        Saves the NER predictions to the specified output path.

        Args:
            inputs (list): Token ids per example, aligned with `predictions`.
            predictions (list): Predicted label ids per example, one id per token position.
            input_batch (List[str]): The input text batch.
            output_path (str): The path to save the prediction results.
            batch_idx (int): The index of the current batch, used for naming the output files.

        Returns:
            None: The method saves the predictions to files and does not return any value.
        """
        id2label = self.model.config.id2label

        # Convert label ids to label strings, keeping position and token text.
        label_predictions = [
            [
                {
                    "label": id2label[label_id],
                    "position": position,
                    "token": self.tokenizer.convert_ids_to_tokens(token_ids[position]),
                }
                for position, label_id in enumerate(label_ids)
            ]
            for label_ids, token_ids in zip(predictions, inputs)
        ]

        # One JSON object per input text; the uuid keeps file names unique.
        out_file = os.path.join(output_path, f"predictions-{batch_idx}-{str(uuid.uuid4())}.jsonl")
        with open(out_file, "w") as f:
            for input_text, labels in zip(input_batch, label_predictions):
                f.write(json.dumps({"input": input_text, "labels": labels}) + "\n")

        self.log.info(f"Saved predictions for batch {batch_idx} to {output_path}")
+# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import ast import json @@ -27,14 +26,14 @@ import pyarrow.parquet as pq import torch import yaml # type: ignore -from datasets import Dataset, DatasetDict, load_from_disk +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score from transformers import DataCollatorForTokenClassification, EvalPrediction -from .base import HuggingFaceFineTuner +from geniusrise_text.base import TextFineTuner -class HuggingFaceNamedEntityRecognitionFineTuner(HuggingFaceFineTuner): +class NamedEntityRecognitionFineTuner(TextFineTuner): r""" A bolt for fine-tuning Hugging Face models on named entity recognition tasks. @@ -43,62 +42,22 @@ class HuggingFaceNamedEntityRecognitionFineTuner(HuggingFaceFineTuner): output (OutputConfig): The output data. state (State): The state manager. 
- ## Using geniusrise to invoke via command line - ```bash - genius HuggingFaceNamedEntityRecognitionFineTuner rise \ - batch \ - --input_bucket my_bucket \ - --input_folder my_folder \ - batch \ - --output_bucket my_output_bucket \ - --output_folder my_output_folder \ - postgres \ - --postgres_host 127.0.0.1 \ - --postgres_port 5432 \ - --postgres_user postgres \ - --postgres_password postgres \ - --postgres_database geniusrise \ - --postgres_table state \ - load_dataset \ - --args dataset_path=my_dataset_path label_list="['O', 'B-PER', 'I-PER']" - ``` + CLI Usage: - ## Using geniusrise to invoke via YAML file - ```yaml - version: "1" - bolts: - my_ner_bolt: - name: "HuggingFaceNamedEntityRecognitionFineTuner" - method: "load_dataset" - args: - dataset_path: "my_dataset_path" - label_list: ["O", "B-PER", "I-PER"] - input: - type: "batch" - args: - bucket: "my_bucket" - folder: "my_folder" - output: - type: "batch" - args: - bucket: "my_output_bucket" - folder: "my_output_folder" - state: - type: "postgres" - args: - postgres_host: "127.0.0.1" - postgres_port: 5432 - postgres_user: "postgres" - postgres_password: "postgres" - postgres_database: "geniusrise" - postgres_table: "state" - deploy: - type: "k8s" - args: - name: "my_ner_bolt" - namespace: "default" - image: "my_ner_bolt_image" - replicas: 1 + ```bash + genius NamedEntityRecognitionFineTuner rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id dslim/bert-large-NER-lol \ + fine_tune \ + --args \ + model_name=my_model \ + tokenizer_name=my_tokenizer \ + num_train_epochs=3 \ + per_device_train_batch_size=8 ``` """ @@ -174,12 +133,11 @@ def load_dataset( Should contain 'tokens' and 'ner_tags' columns. 
""" - self.label_list = label_list - self.label_to_id = {label: i for i, label in enumerate(self.label_list)} - try: self.log.info(f"Loading dataset from {dataset_path}") - if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): + if self.use_huggingface_dataset: + dataset = load_dataset(self.huggingface_dataset) + elif os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): dataset = load_from_disk(dataset_path) else: data = [] @@ -229,12 +187,57 @@ def load_dataset( dataset = Dataset.from_pandas(pd.DataFrame(data)) + if hasattr(self, "map_data") and self.map_data: + fn = eval(self.map_data) # type: ignore + dataset = dataset.map(fn) + else: + dataset = dataset + # Preprocess the dataset - tokenized_dataset = dataset.map( - self.prepare_train_features, - batched=True, - remove_columns=dataset.column_names, - ) + self.label_list = label_list if label_list else list({y for x in dataset["train"]["ner_tags"] for y in x}) + self.label_to_id = {label: i for i, label in enumerate(self.label_list)} + if self.model: + config = self.model.config + config.label2id = self.label_to_id + config.id2label = {i: label for label, i in self.label_to_id.items()} + config.num_labels = len(self.label_to_id.keys()) + self.config = config + + self.load_models( + model_name=self.model_name, + tokenizer_name=self.tokenizer_name, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, + device_map=self.device_map, + precision=self.precision, + quantization=self.quantization, + lora_config=self.lora_config, + use_accelerate=self.use_accelerate, + accelerate_no_split_module_classes=self.accelerate_no_split_module_classes, + **self.model_kwargs, + ) + + if self.tokenizer_name.lower() == "local": # type: ignore + self.log.info(f"Loading local tokenizer : {self.tokenizer_class} : {self.input.get()}") + self.tokenizer = getattr(__import__("transformers"), str(self.tokenizer_class)).from_pretrained( + os.path.join(self.input.get(), "model"), # relative component: a leading "/" would discard the input folder +
add_prefix_space=True, + ) + else: + self.log.info( + f"Loading tokenizer from huggingface hub: {self.tokenizer_class} : {self.tokenizer_name}" + ) + self.tokenizer = getattr(__import__("transformers"), str(self.tokenizer_class)).from_pretrained( + self.tokenizer_name, + revision=self.tokenizer_revision, + add_prefix_space=True, + ) + + if self.tokenizer and not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + self.model.config.pad_token_id = self.tokenizer.eos_token_id + + tokenized_dataset = dataset.map(self.prepare_train_features, batched=True) return tokenized_dataset except Exception as e: @@ -274,7 +277,7 @@ def prepare_train_features( label_ids = [] for word_idx in word_ids: if word_idx is not None: - print(f"label[word_idx]: {labels[word_idx]}", self.label_to_id) # Debug print + self.log.debug(f"labels[word_idx]: {labels[word_idx]}") # type: ignore label_ids.append(self.label_to_id[labels[word_idx]]) # type: ignore else: label_ids.append(-100) diff --git a/geniusrise_text/ner/tests/__init__.py b/geniusrise_text/ner/tests/__init__.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ b/geniusrise_text/ner/tests/__init__.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/geniusrise_text/ner/tests/test_fine_tune.py b/geniusrise_text/ner/tests/test_fine_tune.py new file mode 100644 index 0000000..272b2c6 --- /dev/null +++ b/geniusrise_text/ner/tests/test_fine_tune.py @@ -0,0 +1,273 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import sqlite3 +import tempfile +import xml.etree.ElementTree as ET + +import numpy as np +import pandas as pd +import pytest +import torch +import yaml # type: ignore +from datasets import Dataset +from geniusrise.core import BatchInput, BatchOutput, InMemoryState +from pyarrow import feather +from pyarrow import parquet as pq +from transformers import EvalPrediction + +from geniusrise_text.ner.fine_tune import NamedEntityRecognitionFineTuner + +lora_config = { + "r": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "bias": "none", + "task_type": "CAUSAL_LM", +} + + +# Helper function to create synthetic data in different formats +def create_dataset_in_format(directory, ext): + os.makedirs(directory, exist_ok=True) + data = [{"tokens": ["This", "is", "a", "test"], "ner_tags": [0, 1, 0, 1]} for _ in range(10)] + df = pd.DataFrame(data) + + if ext == "huggingface": + dataset = Dataset.from_pandas(df) + dataset.save_to_disk(directory) + elif ext == "csv": + df.to_csv(os.path.join(directory, "data.csv"), index=False) + elif ext == "jsonl": + with open(os.path.join(directory, "data.jsonl"), "w") as f: + for item in data: + 
f.write(json.dumps(item) + "\n") + elif ext == "parquet": + pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) + elif ext == "json": + with open(os.path.join(directory, "data.json"), "w") as f: + json.dump(data, f) + elif ext == "xml": + root = ET.Element("root") + for item in data: + record = ET.SubElement(root, "record") + ET.SubElement(record, "tokens").text = " ".join(item["tokens"]) + ET.SubElement(record, "ner_tags").text = " ".join(map(str, item["ner_tags"])) + tree = ET.ElementTree(root) + tree.write(os.path.join(directory, "data.xml")) + elif ext == "yaml": + with open(os.path.join(directory, "data.yaml"), "w") as f: + yaml.dump(data, f) + elif ext == "tsv": + df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") + elif ext == "xlsx": + df.to_excel(os.path.join(directory, "data.xlsx"), index=False) + elif ext == "db": + conn = sqlite3.connect(os.path.join(directory, "data.db")) + df.to_sql("dataset_table", conn, if_exists="replace", index=False) + conn.close() + elif ext == "feather": + feather.write_feather(df, os.path.join(directory, "data.feather")) + + +# Fixtures for each file type +@pytest.fixture( + params=[ + "huggingface", + "csv", + "jsonl", + "parquet", + "json", + "xml", + "yaml", + "tsv", + "xlsx", + "feather", + ] +) +def dataset_file(request, tmpdir): + ext = request.param + create_dataset_in_format(tmpdir + "/train", ext) + create_dataset_in_format(tmpdir + "/eval", ext) + return tmpdir, ext + + +MODELS_TO_TEST = { + # fmt: off + "small": "dslim/bert-large-NER", + # fmt: on +} + + +# Fixture for models +@pytest.fixture(params=MODELS_TO_TEST.items()) +def model(request): + return request.param + + +@pytest.fixture +def ner_bolt(): + input_dir = tempfile.mkdtemp() + output_dir = tempfile.mkdtemp() + + input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input") + output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output") + state = InMemoryState() + + klass = 
NamedEntityRecognitionFineTuner( + input=input, + output=output, + state=state, + ) + klass.model_class = "BertForTokenClassification" + klass.model_name = "bert-base-uncased" + klass.tokenizer_class = "BertTokenizerFast" + klass.tokenizer_name = "bert-base-uncased" + + return klass + + +def test_ner_bolt_init(ner_bolt, model): + name, model_name = model + tokenizer_name = model_name + model_class = "AutoModelForTokenClassification" + tokenizer_class = "AutoTokenizer" + + ner_bolt.load_models( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map="cuda:0", + ) + + assert ner_bolt.model is not None + assert ner_bolt.tokenizer is not None + assert ner_bolt.input is not None + assert ner_bolt.output is not None + assert ner_bolt.state is not None + + +def test_load_dataset_all_formats(ner_bolt, dataset_file, model): + tmpdir, ext = dataset_file + dataset_path = os.path.join(tmpdir, "train") + + name, model_name = model + tokenizer_name = model_name + model_class = "AutoModelForTokenClassification" + tokenizer_class = "AutoTokenizer" + + ner_bolt.load_models( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map="cuda:0", + ) + dataset = ner_bolt.load_dataset(dataset_path, label_list=[0, 1]) + assert dataset is not None + assert len(dataset) == 10 + + +# Models to test +models = { + # fmt: off + "bart": "dslim/bert-large-NER", + "wikineural": "Babelscape/wikineural-multilingual-ner", + "medical": "d4data/biomedical-ner-all", + "chemical": "alvaroalon2/biobert_chemical_ner", + "genetic": "pruas/BENT-PubMedBERT-NER-Gene", + "food": "Dizex/FoodBaseBERT-NER", + "disease": "pruas/BENT-PubMedBERT-NER-Disease", + # fmt: on +} + + +# Test for fine-tuning +@pytest.mark.parametrize( + "model_name, precision, quantization, lora_config, use_accelerate", + [ + # small + (models["bart"], "bfloat16", None, None, False), + 
(models["wikineural"], "bfloat16", None, None, False), + (models["medical"], "bfloat16", None, None, False), + (models["chemical"], "bfloat16", None, None, False), + (models["genetic"], "bfloat16", None, None, False), + (models["food"], "bfloat16", None, None, False), + (models["disease"], "bfloat16", None, None, False), + ], +) +def test_ner_bolt_fine_tune(ner_bolt, dataset_file, model_name, precision, quantization, lora_config, use_accelerate): + try: + tokenizer_name = model_name + + tmpdir, ext = dataset_file + ner_bolt.input.input_folder = tmpdir + + ner_bolt.fine_tune( + model_name=model_name, + tokenizer_name=model_name, + model_class="AutoModelForTokenClassification", + tokenizer_class="AutoTokenizer", + num_train_epochs=1, + per_device_batch_size=2, + precision=precision, + quantization=quantization, + lora_config=lora_config, + use_accelerate=use_accelerate, + device_map="cuda:0", + data_label_list=[0, 1], + ) + output_dir = ner_bolt.output.output_folder + assert os.path.exists( + os.path.join(ner_bolt.output.output_folder, "model", "pytorch_model.bin") + ) or os.path.exists(os.path.join(ner_bolt.output.output_folder, "model", "adapter_model.bin")) + assert os.path.exists(os.path.join(ner_bolt.output.output_folder, "model", "config.json")) or os.path.exists( + os.path.join(ner_bolt.output.output_folder, "model", "adapter_config.json") + ) + assert os.path.exists(os.path.join(ner_bolt.output.output_folder, "model", "training_args.bin")) + + del ner_bolt.model + del ner_bolt.tokenizer + torch.cuda.empty_cache() + + try: + os.remove(os.path.join(ner_bolt.output.output_folder, "model", "pytorch_model.bin")) + os.remove(os.path.join(ner_bolt.output.output_folder, "model", "adapter_model.bin")) + os.remove(os.path.join(ner_bolt.output.output_folder, "model", "config.json")) + os.remove(os.path.join(ner_bolt.output.output_folder, "model", "adapter_config.json")) + os.remove(os.path.join(ner_bolt.output.output_folder, "model", "training_args.bin")) + except 
Exception as _: + pass + + except Exception as e: + del ner_bolt.model + del ner_bolt.tokenizer + torch.cuda.empty_cache() + raise + + +# Test for computing metrics +def test_ner_bolt_compute_metrics(ner_bolt): + logits = np.array([[0.6, 0.4], [0.4, 0.6]]) + labels = np.array([0, 1]) + eval_pred = EvalPrediction(predictions=logits, label_ids=labels) + metrics = ner_bolt.compute_metrics(eval_pred) + assert "accuracy" in metrics + assert "precision" in metrics + assert "recall" in metrics + assert "f1" in metrics diff --git a/geniusrise_text/nli/__init__.py b/geniusrise_text/nli/__init__.py new file mode 100644 index 0000000..c242d09 --- /dev/null +++ b/geniusrise_text/nli/__init__.py @@ -0,0 +1,18 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .api import NLIAPI +from .bulk import NLIBulk +from .fine_tune import NLIFineTuner diff --git a/geniusrise_text/nli/api.py b/geniusrise_text/nli/api.py new file mode 100644 index 0000000..30206bc --- /dev/null +++ b/geniusrise_text/nli/api.py @@ -0,0 +1,418 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List +import numpy as np +import cherrypy +import torch +from geniusrise import BatchInput, BatchOutput, State +from geniusrise.logging import setup_logger +from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline + +from geniusrise_text.base import TextAPI + + +class NLIAPI(TextAPI): + r""" + Represents a Natural Language Inference (NLI) API leveraging Hugging Face's transformer models. This class is capable of + handling various NLI tasks such as entailment, classification, similarity checking, and more. Utilizes CherryPy for exposing + API endpoints that can be interacted with via standard HTTP requests. + + Attributes: + model (AutoModelForSequenceClassification): The loaded Hugging Face model for sequence classification tasks. + tokenizer (AutoTokenizer): The tokenizer corresponding to the model, used for processing input text. + + CLI Usage Example: + For interacting with the NLI API, you would typically start the server using a command similar to one listed in the provided examples. + After the server is running, you can use CURL commands to interact with the different endpoints. 
+ + Example: + + ```bash + genius NLIAPI rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7-lol" \ + listen \ + --args \ + model_name="MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7" \ + model_class="AutoModelForSequenceClassification" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="float" \ + quantization=0 \ + device_map="cuda:0" \ + max_memory=None \ + torchscript=False \ + endpoint="*" \ + port=3000 \ + cors_domain="http://localhost:3000" \ + username="user" \ + password="password" + ``` + """ + + model: Any + tokenizer: Any + + def __init__( + self, + input: BatchInput, + output: BatchOutput, + state: State, + **kwargs: Any, + ): + """ + Initializes the NLIAPI with configurations for handling input, output, and state management. + + Args: + input (BatchInput): Configuration for the input data. + output (BatchOutput): Configuration for the output data. + state (State): State management for the API. + **kwargs (Any): Additional keyword arguments for extended functionality. + """ + super().__init__(input=input, output=output, state=state) + self.log = setup_logger(self) + self.hf_pipeline = None + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def entailment(self, **kwargs: Any) -> Dict[str, Any]: + r""" + Endpoint for evaluating the entailment relationship between a premise and a hypothesis. It returns the relationship + scores across possible labels like entailment, contradiction, and neutral. + + Args: + **kwargs (Any): Arbitrary keyword arguments, typically containing 'premise' and 'hypothesis'. + + Returns: + Dict[str, Any]: A dictionary containing the premise, hypothesis, and their relationship scores. 
+ + Example CURL Request: + ```bash + /usr/bin/curl -X POST localhost:3000/api/v1/entailment \ + -H "Content-Type: application/json" \\\ + -d '{ + "premise": "This a very good entry level smartphone, battery last 2-3 days after fully charged when connected to the internet. No memory lag issue when playing simple hidden object games. Performance is beyond my expectation, i bought it with a good bargain, couldnt ask for more!", + "hypothesis": "the phone has an awesome battery life" + }' | jq + ``` + ``` + """ + data = cherrypy.request.json + premise = data.get("premise", "") + hypothesis = data.get("hypothesis", "The statement is true") + + inputs = self.tokenizer( + premise, + hypothesis, + padding="max_length", + truncation=True, + return_tensors="pt", + ) + + if next(self.model.parameters()).is_cuda: + inputs = {k: v.cuda() for k, v in inputs.items()} + + with torch.no_grad(): + outputs = self.model(**inputs) + logits = outputs.logits if hasattr(outputs, "logits") else outputs[0] + if next(self.model.parameters()).is_cuda: + logits = logits.cpu() + softmax = torch.nn.functional.softmax(logits, dim=-1) + scores = softmax.numpy().tolist() # Convert scores to list + + id_to_label = dict(enumerate(self.model.config.id2label.values())) # type: ignore + label_scores = {id_to_label[label_id]: score for label_id, score in enumerate(scores[0])} + + return { + "premise": premise, + "hypothesis": hypothesis, + "label_scores": label_scores, + } + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def classify(self, **kwargs: Any) -> Dict[str, Any]: + r""" + Endpoint for classifying the input text into one of the provided candidate labels using zero-shot classification. + + Args: + **kwargs (Any): Arbitrary keyword arguments, typically containing 'text' and 'candidate_labels'. + + Returns: + Dict[str, Any]: A dictionary containing the input text, candidate labels, and classification scores. 
+ + Example CURL Request: + ```bash + curl -X POST localhost:3000/api/v1/classify \ + -H "Content-Type: application/json" \ + -d '{ + "text": "The new movie is a thrilling adventure in space", + "candidate_labels": ["entertainment", "politics", "business"] + }' + ``` + """ + data = cherrypy.request.json + text = data.get("text", "") + candidate_labels = data.get("candidate_labels", []) + + label_scores = {} + for label in candidate_labels: + # Construct hypothesis for each label + hypothesis = f"This example is {label}." + + # Tokenize the text and hypothesis + inputs = self.tokenizer(text, hypothesis, return_tensors="pt", padding=True, truncation=True) + + # Move inputs to GPU if CUDA is enabled + if self.use_cuda: + inputs = {k: v.cuda() for k, v in inputs.items()} + + # Perform inference + with torch.no_grad(): + outputs = self.model(**inputs) + logits = outputs.logits + softmax = torch.nn.functional.softmax(logits, dim=-1) + scores = softmax.cpu().numpy().tolist() + + # Label score: entailment normalised against contradiction (2-way softmax) + entailment_idx = self.model.config.label2id.get("entailment", 0) + contradiction_idx = self.model.config.label2id.get("contradiction", 0) + label_scores[label] = np.exp(scores[0][entailment_idx]) / ( + np.exp(scores[0][entailment_idx]) + np.exp(scores[0][contradiction_idx]) + ) + + sum_scores = sum(label_scores.values()) + label_scores = {k: v / sum_scores for k, v in label_scores.items()} + return {"text": text, "label_scores": label_scores} + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def textual_similarity(self, **kwargs: Any) -> Dict[str, Any]: + r""" + Evaluates the textual similarity between two texts. + + Args: + text1 (str): The first text. + text2 (str): The second text. + + Returns: + Dict[str, Any]: A dictionary containing similarity score.
+ + Example CURL Request: + ```bash + /usr/bin/curl -X POST localhost:3000/api/v1/textual_similarity \ + -H "Content-Type: application/json" \ + -d '{ + "text1": "Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me.", + "text2": "There is something magical about training neural networks. Their simplicity coupled with their power is astonishing." + }' | jq + ``` + """ + data = cherrypy.request.json + text1 = data.get("text1", "") + text2 = data.get("text2", "") + + # Using the same text as premise and hypothesis for similarity + scores = self._get_entailment_scores(text1, [text2]) + return {"text1": text1, "text2": text2, "similarity_score": scores[text2]} + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def fact_checking(self, **kwargs: Any) -> Dict[str, Any]: + r""" + Performs fact checking on a statement given a context. + + Args: + context (str): The context or background information. + statement (str): The statement to fact check. + + Returns: + Dict[str, Any]: A dictionary containing fact checking scores. 
+ + Example CURL Request: + ```bash + /usr/bin/curl -X POST localhost:3000/api/v1/fact_checking \ + -H "Content-Type: application/json" \ + -d '{ + "context": "Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me.", + "statement": "The author is looking for a home loan" + }' | jq + ``` + """ + data = cherrypy.request.json + context = data.get("context", "") + statement = data.get("statement", "") + + scores = self._get_entailment_scores(context, [statement]) + return { + "context": context, + "statement": statement, + "fact_checking_score": scores[statement], + } + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def question_answering(self, **kwargs: Any) -> Dict[str, Any]: + r""" + Performs question answering for multiple choice questions. + + Args: + question (str): The question text. + choices (List[str]): A list of possible answers. + + Returns: + Dict[str, Any]: A dictionary containing the scores for each answer choice. 
+ + Example CURL Request: + ```bash + /usr/bin/curl -X POST localhost:3000/api/v1/question_answering \ + -H "Content-Type: application/json" \ + -d '{ + "question": "[ML-1T-2] is the dimensional formula of", + "choices": ["force", "coefficient of friction", "modulus of elasticity", "energy"] + }' | jq + ``` + """ + data = cherrypy.request.json + question = data.get("question", "") + choices = data.get("choices", []) + + scores = self._get_entailment_scores(question, choices) + return {"question": question, "choices": choices, "scores": scores} + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def detect_intent(self, **kwargs: Any) -> Dict[str, Any]: + r""" + Detects the intent of the input text from a list of possible intents. + + Args: + text (str): The input text. + intents (List[str]): A list of possible intents. + + Returns: + Dict[str, Any]: A dictionary containing the input text and detected intent with its score. + + Example CURL Request: + ```bash + /usr/bin/curl -X POST localhost:3000/api/v1/detect_intent \ + -H "Content-Type: application/json" \ + -d '{ + "text": "Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). 
Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me.", + "intents": ["teach","sell","note","advertise","promote"] + }' | jq + ``` + """ + data = cherrypy.request.json + text = data.get("text", "") + intents = data.get("intents", []) + + # Zero-shot classification for intent detection + scores = self._get_entailment_scores(text, intents) + return {"text": text, "intents": intents, "scores": scores} + + def _get_entailment_scores(self, premise: str, hypotheses: List[str]) -> Dict[str, float]: + """ + Helper method to get entailment scores for multiple hypotheses. + + Args: + premise (str): The input premise text. + hypotheses (List[str]): A list of hypothesis texts. + + Returns: + Dict[str, float]: A dictionary mapping each hypothesis to its entailment score. + """ + label_scores = {} + for hypothesis in hypotheses: + inputs = self.tokenizer(premise, hypothesis, return_tensors="pt", padding=True, truncation=True) + if self.use_cuda: + inputs = {k: v.cuda() for k, v in inputs.items()} + + with torch.no_grad(): + outputs = self.model(**inputs) + logits = outputs.logits + softmax = torch.nn.functional.softmax(logits, dim=-1) + scores = softmax.cpu().numpy().tolist() + + entailment_idx = self.model.config.label2id.get("entailment", 0) + contradiction_idx = self.model.config.label2id.get("contradiction", 0) + label_scores[hypothesis] = np.exp(scores[0][entailment_idx]) / ( + np.exp(scores[0][entailment_idx]) + np.exp(scores[0][contradiction_idx]) + ) # 2-way softmax: entailment vs. contradiction + + return label_scores + + def initialize_pipeline(self): + """ + Lazy initialization of the NLI Hugging Face pipeline.
@cherrypy.expose
@cherrypy.tools.json_in()
@cherrypy.tools.json_out()
@cherrypy.tools.allow(methods=["POST"])
def zero_shot_classification(self, **kwargs: Any) -> Dict[str, Any]:
    r"""
    Classifies a premise against the NLI labels using the Hugging Face zero-shot pipeline.

    The pipeline is created lazily on the first request. The request body is read from
    `cherrypy.request.json`; `**kwargs` is accepted but not used.

    Args:
        **kwargs (Any): Arbitrary keyword arguments (unused; the JSON body carries 'premise' and 'hypothesis').

    Returns:
        Dict[str, Any]: A dictionary with:
            - "premise": the premise from the request ("" when absent),
            - "hypothesis": the hypothesis from the request ("" when absent),
            - "labels": the candidate labels in the order the pipeline returned them,
            - "label_scores": the scores, aligned index-by-index with "labels".

    Example CURL Request for zero-shot classification:
    ```bash
    curl -X POST localhost:3000/api/v1/zero_shot_classification \
        -H "Content-Type: application/json" \
        -d '{
            "premise": "A new study shows that the Mediterranean diet is good for heart health.",
            "hypothesis": "The study is related to diet and health."
        }' | jq
    ```
    """
    self.initialize_pipeline()  # Initialize the pipeline on first API hit

    data = cherrypy.request.json
    premise = data.get("premise", "")
    hypothesis = data.get("hypothesis", "")

    # NOTE(review): `hypothesis` does not look like a parameter the zero-shot pipeline consumes
    # (it documents `candidate_labels`, `hypothesis_template`, `multi_label`), so the scores
    # presumably do not depend on it — confirm against the installed transformers version.
    result = self.hf_pipeline(  # type: ignore
        premise, candidate_labels=["entailment", "contradiction", "neutral"], hypothesis=hypothesis
    )

    # The pipeline orders its output by score, not by the candidate order passed in, so the
    # labels must be returned next to the scores for the scores to be attributable.
    return {
        "premise": premise,
        "hypothesis": hypothesis,
        "labels": result["labels"],
        "label_scores": result["scores"],
    }
+ version: "1.0" +servers: + - url: http://localhost:3000/api/v1 + description: API server for NLI model text processing +paths: + /entailment: + post: + summary: Evaluate entailment relationship between a premise and a hypothesis + operationId: entailment + tags: + - Natural Language Inference + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + premise: + type: string + description: Input premise text. + hypothesis: + type: string + description: Input hypothesis text to evaluate against the premise. + required: + - premise + - hypothesis + responses: + 200: + description: Successful response with entailment scores + content: + application/json: + schema: + type: object + properties: + premise: + type: string + description: The original premise text. + hypothesis: + type: string + description: The original hypothesis text. + label_scores: + type: object + additionalProperties: + type: number + format: float + description: Scores for entailment, contradiction, and neutral. + 400: + description: Bad request, e.g., missing required fields + 500: + description: Internal server error, e.g., model failure + /classify: + post: + summary: Classify the input text into one of the candidate labels + operationId: classify + tags: + - Natural Language Inference + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: Input text for classification. + candidate_labels: + type: array + items: + type: string + description: A list of candidate labels for classification. + required: + - text + - candidate_labels + responses: + 200: + description: Successful response with classification scores + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: The original text that was provided for classification. 
+ label_scores: + type: object + additionalProperties: + type: number + format: float + description: Scores for each candidate label. + 400: + description: Bad request, e.g., missing required fields + 500: + description: Internal server error, e.g., model failure + /textual_similarity: + post: + summary: Evaluate textual similarity between two texts + operationId: textualSimilarity + tags: + - Natural Language Inference + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + text1: + type: string + description: The first text for similarity comparison. + text2: + type: string + description: The second text for similarity comparison. + required: + - text1 + - text2 + responses: + 200: + description: Successful response with similarity score + content: + application/json: + schema: + type: object + properties: + text1: + type: string + description: The first text provided for comparison. + text2: + type: string + description: The second text provided for comparison. + similarity_score: + type: number + format: float + description: Similarity score between the two texts. + 400: + description: Bad request, e.g., missing required fields + 500: + description: Internal server error, e.g., model failure + /fact_checking: + post: + summary: Perform fact checking on a statement given a context + operationId: factChecking + tags: + - Natural Language Inference + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + context: + type: string + description: The context or background information. + statement: + type: string + description: The statement to fact check. + required: + - context + - statement + responses: + 200: + description: Successful response with fact checking scores + content: + application/json: + schema: + type: object + properties: + context: + type: string + description: The context provided for fact checking. 
class NLIBulk(TextBulk):
    r"""
    Bulk natural language inference (NLI) over premise-hypothesis pairs using Hugging Face transformers.

    The class loads a dataset of premise-hypothesis pairs from the batch input folder, runs a
    sequence-classification model over it in batches, and writes one JSON line per pair with the
    per-label scores to the batch output folder.

    Attributes:
        input (BatchInput): Configuration and data inputs for the batch process.
        output (BatchOutput): Configurations for output data handling.
        state (State): State management for the inference task.

    Example CLI Usage:
    ```bash
    genius NLIBulk rise \
        batch \
            --input_s3_bucket geniusrise-test \
            --input_s3_folder input/nli \
        batch \
            --output_s3_bucket geniusrise-test \
            --output_s3_folder output/nli \
        postgres \
            --postgres_host 127.0.0.1 \
            --postgres_port 5432 \
            --postgres_user postgres \
            --postgres_password postgres \
            --postgres_database geniusrise \
            --postgres_table state \
        --id MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7-lol \
        infer \
            --args \
                model_name="MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7" \
                model_class="AutoModelForSequenceClassification" \
                tokenizer_class="AutoTokenizer" \
                use_cuda=True \
                precision="float" \
                quantization=0 \
                device_map="cuda:0" \
                max_memory=None \
                torchscript=False
    ```
    """

    def __init__(self, input: BatchInput, output: BatchOutput, state: State, **kwargs) -> None:
        """
        Initializes the NLIBulk class with the specified input, output, and state configurations.

        Args:
            input (BatchInput): The input data.
            output (BatchOutput): The output data.
            state (State): The state data.
            **kwargs: Additional keyword arguments.
        """
        super().__init__(input, output, state, **kwargs)

    def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs) -> Optional[Dataset]:
        r"""
        Load an NLI dataset of premise-hypothesis pairs from a directory.

        If `dataset_path` contains a `dataset_info.json` it is loaded as a saved Hugging Face
        dataset. Otherwise every file under `dataset_path` (searched recursively) with a supported
        extension is read and all records are concatenated into a single dataset.

        Args:
            dataset_path (str): The path to the dataset directory.
            max_length (int, optional): Maximum length of text sequences for tokenization purposes.
                Stored on the instance as `self.max_length`. Defaults to 512.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            Dataset: The loaded dataset.

        Raises:
            Exception: If there was an error loading the dataset (logged, then re-raised).

        ## Supported Data Formats and Structures:

        ### Hugging Face Dataset
        Dataset files saved by the Hugging Face datasets library.

        ### JSONL
        Each line is a JSON object representing an example.
        ```json
        {"premise": "The premise text", "hypothesis": "The hypothesis text"}
        ```

        ### CSV
        Should contain 'premise' and 'hypothesis' columns.
        ```csv
        premise,hypothesis
        "The premise text","The hypothesis text"
        ```

        ### Parquet
        Should contain 'premise' and 'hypothesis' columns.

        ### JSON
        An array of dictionaries with 'premise' and 'hypothesis' keys.
        ```json
        [{"premise": "The premise text", "hypothesis": "The hypothesis text"}]
        ```

        ### XML
        Each 'record' element should contain 'premise' and 'hypothesis' child elements.
        ```xml
        <record>
            <premise>The premise text</premise>
            <hypothesis>The hypothesis text</hypothesis>
        </record>
        ```

        ### YAML
        Each document should be a list of dictionaries with 'premise' and 'hypothesis' keys.
        ```yaml
        - premise: "The premise text"
          hypothesis: "The hypothesis text"
        ```

        ### TSV
        Should contain 'premise' and 'hypothesis' columns separated by tabs.

        ### Excel (.xls, .xlsx)
        Should contain 'premise' and 'hypothesis' columns.

        ### SQLite (.db)
        Should contain a table named 'dataset_table' with 'premise' and 'hypothesis' columns.

        ### Feather
        Should contain 'premise' and 'hypothesis' columns.
        """
        self.max_length = max_length

        self.log.info(f"Loading dataset from {dataset_path}")
        try:
            if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")):
                # Return explicitly so the saved-dataset branch never falls through to None.
                return load_from_disk(dataset_path)

            data: list = []
            # glob already yields paths that include `dataset_path`; joining it on again would
            # double the prefix for relative paths, so the globbed path is used as-is.
            for filepath in glob.glob(f"{dataset_path}/**/*", recursive=True):
                data.extend(self._read_records(filepath))

            return Dataset.from_pandas(pd.DataFrame(data))

        except Exception as e:
            self.log.exception(f"Error occurred when loading dataset from {dataset_path}. Error: {e}")
            raise

    def _read_records(self, filepath: str) -> list:
        """
        Read one data file and return its examples as a list of records.

        The format is chosen by (case-sensitive) file extension. Paths with an unsupported
        extension, including directories, yield an empty list.
        """
        if filepath.endswith(".jsonl"):
            with open(filepath, "r", encoding="utf-8") as f:
                return [json.loads(line) for line in f]

        if filepath.endswith(".csv"):
            return pd.read_csv(filepath).to_dict("records")

        if filepath.endswith(".parquet"):
            return pq.read_table(filepath).to_pandas().to_dict("records")

        if filepath.endswith(".json"):
            with open(filepath, "r", encoding="utf-8") as f:
                return json.load(f)

        if filepath.endswith(".xml"):
            root = ET.parse(filepath).getroot()
            # findtext returns None for a missing child instead of raising AttributeError.
            return [
                {"premise": record.findtext("premise"), "hypothesis": record.findtext("hypothesis")}
                for record in root.findall("record")
            ]

        if filepath.endswith((".yaml", ".yml")):
            with open(filepath, "r", encoding="utf-8") as f:
                # An empty YAML file loads as None; treat it as "no records".
                return yaml.safe_load(f) or []

        if filepath.endswith(".tsv"):
            return pd.read_csv(filepath, sep="\t").to_dict("records")

        if filepath.endswith((".xls", ".xlsx")):
            return pd.read_excel(filepath).to_dict("records")

        if filepath.endswith(".db"):
            conn = sqlite3.connect(filepath)
            try:
                query = "SELECT premise, hypothesis FROM dataset_table;"
                return pd.read_sql_query(query, conn).to_dict("records")
            finally:
                # Always release the database handle, even when the query fails.
                conn.close()

        if filepath.endswith(".feather"):
            return feather.read_feather(filepath).to_dict("records")

        return []

    def infer(
        self,
        model_name: str,
        max_length: int = 512,
        model_class: str = "AutoModelForSeq2SeqLM",
        tokenizer_class: str = "AutoTokenizer",
        use_cuda: bool = False,
        precision: str = "float16",
        quantization: int = 0,
        device_map: str | Dict | None = "auto",
        max_memory={0: "24GB"},
        torchscript: bool = False,
        compile: bool = False,
        awq_enabled: bool = False,
        flash_attention: bool = False,
        batch_size: int = 32,
        notification_email: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Performs NLI inference on a loaded dataset using the specified model. The method processes the data in
        batches and saves the results to the configured output path as a JSONL file, one line per
        premise-hypothesis pair with a mapping of label name to score.

        Args:
            model_name (str): Name or path of the NLI model. A revision may be appended as "name:revision".
            max_length (int, optional): Maximum length of the sequences for tokenization purposes; longer pairs
                are truncated. Defaults to 512.
            model_class (str, optional): Class name of the model (e.g., "AutoModelForSequenceClassification").
                Defaults to "AutoModelForSeq2SeqLM".
            tokenizer_class (str, optional): Class name of the tokenizer (e.g., "AutoTokenizer").
                Defaults to "AutoTokenizer".
            use_cuda (bool, optional): Whether to use CUDA for model inference. Defaults to False.
            precision (str, optional): Precision for model computation (e.g., "float16"). Defaults to "float16".
            quantization (int, optional): Level of quantization for optimizing model size and speed. Defaults to 0.
            device_map (str | Dict | None, optional): Specific device to use for computation. Defaults to "auto".
            max_memory (Dict, optional): Maximum memory configuration for devices. Defaults to {0: "24GB"}.
            torchscript (bool, optional): Whether to use a TorchScript-optimized version of the pre-trained
                language model. Defaults to False.
            compile (bool, optional): Whether to compile the model before inference. Defaults to False.
            awq_enabled (bool, optional): Whether to enable AWQ optimization. Defaults to False.
            flash_attention (bool, optional): Whether to use flash attention optimization. Defaults to False.
            batch_size (int, optional): Number of premise-hypothesis pairs to process simultaneously.
                Defaults to 32.
            notification_email (str, optional): Stored on the instance; not otherwise used by this method.
                Defaults to None.
            **kwargs: Arbitrary keyword arguments. Keys containing "model_" are forwarded to `load_models` with
                that prefix stripped; keys containing "generation_" are stored in `self.generation_args`.
        """
        if ":" in model_name:
            name_parts = model_name.split(":")
            model_name = name_parts[0]
            model_revision = name_parts[1]
            tokenizer_revision = name_parts[1]
        else:
            model_revision = None
            tokenizer_revision = None
        tokenizer_name = model_name

        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.model_revision = model_revision
        self.tokenizer_revision = tokenizer_revision
        self.model_class = model_class
        self.tokenizer_class = tokenizer_class
        self.use_cuda = use_cuda
        self.precision = precision
        self.quantization = quantization
        self.device_map = device_map
        # NOTE: the default is a shared dict; it is only passed through here, never mutated.
        self.max_memory = max_memory
        self.torchscript = torchscript
        self.awq_enabled = awq_enabled
        self.flash_attention = flash_attention
        self.batch_size = batch_size
        self.notification_email = notification_email
        self.compile = compile

        model_args = {k.replace("model_", ""): v for k, v in kwargs.items() if "model_" in k}
        self.model_args = model_args

        generation_args = {k.replace("generation_", ""): v for k, v in kwargs.items() if "generation_" in k}
        self.generation_args = generation_args

        self.model, self.tokenizer = self.load_models(
            model_name=self.model_name,
            tokenizer_name=self.tokenizer_name,
            model_revision=self.model_revision,
            tokenizer_revision=self.tokenizer_revision,
            model_class=self.model_class,
            tokenizer_class=self.tokenizer_class,
            use_cuda=self.use_cuda,
            precision=self.precision,
            quantization=self.quantization,
            device_map=self.device_map,
            max_memory=self.max_memory,
            torchscript=self.torchscript,
            awq_enabled=self.awq_enabled,
            flash_attention=self.flash_attention,
            compile=self.compile,
            **self.model_args,
        )

        dataset_path = self.input.input_folder
        output_path = self.output.output_folder

        # Load dataset
        dataset = self.load_dataset(dataset_path, max_length=max_length)
        if dataset is None:
            self.log.error("Failed to load dataset.")
            return

        # Inputs must live on the same device as the model weights.
        device = next(self.model.parameters()).device
        id2label = self.model.config.id2label

        predictions = []
        for i in range(0, len(dataset), batch_size):
            batch = dataset[i : i + batch_size]

            inputs = self.tokenizer(
                batch["premise"],
                batch["hypothesis"],
                padding=True,
                truncation=True,  # keep over-long pairs within the requested max_length
                max_length=max_length,
                return_tensors="pt",
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
                softmax = torch.nn.functional.softmax(logits.cpu(), dim=-1)
                scores = softmax.numpy().tolist()

            # One label->score mapping per row; each row uses its own scores, not the first row's.
            predictions.extend(
                {id2label[label_id]: value for label_id, value in enumerate(row)} for row in scores
            )

        # Save results
        self.log.info(f"Saving results to {output_path}")
        os.makedirs(output_path, exist_ok=True)
        output_file = os.path.join(output_path, f"nli_results_{uuid.uuid4().hex}.jsonl")
        with open(output_file, "w") as f:
            for data, pred in zip(dataset, predictions):
                result = {
                    "premise": data["premise"],
                    "hypothesis": data["hypothesis"],
                    "prediction": pred,
                }
                f.write(json.dumps(result) + "\n")

        self.done()
        self.log.info("Inference completed.")
General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. +# http://www.apache.org/licenses/LICENSE-2.0 # -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import os @@ -23,80 +22,41 @@ import pandas as pd import pyarrow.parquet as pq import yaml # type: ignore -from datasets import Dataset, DatasetDict, load_from_disk +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk from pyarrow import feather from transformers import DataCollatorWithPadding -from .base import HuggingFaceFineTuner +from geniusrise_text.base import TextFineTuner -class HuggingFaceCommonsenseReasoningFineTuner(HuggingFaceFineTuner): +class NLIFineTuner(TextFineTuner): r""" - A bolt for fine-tuning Hugging Face models on commonsense reasoning tasks. + A bolt for fine-tuning Hugging Face models for text classification tasks. + + This class extends the `TextFineTuner` and specializes in fine-tuning models for text classification. + It provides additional functionalities for loading and preprocessing text classification datasets in various formats. 
Args: input (BatchInput): The batch input data. output (OutputConfig): The output data. state (State): The state manager. - ## Using geniusrise to invoke via command line - ```bash - genius HuggingFaceCommonsenseReasoningFineTuner rise \ - streaming \ - --input_kafka_topic commonsense_test \ - --input_kafka_cluster_connection_string localhost:9094 \ - --input_kafka_consumer_group_id commonsense_group \ - streaming \ - --output_kafka_topic commonsense_output \ - --output_kafka_cluster_connection_string localhost:9094 \ - postgres \ - --postgres_host 127.0.0.1 \ - --postgres_port 5432 \ - --postgres_user postgres \ - --postgres_password postgres \ - --postgres_database commonsense_db \ - --postgres_table state \ - load_dataset \ - --args dataset_path=my_dataset max_length=512 - ``` + CLI Usage: - ## Using geniusrise to invoke via YAML file - ```yaml - version: "1" - bolts: - my_commonsense_bolt: - name: "HuggingFaceCommonsenseReasoningFineTuner" - method: "load_dataset" - args: - dataset_path: "my_dataset" - max_length: 512 - input: - type: "streaming" - args: - input_topic: "commonsense_test" - kafka_servers: "localhost:9094" - group_id: "commonsense_group" - output: - type: "streaming" - args: - output_topic: "commonsense_output" - kafka_servers: "localhost:9094" - state: - type: "postgres" - args: - postgres_host: "127.0.0.1" - postgres_port: 5432 - postgres_user: "postgres" - postgres_password: "postgres" - postgres_database: "commonsense_db" - postgres_table: "state" - deploy: - type: "k8s" - args: - name: "my_commonsense_bolt" - namespace: "default" - image: "my_commonsense_bolt_image" - replicas: 1 + ```bash + genius NLIFineTuner rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7-lol + fine_tune \ + --args \ + model_name=my_model \ + tokenizer_name=my_tokenizer \ + num_train_epochs=3 \ + per_device_train_batch_size=8 ``` """ @@ -173,13 +133,10 @@ def 
load_dataset(self, dataset_path: str, **kwargs: Any) -> Union[Dataset, Datas """ try: - if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): + if self.use_huggingface_dataset: + dataset = load_dataset(self.huggingface_dataset) + elif os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): dataset = load_from_disk(dataset_path) - return dataset.map( - self.prepare_train_features, - batched=True, - remove_columns=dataset.column_names, - ) else: data = [] for filename in os.listdir(dataset_path): @@ -227,14 +184,44 @@ def load_dataset(self, dataset_path: str, **kwargs: Any) -> Union[Dataset, Datas data.extend(df.to_dict("records")) dataset = Dataset.from_pandas(pd.DataFrame(data)) - return dataset.map( - self.prepare_train_features, - batched=True, - remove_columns=dataset.column_names, + + if hasattr(self, "map_data") and self.map_data: + fn = eval(self.map_data) # type: ignore + dataset = dataset.map(fn) + else: + dataset = dataset + + # Create label_to_id mapping and save it in model config + # TODO: ugly shit cause we dont know num labels before we process the data but need tokenizer to process data + self.label_to_id = {label: i for i, label in enumerate(set(dataset["train"]["label"]))} + if self.model: + config = self.model.config + config.label2id = self.label_to_id + config.id2label = {i: label for label, i in self.label_to_id.items()} + config.num_labels = len(self.label_to_id.keys()) + self.config = config + + self.load_models( + model_name=self.model_name, + tokenizer_name=self.tokenizer_name, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, + device_map=self.device_map, + precision=self.precision, + quantization=self.quantization, + lora_config=self.lora_config, + use_accelerate=self.use_accelerate, + accelerate_no_split_module_classes=self.accelerate_no_split_module_classes, + **self.model_kwargs, ) + if self.tokenizer and not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + 
self.model.config.pad_token_id = self.tokenizer.eos_token_id + + return dataset.map(self.prepare_train_features, batched=True) except Exception as e: - print(f"Error loading dataset: {e}") + self.log.exception(f"Error loading dataset: {e}") raise def prepare_train_features(self, examples: Dict) -> Dict: @@ -256,7 +243,7 @@ def prepare_train_features(self, examples: Dict) -> Dict: examples["premise"], examples["hypothesis"], truncation=True, - padding=False, + padding=True, ) # Prepare the labels @@ -264,7 +251,7 @@ def prepare_train_features(self, examples: Dict) -> Dict: return tokenized_inputs except Exception as e: - print(f"Error preparing train features: {e}") + self.log.exception(f"Error preparing train features: {e}") raise def data_collator(self, examples: Dict) -> Dict: @@ -281,5 +268,5 @@ def data_collator(self, examples: Dict) -> Dict: return DataCollatorWithPadding(self.tokenizer)(examples) except Exception as e: - print(f"Error in data collation: {e}") + self.log.exception(f"Error in data collation: {e}") raise diff --git a/geniusrise_text/nli/intents/__init__.py b/geniusrise_text/nli/intents/__init__.py new file mode 100644 index 0000000..3e22c45 --- /dev/null +++ b/geniusrise_text/nli/intents/__init__.py @@ -0,0 +1,53 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .artisan_marketplace import intents as artisan_marketplace_intent +from .automotive import intents as automotive_intent +from .banking import intents as banking_intent +from .business_directory import intents as business_directory_intent +from .childcare import intents as childcare_intent +from .cultural import intents as cultural_intent +from .customer_support import intents as customer_support_intent +from .diet import intents as diet_intent +from .ecommerce import intents as ecommerce_intent +from .education import intents as education_intent +from .employee_helpdesk import intents as employee_helpdesk_intent +from .environment import intents as environment_intent +from .esoteric import intents as esoteric_intent +from .events import intents as events_intent +from .fitness import intents as fitness_intent +from .food_ordering import intents as food_ordering_intent +from .freelancing import intents as freelancing_intent +from .gardening import intents as gardening_intent +from .government import intents as government_intent +from .healthcare import intents as healthcare_intent +from .hobby import intents as hobby_intent +from .library import intents as library_intent +from .news import intents as news_intent +from .non_profit import intents as non_profit_intent +from .performing_arts import intents as performing_arts_intent +from .personal_finance import intents as personal_finance_intent +from .pet_care import intents as pet_care_intent +from .public_safety import intents as public_safety_intent +from .real_estate import intents as real_estate_intent +from .religion import intents as religion_intent +from .senior_care import intents as senior_care_intent +from .social_services import intents as social_services_intent +from .sports_centers import intents as sports_centers_intent +from .tourist_guide import intents as tourist_guide_intent +from .translation import intents as translation_intent +from .travel import intents as travel_intent +from .wedding 
import intents as wedding_intent +from .wellness import intents as wellness_intent diff --git a/geniusrise_text/nli/intents/artisan_marketplace.py b/geniusrise_text/nli/intents/artisan_marketplace.py new file mode 100644 index 0000000..47d563e --- /dev/null +++ b/geniusrise_text/nli/intents/artisan_marketplace.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Product Information": [ + "Details about specific artisan products", + "Information on materials and craftsmanship", + "Customization options for handmade items", + "Care and maintenance of craft products", + ], + "Purchasing and Ordering": [ + "How to purchase or place an order", + "Payment options and security", + "Order customization and special requests", + "Bulk orders and wholesale purchasing", + ], + "Marketplace Events and Fairs": [ + "Upcoming craft fairs and markets", + "Participating artisan profiles", + "Workshops and live demonstrations", + "Event locations and schedules", + ], + "Shipping and Delivery": [ + "Shipping options and costs", + "Tracking and status of orders", + "International shipping policies", + "Packaging and gift wrapping services", + ], + "Vendor and Artisan Support": [ + "Becoming a vendor or artisan in the marketplace", + "Vendor account management", + "Marketing and promotion support", + "Community and network for artisans", + ], + "Returns and Refunds": [ + "Return policy for products", + 
"Process for returning an item", + "Refund timelines and methods", + "Handling damaged or incorrect items", + ], + "Customer Reviews and Feedback": [ + "Submitting product reviews", + "Reading customer testimonials", + "Feedback on marketplace experience", + "Responding to reviews and feedback", + ], + "Sustainability and Ethics": [ + "Eco-friendly and sustainable products", + "Ethical sourcing and production practices", + "Supporting local artisans and communities", + "Recycled and upcycled craft items", + ], + "Collaborations and Partnerships": [ + "Collaborative projects with artisans", + "Cross-promotions with other brands", + "Corporate and custom gift inquiries", + "Partnership opportunities for events", + ], + "Gift Ideas and Recommendations": [ + "Gift recommendations for special occasions", + "Personalized and bespoke gift options", + "Gift cards and vouchers", + "Seasonal and holiday gift guides", + ], + "Workshops and Education": [ + "Craft workshops and classes", + "Online tutorials and courses", + "Educational resources for crafters", + "Skill-sharing and learning opportunities", + ], + "Trends and Inspirations": [ + "Current trends in handmade crafts", + "Featured artisan stories and inspirations", + "New arrivals and featured collections", + "Creative ideas and crafting inspiration", + ], +} diff --git a/geniusrise_text/nli/intents/automotive.py b/geniusrise_text/nli/intents/automotive.py new file mode 100644 index 0000000..a35463c --- /dev/null +++ b/geniusrise_text/nli/intents/automotive.py @@ -0,0 +1,83 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Vehicle Information": [ + "Car model specifications", + "Electric vehicle options", + "Fuel efficiency details", + "Safety feature information", + ], + "Pricing and Financing": [ + "Vehicle pricing inquiries", + "Financing options", + "Lease vs buy information", + "Trade-in value questions", + ], + "Test Drives and Showroom Visits": [ + "Schedule a test drive", + "Locate a dealership", + "Showroom visiting hours", + "Virtual vehicle tour requests", + ], + "Special Offers and Promotions": [ + "Current sales promotions", + "Seasonal discounts", + "Loyalty program benefits", + "First-time buyer offers", + ], + "Customization and Accessories": [ + "Custom build options", + "Accessory packages", + "Color and trim customization", + "Performance upgrades", + ], + "Warranty and Services": [ + "Vehicle warranty details", + "Extended warranty options", + "Maintenance services", + "Roadside assistance coverage", + ], + "Pre-Owned Vehicles": [ + "Certified pre-owned details", + "Used car inventory", + "Vehicle history reports", + "Pre-owned pricing", + ], + "Technology and Innovations": [ + "Infotainment system features", + "Autonomous driving capabilities", + "Connectivity options", + "Advanced safety technologies", + ], + "Environmental and Sustainability": [ + "Hybrid and electric vehicles", + "Emission standards information", + "Sustainable manufacturing practices", + "Recycling programs", + ], + "Regulatory and Compliance": [ + "Vehicle registration process", + "Emission test information", + "Safety compliance", + "Import and export regulations", 
+ ], + "Delivery and Pickup": [ + "Vehicle delivery options", + "Order tracking", + "Pickup scheduling", + "Delivery timelines", + ], +} diff --git a/geniusrise_text/nli/intents/banking.py b/geniusrise_text/nli/intents/banking.py new file mode 100644 index 0000000..0c0e83c --- /dev/null +++ b/geniusrise_text/nli/intents/banking.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Account Information": [ + "Check account balance", + "Recent transactions", + "Account statement request", + "Update account details", + ], + "Payments and Transfers": [ + "Transfer funds", + "Bill payments", + "International wire transfer", + "Set up standing order", + ], + "Credit and Debit Cards": [ + "Apply for a new card", + "Report lost or stolen card", + "Card activation", + "Card limit inquiry", + ], + "Loans and Mortgages": [ + "Loan application process", + "Mortgage options", + "Loan repayment schedule", + "Interest rate information", + ], + "Investment Services": [ + "Investment advice", + "Stock market information", + "Setting up a brokerage account", + "Retirement planning", + ], + "Fraud and Security": [ + "Report suspicious activity", + "Fraudulent transaction inquiry", + "Update security settings", + "Identity theft assistance", + ], + "Digital Banking Assistance": [ + "Online banking setup", + "Mobile app support", + "Digital wallet services", + "Technical support for online services", + ], + 
"Savings and Deposits": [ + "Open a savings account", + "Fixed deposit schemes", + "Interest rates on savings", + "Withdrawal limits", + ], + "Customer Support": [ + "Branch locations", + "Contact customer service", + "Schedule an appointment", + "Feedback and complaints", + ], + "Foreign Currency Services": [ + "Foreign exchange rates", + "Travel currency cards", + "Currency exchange services", + "International banking", + ], + "Insurance Services": [ + "Insurance product information", + "File an insurance claim", + "Insurance policy renewal", + "Insurance premium payment", + ], + "Retirement Planning": [ + "Pension plan options", + "Retirement savings account", + "Early retirement effects", + "Retirement benefits", + ], +} diff --git a/geniusrise_text/nli/intents/business_directory.py b/geniusrise_text/nli/intents/business_directory.py new file mode 100644 index 0000000..247cbc8 --- /dev/null +++ b/geniusrise_text/nli/intents/business_directory.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Business Search and Information": [ + "Finding specific types of businesses", + "Business contact information and hours", + "Services offered by local businesses", + "Ratings and reviews of businesses", + ], + "Recommendations and Referrals": [ + "Personalized business recommendations", + "Referrals for specific services", + "Top-rated local businesses", + "Suggestions based on user preferences", + ], + "Event and Activity Listings": [ + "Local events and community activities", + "Business-related events and workshops", + "Networking events and meetups", + "Cultural and entertainment events", + ], + "Discounts and Special Offers": [ + "Current promotions and discounts", + "Loyalty programs and customer rewards", + "Special offers for local services", + "Coupons and deals from local businesses", + ], + "Community and Support Services": [ + "Community support and resource centers", + "Local non-profit organizations", + "Public services and facilities", + "Support groups and counseling services", + ], + "New Business Announcements": [ + "Newly opened businesses", + "Business relocation and changes", + "Grand opening events", + "Business expansion news", + ], + "Maps and Navigation": [ + "Directions to local businesses", + "Interactive maps of business locations", + "Public transportation options", + "Parking information for business areas", + ], + "Local Product and Service Inquiries": [ + "Availability of specific products or services", + "Local artisans and handmade products", + "Service providers for home and personal needs", + "Specialized and niche market services", + ], + "User Reviews and Contributions": [ + "Submitting reviews and ratings", + "User-generated content and tips", + "Community recommendations", + "Feedback on business directory services", + ], + "Business Networking and Collaboration": [ + "Local business networking groups", + "Collaboration opportunities", + "B2B services and partnerships", + "Community business initiatives", + 
], + "Market Trends and Insights": [ + "Local market trends and analysis", + "Consumer behavior insights", + "Emerging business opportunities", + "Economic impact on local businesses", + ], + "Accessibility and Inclusivity": [ + "Accessible business services", + "Inclusive business practices", + "Diversity in local business community", + "Services for diverse customer needs", + ], +} diff --git a/geniusrise_text/nli/intents/childcare.py b/geniusrise_text/nli/intents/childcare.py new file mode 100644 index 0000000..c1c277e --- /dev/null +++ b/geniusrise_text/nli/intents/childcare.py @@ -0,0 +1,83 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Childcare Services": [ + "Finding local childcare", + "Childcare service options", + "Childcare costs and subsidies", + "After-school care programs", + ], + "Parenting Advice": [ + "Newborn care tips", + "Handling toddler behavior", + "Teen parenting guidance", + "Balancing work and parenting", + ], + "Health and Nutrition": [ + "Child nutrition and diet", + "Vaccination schedules", + "Dealing with common illnesses", + "Mental health resources for children", + ], + "Education and Learning": [ + "Early childhood education", + "School enrollment procedures", + "Home-schooling resources", + "Learning disability support", + ], + "Activities and Recreation": [ + "Family-friendly activities", + "Children's clubs and classes", + "Outdoor and educational games", + "Summer camps and workshops", + ], + "Safety and Emergency Procedures": [ + "Child safety measures", + "Emergency contacts for children", + "First aid for kids", + "Preparing children for emergencies", + ], + "Product and Gear Recommendations": [ + "Baby gear and product advice", + "Reviews on children's products", + "Age-appropriate toys and books", + "Car seat and stroller recommendations", + ], + "Behavioral and Developmental Support": [ + "Child developmental milestones", + "Behavioral therapy options", + "Support for special needs", + "Coping with developmental disorders", + ], + "Family Support Services": [ + "Accessing family counseling", + "Support groups for parents", + "Financial aid for families", + "Legal advice for family issues", + ], + "Parental Leave and Work-Life Balance": [ + "Parental leave policies", + "Returning to work after childbirth", + "Work-life balance tips", + "Flexible work arrangements", + ], + "Community and Networking": [ + "Parenting groups and forums", + "Local family events", + "Connecting with other parents", + "Volunteering with children", + ], +} diff --git a/geniusrise_text/nli/intents/cultural.py b/geniusrise_text/nli/intents/cultural.py new file mode 
100644 index 0000000..7edb5c0 --- /dev/null +++ b/geniusrise_text/nli/intents/cultural.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Local Cultural Events": [ + "Upcoming cultural festivals", + "Community art exhibitions", + "Cultural parades and celebrations", + "Local heritage events", + ], + "Performing Arts": [ + "Theater and drama performances", + "Dance recitals and shows", + "Opera and classical music concerts", + "Stand-up comedy and live shows", + ], + "Art Galleries and Museums": [ + "Current art exhibitions", + "Museum hours and ticketing", + "Guided tours and educational programs", + "Exhibit and artist information", + ], + "Workshops and Classes": [ + "Art and craft workshops", + "Cultural cooking classes", + "Music and instrument lessons", + "Creative writing and poetry sessions", + ], + "Historical Information": [ + "Local history inquiries", + "Historical landmarks and sites", + "Cultural heritage discussions", + "Historical reenactments and fairs", + ], + "Artist Support and Resources": [ + "Artist networking events", + "Grants and funding for artists", + "Art supply and studio resources", + "Showcasing and selling artwork", + ], + "Cultural Education": [ + "Cultural awareness programs", + "Educational resources on art and culture", + "Lectures and seminars on cultural topics", + "Intercultural exchange initiatives", + ], + "Film and Cinema": [ + "Film festival 
schedules", + "Independent film screenings", + "Cinema retrospectives", + "Film-making workshops", + ], + "Literature and Book Events": [ + "Author readings and book signings", + "Literary festivals and book fairs", + "Book club meetings", + "Poetry slams and spoken word events", + ], + "Music and Festivals": [ + "Music festival information", + "Live band and DJ events", + "Classical and folk music concerts", + "Music release and album launch events", + ], + "Public Art and Installations": [ + "Public art projects", + "Street art tours", + "Sculpture installations", + "Community art initiatives", + ], + "Fashion and Design": [ + "Fashion shows and exhibitions", + "Design and trend talks", + "Fashion design workshops", + "Textile and apparel exhibitions", + ], + "Cultural Preservation": [ + "Preservation of traditional arts", + "Cultural heritage protection", + "Documenting and archiving cultural practices", + "Advocacy for endangered art forms", + ], +} diff --git a/geniusrise_text/nli/intents/customer_support.py b/geniusrise_text/nli/intents/customer_support.py new file mode 100644 index 0000000..cbcf79f --- /dev/null +++ b/geniusrise_text/nli/intents/customer_support.py @@ -0,0 +1,100 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Report Issue": [ + "Technical problem", + "Service disruption", + "Product defect or malfunction", + ], + "Billing and Payments": [ + "Billing inquiry", + "Payment issue", + "Refund request", + "Invoice request", + ], + "Account Management": [ + "Account setup or update", + "Password reset", + "Subscription management", + "Account suspension or deactivation", + ], + "Order Management": [ + "Order status", + "Order modification", + "Cancel order", + "Shipping information", + ], + "Product Information": [ + "Product features and specifications", + "Product availability", + "Compatibility questions", + "Warranty information", + ], + "Service Information": [ + "Service plans and pricing", + "Service availability", + "Service limitations", + "Upgrade options", + ], + "Support and Assistance": [ + "Technical support", + "User guide request", + "Troubleshooting", + "Service setup assistance", + ], + "Feedback and Complaints": [ + "Submit feedback", + "Make a complaint", + "Service rating", + "Suggestion box", + ], + "Legal and Compliance": [ + "Privacy policy", + "Terms of service", + "Compliance query", + "Data security", + ], + "Emergency and Urgent Help": [ + "Reporting an urgent issue", + "Escalating a complaint", + "Immediate technical assistance", + ], + "Schedule Appointments": [ + "Book a service appointment", + "Reschedule a meeting", + "Appointment cancellations", + ], + "Promotions and Offers": [ + "Inquire about current promotions", + "Eligibility for special offers", + "Redeem an offer", + ], + "Loyalty Program": [ + "Loyalty program details", + "Points and rewards information", + "Membership benefits", + ], + "Accessibility Services": [ + "Request for special assistance", + "Accessibility features", + "Support for disabled users", + ], + "Contact Information": [ + "Request for contact details", + "Office locations", + "Opening hours", + ], +} diff --git a/geniusrise_text/nli/intents/diet.py b/geniusrise_text/nli/intents/diet.py new file 
mode 100644 index 0000000..618c292 --- /dev/null +++ b/geniusrise_text/nli/intents/diet.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Personalized Diet Plans": [ + "Creating a personalized diet plan", + "Dietary recommendations for specific goals", + "Adjusting diet plans for health conditions", + "Diet plans for weight loss or gain", + ], + "Nutritional Information": [ + "Nutrient content of foods", + "Understanding macronutrients and micronutrients", + "Reading and interpreting food labels", + "Information on vitamins and supplements", + ], + "Healthy Eating Habits": [ + "Tips for healthy eating", + "Meal planning and preparation", + "Healthy snack options", + "Balancing meals for optimal nutrition", + ], + "Dietary Restrictions and Allergies": [ + "Managing food allergies and intolerances", + "Gluten-free, dairy-free, and other special diets", + "Vegetarian and vegan dietary guidance", + "Navigating dietary restrictions", + ], + "Weight Management": [ + "Strategies for healthy weight management", + "Balancing calories and activity", + "Dealing with weight loss plateaus", + "Sustainable weight loss methods", + ], + "Sports Nutrition": [ + "Nutrition for athletes and active individuals", + "Pre- and post-workout meals", + "Hydration and sports performance", + "Supplements for athletic performance", + ], + "Family and Child Nutrition": [ + "Nutritional needs for children", + 
"Healthy eating for families", + "Dealing with picky eaters", + "Planning nutritious meals for kids", + ], + "Medical Nutrition Therapy": [ + "Dietary management of chronic diseases", + "Nutrition advice for specific medical conditions", + "Interactions between diet and medications", + "Post-surgical nutrition and recovery", + ], + "Nutrition Education and Resources": [ + "Educational materials on nutrition", + "Workshops and seminars on healthy eating", + "Online resources and tools", + "Nutrition courses and certifications", + ], + "Diet Trends and Fads": [ + "Information on current diet trends", + "Evaluating the effectiveness of fad diets", + "Risks and benefits of popular diets", + "Science-based perspective on diet trends", + ], + "Eating Disorders and Counseling": [ + "Support for eating disorders", + "Nutritional counseling for recovery", + "Body image and mental health", + "Referrals to eating disorder specialists", + ], + "Food and Cooking": [ + "Healthy cooking techniques", + "Recipe modification for health", + "Cooking for specific dietary needs", + "Meal ideas and recipe suggestions", + ], +} diff --git a/geniusrise_text/nli/intents/ecommerce.py b/geniusrise_text/nli/intents/ecommerce.py new file mode 100644 index 0000000..97b09fb --- /dev/null +++ b/geniusrise_text/nli/intents/ecommerce.py @@ -0,0 +1,83 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Product Queries": [ + "Find product information", + "Check product availability", + "Compare products", + "Product specifications", + ], + "Order Management": [ + "Place an order", + "Cancel an order", + "Modify an order", + "Order status inquiry", + ], + "Shipping and Delivery": [ + "Track shipment", + "Change shipping address", + "Shipping options", + "Delivery time estimation", + ], + "Returns and Exchanges": [ + "Return policy", + "Start a return", + "Exchange an item", + "Refund status", + ], + "Payment and Pricing": [ + "Payment methods", + "Apply a promo code", + "Pricing inquiry", + "Payment issues", + ], + "Account and Profile Management": [ + "Create an account", + "Update account information", + "Reset password", + "Manage payment options", + ], + "Customer Support": [ + "Contact customer service", + "Feedback and complaints", + "Get technical support", + "Store locations and hours", + ], + "Recommendations and Offers": [ + "Product recommendations", + "Current deals and promotions", + "Gift card inquiries", + "Loyalty program information", + ], + "Size and Fit Guidance": [ + "Size charts", + "Product fitting", + "Material and care", + "Size recommendations", + ], + "Security and Privacy": [ + "Account security", + "Privacy policy", + "Data protection", + "Report a security issue", + ], + "Accessibility Services": [ + "Accessibility features", + "Request assistance", + "Navigating the website", + "Using the app", + ], +} diff --git a/geniusrise_text/nli/intents/education.py b/geniusrise_text/nli/intents/education.py new file mode 100644 index 0000000..5c3011a --- /dev/null +++ b/geniusrise_text/nli/intents/education.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Course Information": [ + "Course offerings", + "Course schedule and timings", + "Credit and grading system", + "Prerequisites for courses", + ], + "Enrollment and Registration": [ + "Enrollment process", + "Class registration", + "Waitlist inquiries", + "Dropping a course", + ], + "Academic Guidance": [ + "Major and minor selection", + "Academic advising", + "Study abroad options", + "Internship opportunities", + ], + "Examinations and Assessments": [ + "Exam schedule", + "Assignment submissions", + "Grading queries", + "Exam preparation resources", + ], + "Tuition and Financial Aid": [ + "Tuition fees", + "Scholarships and grants", + "Student loan information", + "Payment plans", + ], + "Student Services": [ + "Counseling services", + "Career center", + "Health services", + "Student clubs and organizations", + ], + "Technical Support": [ + "Learning management system help", + "Technical issues with online resources", + "Accessing digital libraries", + "Software and hardware resources", + ], + "Faculty and Staff Information": [ + "Contacting faculty", + "Office hours", + "Research and publications", + "Staff directory", + ], + "Campus Facilities": [ + "Campus amenities", + "Library services", + "Sports facilities", + "Accommodation and housing", + ], + "Policies and Regulations": [ + "Academic integrity policy", + "Attendance rules", + "Student conduct code", + "Complaint and grievance procedures", + ], + "Alumni Services": [ + "Alumni events and reunions", + "Networking opportunities", + "Continuing education", + "Alumni benefits and 
services", + ], + "Remote and Online Learning": [ + "E-learning resources", + "Remote class participation", + "Online course access", + "Blended learning support", + ], +} diff --git a/geniusrise_text/nli/intents/employee_helpdesk.py b/geniusrise_text/nli/intents/employee_helpdesk.py new file mode 100644 index 0000000..43587d0 --- /dev/null +++ b/geniusrise_text/nli/intents/employee_helpdesk.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "IT Support": [ + "Computer or hardware issues", + "Software installation and updates", + "Network connectivity problems", + "Access to digital tools and resources", + ], + "HR Inquiries": [ + "Leave policy and requests", + "Benefits and compensation queries", + "Employee wellness programs", + "Performance review process", + ], + "Facilities Management": [ + "Workspace maintenance requests", + "Meeting room bookings", + "Parking and transportation services", + "Health and safety concerns", + ], + "Finance and Expense": [ + "Expense report submission", + "Payroll inquiries", + "Budget allocation questions", + "Procurement process", + ], + "Training and Development": [ + "Professional development opportunities", + "Training program schedules", + "Certification and learning resources", + "Mentorship and coaching programs", + ], + "Project Management": [ + "Project collaboration tools", + "Deadline extensions and modifications", + "Resource allocation", + "Project status updates", + ], + "Travel and Accommodation": [ + "Business travel arrangements", + "Travel policy and reimbursements", + "Accommodation bookings", + "Visa and travel documentation", + ], + "Legal and Compliance": [ + "Contract review requests", + "Data privacy and security policies", + "Compliance training and certifications", + "Legal consultation and support", + ], + "Communications and Collaboration": [ + "Internal communication platforms", + "Collaboration tools and access", + "Team meeting coordination", + "Cross-departmental initiatives", + ], + "Employee Feedback and Suggestions": [ + "Employee satisfaction surveys", + "Feedback submission channels", + "Suggestion box for improvements", + "Employee engagement activities", + ], + "Onboarding and Offboarding": [ + "New employee onboarding process", + "Offboarding procedures", + "Orientation schedules", + "Transition support", + ], + "Administrative Assistance": [ + "Document and record-keeping", + "Scheduling and calendar 
management", + "Courier and mailing services", + "Administrative support requests", + ], +} diff --git a/geniusrise_text/nli/intents/environment.py b/geniusrise_text/nli/intents/environment.py new file mode 100644 index 0000000..7f2729d --- /dev/null +++ b/geniusrise_text/nli/intents/environment.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Recycling and Waste Management": [ + "Recycling guidelines", + "Waste disposal facilities", + "Hazardous waste handling", + "Composting techniques", + ], + "Conservation Efforts": [ + "Wildlife conservation initiatives", + "Endangered species information", + "Habitat protection programs", + "Volunteer opportunities for conservation", + ], + "Sustainable Living": [ + "Sustainable lifestyle tips", + "Eco-friendly products", + "Energy-saving practices", + "Sustainable transportation options", + ], + "Climate Change Information": [ + "Climate change impacts", + "Carbon footprint reduction", + "Renewable energy sources", + "Global warming prevention measures", + ], + "Environmental Policies": [ + "Government environmental regulations", + "Policy advocacy and lobbying", + "Community environmental projects", + "Environmental law and compliance", + ], + "Water Conservation": [ + "Water saving techniques", + "Drought-resistant landscaping", + "Water pollution prevention", + "Rainwater harvesting methods", + ], + "Air Quality": [ + "Air pollution 
monitoring", + "Health impacts of poor air quality", + "Emission control measures", + "Indoor air quality improvement", + ], + "Biodiversity": [ + "Ecosystem diversity importance", + "Threats to biodiversity", + "Biodiversity in urban areas", + "Promoting biodiversity in gardens", + ], + "Environmental Education": [ + "Environmental courses and programs", + "Educational materials on sustainability", + "Schools and universities with environmental focus", + "Workshops and seminars on environmental topics", + ], + "Green Spaces and Urban Planning": [ + "Development of green spaces", + "Urban planning for sustainability", + "Community gardens", + "Green building practices", + ], + "Environmental Research": [ + "Current environmental studies", + "Scientific research on ecosystems", + "Funding for environmental research", + "Participating in research projects", + ], + "Eco-Tourism": [ + "Eco-friendly travel destinations", + "Impact of tourism on environment", + "Eco-tourism activities", + "Supporting conservation through tourism", + ], + "Renewable Energy": [ + "Solar, wind, and hydro power", + "Renewable energy incentives", + "Installing renewable energy systems", + "Renewable energy in community planning", + ], +} diff --git a/geniusrise_text/nli/intents/esoteric.py b/geniusrise_text/nli/intents/esoteric.py new file mode 100644 index 0000000..15df12d --- /dev/null +++ b/geniusrise_text/nli/intents/esoteric.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Astrological Readings": [ + "Personal birth chart analysis", + "Daily, weekly, and monthly horoscopes", + "Relationship and compatibility readings", + "Career and financial astrology", + ], + "Tarot and Divination": [ + "Tarot card readings", + "I Ching consultations", + "Runes and other divination methods", + "Psychic and intuitive readings", + ], + "Spiritual Guidance and Counseling": [ + "Spiritual life coaching", + "Energy healing sessions", + "Meditation and mindfulness guidance", + "Past life regressions and soul journeys", + ], + "Workshops and Classes": [ + "Astrology and tarot classes", + "Workshops on spiritual development", + "Esoteric practices training", + "Online courses and webinars", + ], + "Numerology Services": [ + "Numerology chart readings", + "Name and date analysis", + "Life path and destiny numbers", + "Business and personal numerology", + ], + "Metaphysical Products": [ + "Crystals and gemstones", + "Energy cleansing tools", + "Metaphysical books and resources", + "Customized talismans and amulets", + ], + "Feng Shui and Vaastu": [ + "Feng Shui consultations for home and office", + "Vaastu Shastra advice", + "Energy balancing and space clearing", + "Geomancy and environmental harmony", + ], + "Aura and Chakra Services": [ + "Aura readings and photography", + "Chakra balancing and alignment", + "Energy field assessments", + "Color therapy and healing", + ], + "Event and Date Planning": [ + "Astrology for selecting auspicious dates", + "Event planning based on astrological timings", + "Wedding and celebration date selection", + "Business launch and contract signing dates", + ], + "Dream Interpretation": [ + "Analysis of dreams and symbols", + "Guidance on recurring dreams", + "Connecting dreams to personal life", + "Lucid dreaming techniques", + ], + "Occult and Mystical Practices": [ + "Occult philosophy and teachings", + 
"Mystical and magical practices", + "Secret societies and traditions", + "Ancient and sacred texts", + ], + "Personal Development and Growth": [ + "Using astrology for self-improvement", + "Esoteric approaches to personal growth", + "Overcoming spiritual blocks", + "Integrating spirituality into daily life", + ], +} diff --git a/geniusrise_text/nli/intents/events.py b/geniusrise_text/nli/intents/events.py new file mode 100644 index 0000000..07beddb --- /dev/null +++ b/geniusrise_text/nli/intents/events.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Event Information": [ + "Upcoming events schedule", + "Event timings and duration", + "Age restrictions and advisories", + "Special event announcements", + ], + "Ticketing and Reservations": [ + "Ticket availability and booking", + "Group ticket reservations", + "Ticket cancellation and refunds", + "VIP and premium seating inquiries", + ], + "Venue Information": [ + "Venue directions and parking", + "Seating arrangements", + "Accessibility services at venue", + "Venue rental information", + ], + "Performance Details": [ + "Artist or performer line-up", + "Performance setlist", + "Meet and greet opportunities", + "Special performances and acts", + ], + "Event Planning and Coordination": [ + "Event planning services", + "Catering and banquet services", + "Audio/visual equipment setup", + "Decor and theme setup", + ], + "Promotions and Discounts": [ + "Current promotions and offers", + "Discounts for groups or members", + "Loyalty program benefits", + "Early bird special offers", + ], + "Safety and Regulations": [ + "Event safety measures", + "Prohibited items and policies", + "Emergency procedures", + "COVID-19 related guidelines", + ], + "Private and Corporate Events": [ + "Corporate event planning", + "Private party arrangements", + "Team building event queries", + "Custom event packages", + ], + "Cultural and Community Events": [ + "Cultural festival information", + "Community gatherings and events", + "Local heritage events", + "Charity and fundraising events", + ], + "Feedback and Complaints": [ + "Submit feedback on an event", + "Report an issue or complaint", + "Service quality feedback", + "Suggestions for improvement", + ], + "Event Merchandise": [ + "Merchandise availability", + "Souvenir and gift options", + "Custom merchandise orders", + "Merchandise pickup and delivery", + ], + "Special Requests and Services": [ + "Special guest accommodations", + "Birthday and anniversary services", + "Special dietary requirements", + "Photography and 
recording services", + ], +} diff --git a/geniusrise_text/nli/intents/fitness.py b/geniusrise_text/nli/intents/fitness.py new file mode 100644 index 0000000..7227c06 --- /dev/null +++ b/geniusrise_text/nli/intents/fitness.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Membership Information": [ + "Joining and membership options", + "Membership benefits and perks", + "Membership renewal and cancellation", + "Family and group membership plans", + ], + "Facilities and Equipment": [ + "Available equipment and facilities", + "Operating hours", + "Maintenance and cleanliness", + "Accessibility and special accommodations", + ], + "Fitness Programs and Classes": [ + "Group fitness class schedule", + "Personal training services", + "Specialized fitness programs", + "Online and virtual fitness options", + ], + "Sports Activities and Coaching": [ + "Sports team registration", + "Coaching and training sessions", + "Youth sports programs", + "Tournament and competition information", + ], + "Health and Wellness": [ + "Nutrition and diet consultation", + "Wellness and health check-ups", + "Recovery and rehabilitation services", + "Mindfulness and yoga classes", + ], + "Safety and Hygiene Protocols": [ + "COVID-19 safety measures", + "First aid and emergency procedures", + "Hygiene and sanitation practices", + "Safety guidelines for equipment use", + ], + "Event and Social Activities": [ + "Club 
events and social gatherings", + "Member meetups and networking", + "Special events and guest appearances", + "Community outreach and charity events", + ], + "Feedback and Suggestions": [ + "Member feedback and reviews", + "Suggestion box for improvements", + "Complaint resolution process", + "Success stories and testimonials", + ], + "Pricing and Payment Options": [ + "Membership fees and pricing", + "Payment plans and methods", + "Discounts and promotional offers", + "Billing and invoice inquiries", + ], + "Youth and Family Programs": [ + "Family-friendly fitness activities", + "Children's sports and fitness classes", + "Parent-child fitness sessions", + "Youth development programs", + ], + "Apparel and Merchandise": [ + "Club-branded apparel and gear", + "Merchandise purchase and availability", + "Equipment rental services", + "Special sales and merchandise offers", + ], + "Facility Rentals and Private Events": [ + "Private event hosting", + "Facility rental options and rates", + "Organizing sports events", + "Corporate and group event services", + ], +} diff --git a/geniusrise_text/nli/intents/food_ordering.py b/geniusrise_text/nli/intents/food_ordering.py new file mode 100644 index 0000000..35b57f6 --- /dev/null +++ b/geniusrise_text/nli/intents/food_ordering.py @@ -0,0 +1,83 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Menu Inquiry": [ + "View menu items", + "Specialty dishes", + "Vegan and vegetarian options", + "Allergen information", + ], + "Placing Orders": [ + "Order food for delivery", + "Order food for pickup", + "Add special instructions", + "Repeat last order", + ], + "Order Modification": [ + "Add items to order", + "Remove items from order", + "Change order time", + "Update delivery address", + ], + "Payment and Billing": [ + "Payment methods available", + "Apply discount code", + "Split bill options", + "Payment issue resolution", + ], + "Order Tracking": [ + "Track delivery status", + "Estimated time of arrival", + "Contact delivery driver", + "Report missing or wrong items", + ], + "Feedback and Ratings": [ + "Rate food quality", + "Leave feedback for service", + "Report an issue with order", + "Compliment a staff member", + ], + "Reservations and Bookings": [ + "Book a table", + "Reservation changes", + "Group booking inquiry", + "Reservation cancellation", + ], + "Promotions and Offers": [ + "Current promotions", + "Loyalty program details", + "First-time order discounts", + "Referral bonuses", + ], + "Health and Safety": [ + "COVID-19 measures", + "Food handling practices", + "Contactless delivery options", + "Hygiene certifications", + ], + "Customer Support": [ + "Get help with order", + "Contact customer service", + "Report a complaint", + "Locate a nearby outlet", + ], + "Catering Services": [ + "Catering options", + "Catering for events", + "Custom catering menu", + "Catering order modifications", + ], +} diff --git a/geniusrise_text/nli/intents/freelancing.py b/geniusrise_text/nli/intents/freelancing.py new file mode 100644 index 0000000..c0107aa --- /dev/null +++ b/geniusrise_text/nli/intents/freelancing.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Finding Freelance Work": [ + "Platforms for freelance job listings", + "Tips for securing freelance projects", + "Freelancing opportunities in specific industries", + "Building a freelance portfolio", + ], + "Gig Economy Platforms": [ + "Signing up for gig economy apps", + "Comparing different gig platforms", + "Maximizing earnings in the gig economy", + "Gig work in various sectors like transport or delivery", + ], + "Freelancer Tools and Resources": [ + "Time management and productivity tools", + "Invoice and payment processing software", + "Project management and collaboration tools", + "Resources for market research and client acquisition", + ], + "Legal and Tax Advice": [ + "Contract and agreement guidance", + "Navigating taxes for freelancers", + "Setting up a freelance business", + "Legal resources for freelancers", + ], + "Pricing and Negotiation": [ + "Setting freelance rates and prices", + "Negotiating contracts and terms", + "Managing client expectations", + "Tips for effective pricing strategies", + ], + "Networking and Community": [ + "Freelancer networking events", + "Online communities for gig workers", + "Mentorship and support groups", + "Collaborations and partnerships", + ], + "Work-Life Balance": [ + "Managing work-life balance as a freelancer", + "Dealing with burnout", + "Time off and vacation for freelancers", + "Setting boundaries with clients", + ], + "Skill Development and Training": [ + "Courses and training for freelancers", + "Upgrading skills for the gig economy", + "Certifications and qualifications", + 
"Learning new tools and technologies", + ], + "Client Management": [ + "Maintaining client relationships", + "Dealing with difficult clients", + "Client communication best practices", + "Feedback and review management", + ], + "Insurance and Benefits": [ + "Insurance options for freelancers", + "Retirement planning for gig workers", + "Health insurance and benefits", + "Risk management in freelancing", + ], + "Remote Work and Digital Nomadism": [ + "Tips for remote and nomadic working", + "Tools for digital nomads", + "Best locations for remote working", + "Balancing travel and work", + ], + "Financial Planning and Security": [ + "Financial planning for freelancers", + "Emergency funds and savings", + "Investment strategies for gig workers", + "Income diversification", + ], +} diff --git a/geniusrise_text/nli/intents/gardening.py b/geniusrise_text/nli/intents/gardening.py new file mode 100644 index 0000000..f298f91 --- /dev/null +++ b/geniusrise_text/nli/intents/gardening.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Plant Care and Maintenance": [ + "Watering and feeding schedules", + "Pruning and trimming techniques", + "Pest and disease management", + "Seasonal plant care tips", + ], + "Garden Design and Landscaping": [ + "Landscape design ideas", + "Choosing plants for garden themes", + "Hardscaping and garden structures", + "Sustainable and eco-friendly gardening", + ], + "Indoor and Urban Gardening": [ + "Houseplant care and selection", + "Setting up an indoor garden", + "Balcony and rooftop gardening", + "Container and small space gardening", + ], + "Soil and Fertilization": [ + "Soil testing and improvement", + "Organic and inorganic fertilizers", + "Composting methods", + "Mulching techniques", + ], + "Irrigation and Watering Systems": [ + "Irrigation system installation", + "Water conservation in gardening", + "Drip irrigation and soaker hoses", + "Automated watering solutions", + ], + "Vegetable and Herb Gardening": [ + "Growing vegetables at home", + "Herb garden setup and care", + "Edible plants for gardens", + "Seasonal vegetable gardening", + ], + "Flower Gardening": [ + "Annual and perennial flowers", + "Bulb planting and care", + "Flower garden design", + "Attracting pollinators", + ], + "Trees and Shrubs Care": [ + "Tree planting and care", + "Shrub selection and pruning", + "Fruit tree maintenance", + "Hedge and border plants", + ], + "Garden Pests and Diseases": [ + "Identifying and controlling pests", + "Dealing with plant diseases", + "Organic pest control methods", + "Beneficial insects and natural predators", + ], + "Workshops and Educational Programs": [ + "Gardening workshops and classes", + "Horticulture training programs", + "Gardening for beginners", + "Specialized gardening techniques", + ], + "Tools and Equipment": [ + "Gardening tool recommendations", + "Equipment maintenance and repair", + "New gardening technologies", + "Renting vs. 
buying equipment", + ], + "Sustainability and Eco-Gardening": [ + "Creating a wildlife-friendly garden", + "Native plants and biodiversity", + "Rain gardens and stormwater management", + "Organic gardening practices", + ], + "Consultation and Professional Services": [ + "Landscape consultation", + "Garden planning and design services", + "Professional gardening services", + "Horticultural therapy", + ], +} diff --git a/geniusrise_text/nli/intents/government.py b/geniusrise_text/nli/intents/government.py new file mode 100644 index 0000000..604d36b --- /dev/null +++ b/geniusrise_text/nli/intents/government.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Public Services and Utilities": [ + "Waste collection schedule", + "Water and utility services", + "Public transportation information", + "Street maintenance and cleaning", + ], + "Permits and Licensing": [ + "Building and construction permits", + "Business licensing information", + "Parking permits", + "Event and public gathering permits", + ], + "Local Policies and Regulations": [ + "Zoning laws and regulations", + "Noise ordinances", + "Pet and animal regulations", + "Environmental and health codes", + ], + "Community Programs and Events": [ + "Community development programs", + "Local events and festivals", + "Recreational activities and classes", + "Volunteer opportunities", + ], + "Health and Safety": [ + "Public health resources", + "Emergency preparedness", + "Safety campaigns and initiatives", + "First responders and emergency services", + ], + "Voting and Elections": [ + "Voter registration information", + "Election dates and polling places", + "Candidate and ballot information", + "Election results and updates", + ], + "Housing and Property": [ + "Affordable housing programs", + "Property tax information", + "Homeowner resources", + "Landlord and tenant resources", + ], + "Public Works and Infrastructure": [ + "Infrastructure projects", + "Traffic and roadwork updates", + "Public facility improvements", + "Utility upgrades and installations", + ], + "Economic Development": [ + "Business development resources", + "Economic incentives and grants", + "Local job opportunities", + "Small business support", + ], + "Education and Schools": [ + "Local school information", + "Educational resources", + "School enrollment and policies", + "Youth and adult education programs", + ], + "Environmental Conservation": [ + "Conservation initiatives", + "Recycling and sustainability programs", + "Green space and park maintenance", + "Environmental impact assessments", + ], + "Public Feedback and Inquiries": [ + "Citizen feedback and suggestions", + 
"Public records requests", + "Complaint and issue reporting", + "Public information inquiries", + ], + "Social Services": [ + "Social welfare programs", + "Support for vulnerable populations", + "Community outreach services", + "Mental health and counseling services", + ], +} diff --git a/geniusrise_text/nli/intents/healthcare.py b/geniusrise_text/nli/intents/healthcare.py new file mode 100644 index 0000000..3806e75 --- /dev/null +++ b/geniusrise_text/nli/intents/healthcare.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Appointment Scheduling": [ + "Schedule a doctor's appointment", + "Change appointment date", + "Cancel an appointment", + "Specialist appointment request", + ], + "Medical Information and Advice": [ + "Symptom check", + "Medication inquiries", + "Treatment options", + "Health and wellness advice", + ], + "Billing and Insurance": [ + "Billing inquiries", + "Insurance coverage questions", + "Payment plans and options", + "Understanding medical charges", + ], + "Prescriptions and Refills": [ + "Request prescription refill", + "Prescription transfer", + "Medication side effects", + "Over-the-counter alternatives", + ], + "Test Results and Reports": [ + "Lab test results", + "Radiology report inquiries", + "Understanding test reports", + "Follow-up on pending results", + ], + "Emergency Services": [ + "Emergency care information", + "Urgent care wait times", + "Reporting a medical emergency", + "After-hours medical assistance", + ], + "Patient Records and Documentation": [ + "Accessing medical records", + "Updating personal information", + "Release of medical information", + "Privacy and confidentiality concerns", + ], + "Healthcare Facilities Information": [ + "Hospital services and facilities", + "Clinic locations and hours", + "Specialty care centers", + "Accessibility services", + ], + "Preventive Care and Screenings": [ + "Routine check-up schedule", + "Vaccination information", + "Health screening programs", + "Preventive health measures", + ], + "Telehealth and Online Consultations": [ + "Setting up a telehealth visit", + "Technical support for virtual visits", + "Eligibility for online consultations", + "Follow-up after a virtual appointment", + ], + "Patient Support Services": [ + "Support groups", + "Patient advocacy services", + "Nutrition and lifestyle counseling", + "Mental health resources", + ], + "Hospital Admission and Discharge": [ + "Admission procedures", + "Discharge process", + "Post-discharge care", + "Hospital stay information", 
+ ], +} diff --git a/geniusrise_text/nli/intents/hobby.py b/geniusrise_text/nli/intents/hobby.py new file mode 100644 index 0000000..65d4ddd --- /dev/null +++ b/geniusrise_text/nli/intents/hobby.py @@ -0,0 +1,107 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Arts and Crafts": [ + "Art classes and workshops", + "Crafting techniques and tips", + "Materials and supplies sourcing", + "Exhibitions and art shows", + ], + "Music and Performing Arts": [ + "Music lessons and tutorials", + "Local concerts and performances", + "Instrument repair and maintenance", + "Community theater groups", + ], + "Gardening and Horticulture": [ + "Gardening advice and tips", + "Plant care and cultivation", + "Landscaping ideas", + "Gardening clubs and societies", + ], + "Sports and Fitness": [ + "Local sports teams", + "Fitness training groups", + "Outdoor adventure activities", + "Sports events and competitions", + ], + "Technology and Gadgets": [ + "Tech workshops and meetups", + "Latest gadgets and innovations", + "DIY tech projects", + "Technology fairs and expos", + ], + "Cooking and Culinary Arts": [ + "Cooking classes", + "Recipe exchange", + "Wine and food tasting events", + "Culinary tours and experiences", + ], + "Photography and Videography": [ + "Photography courses", + "Photo walks and excursions", + "Camera and equipment advice", + "Photography exhibitions", + ], + "Reading and Literature": [ + "Book clubs 
and reading groups", + "Author readings and signings", + "Literary discussions", + "Writing workshops", + ], + "Travel and Exploration": [ + "Travel planning meetups", + "Cultural exchange programs", + "Adventure travel groups", + "Travel storytelling sessions", + ], + "Fashion and Beauty": [ + "Fashion design workshops", + "Makeup and beauty tutorials", + "Fashion shows and events", + "Beauty product swaps", + ], + "Board Games and Puzzles": [ + "Board game nights", + "Puzzle competitions", + "Strategy gaming groups", + "Game development workshops", + ], + "Collectibles and Memorabilia": [ + "Collector meetups", + "Trade and exchange events", + "Preservation and display tips", + "Collectors' fairs and exhibitions", + ], + "Animal and Pet Enthusiasts": [ + "Pet care meetups", + "Animal rescue volunteer groups", + "Wildlife appreciation events", + "Pet training and behavior sessions", + ], + "Astronomy and Space Exploration": [ + "Stargazing events", + "Astronomy clubs", + "Space science workshops", + "Observatory visits", + ], + "Environmental and Sustainability": [ + "Eco-friendly living discussions", + "Sustainability projects", + "Environmental activism groups", + "Conservation volunteering", + ], +} diff --git a/geniusrise_text/nli/intents/library.py b/geniusrise_text/nli/intents/library.py new file mode 100644 index 0000000..cfe33fb --- /dev/null +++ b/geniusrise_text/nli/intents/library.py @@ -0,0 +1,83 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Book and Resource Inquiries": [ + "Book availability", + "New arrivals and releases", + "Location of specific genres or topics", + "E-book and audiobook options", + ], + "Account Management": [ + "Library card registration", + "Account login issues", + "Renewing library memberships", + "Updating personal information", + ], + "Borrowing and Returns": [ + "Borrowing procedures", + "Renewal of borrowed items", + "Return deadlines", + "Fines and fees for late returns", + ], + "Research Assistance": [ + "Reference material searches", + "Assistance with research projects", + "Accessing academic journals", + "Citation and bibliography help", + ], + "Event and Program Information": [ + "Library events schedule", + "Reading groups and clubs", + "Educational programs and workshops", + "Children's storytime sessions", + ], + "Technology and Internet Access": [ + "Computer and internet use", + "Printing and photocopying services", + "Accessing online databases", + "Wi-Fi connectivity issues", + ], + "Facilities and Accessibility": [ + "Library opening hours", + "Accessibility services", + "Study room bookings", + "Facility amenities and layout", + ], + "Donations and Volunteering": [ + "Book and resource donations", + "Volunteer opportunities", + "Fundraising events", + "Supporting the library", + ], + "Special Collections and Archives": [ + "Accessing special collections", + "Historical records and archives", + "Rare books and manuscripts", + "Preservation and conservation queries", + ], + "Community Outreach": [ + "Library outreach programs", + "Partnerships with schools and organizations", + "Community resource sharing", + "Public information services", + ], + "Feedback and Suggestions": [ + "Providing feedback on services", + "Book and resource recommendations", + "Suggesting improvements", + "Complaint resolution", + ], +} diff --git 
a/geniusrise_text/nli/intents/news.py b/geniusrise_text/nli/intents/news.py new file mode 100644 index 0000000..cd51363 --- /dev/null +++ b/geniusrise_text/nli/intents/news.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Breaking News": [ + "Latest headlines", + "Breaking news updates", + "Live news coverage", + "Emergency news alerts", + ], + "Local News": [ + "Community news", + "Local government updates", + "Regional events and festivals", + "Local weather reports", + ], + "International News": [ + "Global news updates", + "International politics", + "World economic news", + "Overseas conflicts and crises", + ], + "Sports News": [ + "Sports scores and results", + "Upcoming sports events", + "Team and athlete news", + "Sports analysis and commentary", + ], + "Entertainment News": [ + "Celebrity news and gossip", + "Movie and TV show updates", + "Music and concert news", + "Arts and culture events", + ], + "Business and Finance": [ + "Stock market updates", + "Business news", + "Economic reports", + "Industry trends", + ], + "Science and Technology": [ + "Latest technology news", + "Scientific discoveries", + "Tech product launches", + "Research and development news", + ], + "Health and Wellness": [ + "Health news", + "Medical breakthroughs", + "Wellness tips", + "Public health updates", + ], + "Travel and Lifestyle": [ + "Travel news", + "Destination highlights", + "Lifestyle 
trends", + "Cultural insights", + ], + "Editorial and Opinion": [ + "Editorial pieces", + "Opinion columns", + "Expert analysis", + "Reader letters and comments", + ], + "Weather Forecast": [ + "Daily weather updates", + "Severe weather alerts", + "Long-term forecasts", + "Climate change news", + ], + "Special Reports": [ + "In-depth investigations", + "Documentaries", + "Special coverage topics", + "Feature stories", + ], + "User Queries and Requests": [ + "News source inquiries", + "Article requests", + "News archive access", + "Subscription and access issues", + ], +} diff --git a/geniusrise_text/nli/intents/non_profit.py b/geniusrise_text/nli/intents/non_profit.py new file mode 100644 index 0000000..8d57873 --- /dev/null +++ b/geniusrise_text/nli/intents/non_profit.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Donation Information": [ + "How to donate", + "Types of donations accepted", + "Donation tax receipts", + "Recurring donation setups", + ], + "Volunteer Opportunities": [ + "Current volunteer needs", + "Volunteer registration process", + "Youth and group volunteering", + "Volunteer training and orientation", + ], + "Programs and Services": [ + "Community support programs", + "Educational and outreach initiatives", + "Services for vulnerable populations", + "Environmental and sustainability projects", + ], + "Event and Fundraising": [ + "Upcoming charity events", + "Fundraising campaign information", + "Participation and sponsorship", + "Event volunteer roles", + ], + "Advocacy and Awareness": [ + "Current advocacy campaigns", + "Raising awareness for causes", + "Community and political engagement", + "Educational resources and materials", + ], + "Partnerships and Collaborations": [ + "Partnership opportunities", + "Corporate and business collaborations", + "Community and government partnerships", + "International collaboration initiatives", + ], + "Membership and Support": [ + "Becoming a member", + "Membership benefits", + "Supporter and donor acknowledgment", + "Membership renewal and updates", + ], + "Resource and Aid Distribution": [ + "Aid and relief services", + "Resource distribution programs", + "Emergency and disaster response", + "Support for underserved areas", + ], + "Research and Development": [ + "Research projects and findings", + "Development of new solutions", + "Collaborative research opportunities", + "Grant and funding information for research", + ], + "Outreach and Education": [ + "Community outreach programs", + "Educational workshops and seminars", + "School and youth engagement", + "Public education campaigns", + ], + "Feedback and Inquiries": [ + "Providing feedback and suggestions", + "Inquiry responses and information", + "Handling complaints and concerns", + "Testimonials and stories", + ], + "Legal and Compliance": [ + 
"Regulatory and legal compliance", + "Non-profit governance", + "Ethical guidelines and standards", + "Transparency and accountability reports", + ], +} diff --git a/geniusrise_text/nli/intents/performing_arts.py b/geniusrise_text/nli/intents/performing_arts.py new file mode 100644 index 0000000..33124ee --- /dev/null +++ b/geniusrise_text/nli/intents/performing_arts.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Event Schedules and Bookings": [ + "Concert and performance schedules", + "Ticket booking and reservations", + "Event cancellations and rescheduling", + "Group and VIP booking inquiries", + ], + "Artist and Performer Information": [ + "Biographies of musicians and performers", + "Upcoming shows of specific artists", + "Meet and greet events", + "Artist merchandise and recordings", + ], + "Classes and Workshops": [ + "Music lessons and workshops", + "Dance class schedules", + "Theater and acting workshops", + "Instrument training programs", + ], + "Venue Information": [ + "Venue facilities and seating", + "Accessibility and special accommodations", + "Parking and transportation options", + "Venue rental for events", + ], + "Community and Youth Programs": [ + "Youth orchestras and choirs", + "Community theater productions", + "Educational outreach programs", + "Volunteer opportunities in the arts", + ], + "Auditions and Casting Calls": [ + "Audition schedules and 
requirements", + "Casting calls for performances", + "Audition preparation tips", + "Performance opportunities for amateurs", + ], + "Festivals and Special Events": [ + "Music and arts festival information", + "Cultural and heritage events", + "Special holiday performances", + "Outdoor concerts and events", + ], + "Grants and Funding": [ + "Funding for arts projects", + "Scholarships for performers", + "Grant application procedures", + "Sponsorship opportunities", + ], + "Technical and Production Support": [ + "Stage and lighting setup", + "Sound and audio equipment", + "Backstage and crew information", + "Production and technical training", + ], + "Promotion and Marketing": [ + "Promoting arts events", + "Marketing services for performers", + "Social media promotion", + "Advertising in performing arts", + ], + "Reviews and Critiques": [ + "Performance reviews and feedback", + "Critique sessions", + "Audience reviews and testimonials", + "Professional assessment and advice", + ], + "Collaborations and Partnerships": [ + "Collaborative projects", + "Partnerships with other arts organizations", + "Cross-disciplinary performances", + "Community and business partnerships", + ], + "Health and Wellness for Artists": [ + "Artist health and wellness programs", + "Performance anxiety and stress management", + "Physical therapy for performers", + "Mental health resources", + ], +} diff --git a/geniusrise_text/nli/intents/personal_finance.py b/geniusrise_text/nli/intents/personal_finance.py new file mode 100644 index 0000000..8bc480c --- /dev/null +++ b/geniusrise_text/nli/intents/personal_finance.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Budgeting and Expense Tracking": [ + "Setting up a personal budget", + "Tools for tracking expenses", + "Tips for reducing monthly expenses", + "Budgeting for irregular income", + ], + "Saving and Investment Strategies": [ + "Advice on saving money effectively", + "Different types of investment options", + "Risk assessment in investments", + "Long-term vs. short-term investments", + ], + "Debt Management and Reduction": [ + "Strategies for paying off debt", + "Consolidating multiple debts", + "Dealing with credit card debt", + "Negotiating debt settlements", + ], + "Credit Score and Reports": [ + "Understanding credit scores", + "Improving and repairing credit", + "Disputing errors in credit reports", + "Credit monitoring services", + ], + "Retirement Planning": [ + "Retirement savings plans", + "Calculating retirement needs", + "Pension plans and social security", + "Early retirement strategies", + ], + "Tax Planning and Filing": [ + "Tax saving tips", + "Filing taxes and deadlines", + "Tax deductions and credits", + "Handling tax disputes", + ], + "Insurance and Protection": [ + "Choosing the right insurance plans", + "Life, health, and property insurance", + "Insurance claim processes", + "Assessing insurance coverage needs", + ], + "Financial Emergencies and Safety Nets": [ + "Building an emergency fund", + "Handling financial crises", + "Insurance as a safety net", + "Preparing for unexpected expenses", + ], + "Home Buying and Mortgages": [ + "Saving for a home purchase", + "Understanding mortgage options", + "First-time homebuyer 
advice", + "Refinancing a mortgage", + ], + "Education and Student Finance": [ + "Saving for education", + "Student loans and grants", + "Budgeting for college students", + "Paying off student debt", + ], + "Family and Child Finance": [ + "Financial planning for families", + "Teaching kids about money", + "Budgeting for new parents", + "Saving for children's future", + ], + "Financial Literacy and Education": [ + "Basic financial education resources", + "Workshops and seminars on finance", + "Online courses in personal finance", + "Financial advice books and blogs", + ], + "Entrepreneurship and Business Finance": [ + "Financial planning for startups", + "Managing business finances", + "Funding and investment for businesses", + "Budgeting for small businesses", + ], +} diff --git a/geniusrise_text/nli/intents/pet_care.py b/geniusrise_text/nli/intents/pet_care.py new file mode 100644 index 0000000..e1fe7a6 --- /dev/null +++ b/geniusrise_text/nli/intents/pet_care.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Veterinary Services": [ + "Scheduling vet appointments", + "Emergency pet care", + "Vaccination schedules", + "Spaying and neutering information", + ], + "Pet Nutrition": [ + "Diet and nutrition advice", + "Special dietary needs", + "Recommended pet food brands", + "Homemade pet food recipes", + ], + "Grooming Services": [ + "Booking grooming appointments", + "Grooming service options", + "Self-grooming tips", + "Grooming for different breeds", + ], + "Training and Behavior": [ + "Puppy training classes", + "Behavioral issue consultation", + "Advanced training programs", + "Socialization tips for pets", + ], + "Pet Adoption and Fostering": [ + "Adoption process information", + "Available pets for adoption", + "Fostering program details", + "Post-adoption support", + ], + "Pet Supplies and Accessories": [ + "Pet supply recommendations", + "Pet accessory inquiries", + "Product safety information", + "Special offers on pet products", + ], + "Pet Boarding and Daycare": [ + "Boarding service options", + "Daycare facilities", + "Long-term stay arrangements", + "Special care services", + ], + "Health and Wellness": [ + "Pet health checkups", + "Dental care for pets", + "Wellness program information", + "Common health issues", + ], + "Insurance and Legal Services": [ + "Pet insurance options", + "Legal services for pet owners", + "Liability issues", + "Pet-related legal advice", + ], + "Emergency Preparedness": [ + "Emergency care for pets", + "First-aid for pets", + "Preparing pets for emergencies", + "Evacuation with pets", + ], + "Community and Social Events": [ + "Pet-friendly community events", + "Pet socialization meetups", + "Charity events for animals", + "Pet shows and competitions", + ], + "Travel and Accommodation": [ + "Traveling with pets", + "Pet-friendly accommodations", + "Pet travel regulations", + "Safety tips for traveling with pets", + ], + "End-of-Life Care": [ + "Euthanasia consultation", + "Grieving support", + "Memorial services 
for pets", + "Coping with pet loss", + ], +} diff --git a/geniusrise_text/nli/intents/public_safety.py b/geniusrise_text/nli/intents/public_safety.py new file mode 100644 index 0000000..277a3c7 --- /dev/null +++ b/geniusrise_text/nli/intents/public_safety.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Emergency Reporting": [ + "Report a crime in progress", + "Emergency medical assistance", + "Fire reporting", + "Traffic accidents", + ], + "Non-Emergency Assistance": [ + "Non-urgent police assistance", + "Medical inquiries", + "Reporting minor incidents", + "General public safety questions", + ], + "Preventive Measures and Awareness": [ + "Crime prevention tips", + "Public health advisories", + "Fire safety guidelines", + "Emergency preparedness information", + ], + "Traffic and Road Safety": [ + "Traffic condition inquiries", + "Road closure information", + "Vehicle safety checks", + "Reporting road hazards", + ], + "Community Policing": [ + "Neighborhood watch programs", + "Community policing initiatives", + "Safety workshops and seminars", + "Youth engagement programs", + ], + "Disaster Response and Relief": [ + "Natural disaster information", + "Disaster relief resources", + "Evacuation procedures", + "Recovery and aid efforts", + ], + "Environmental Hazards": [ + "Reporting pollution", + "Hazardous material spills", + "Wildlife encounters", + "Environmental health concerns", 
+ ], + "Search and Rescue Operations": [ + "Missing person reports", + "Search and rescue requests", + "Mountain or water rescue", + "Lost and found inquiries", + ], + "Regulatory Compliance": [ + "Building safety regulations", + "Public event permits", + "Health and safety inspections", + "Compliance reporting", + ], + "Victim Assistance": [ + "Victim support services", + "Trauma and counseling services", + "Legal assistance for victims", + "Resource centers for victims", + ], + "Public Feedback and Complaints": [ + "Submit feedback on services", + "File a complaint", + "Suggestions for improvement", + "Commendations for service", + ], + "Training and Education Programs": [ + "Safety training for the public", + "Educational resources for schools", + "First aid and CPR classes", + "Emergency responder training", + ], + "Resource Allocation and Management": [ + "Resource distribution in crises", + "Emergency service resource inquiries", + "Volunteer and aid coordination", + "Public service announcements", + ], +} diff --git a/geniusrise_text/nli/intents/real_estate.py b/geniusrise_text/nli/intents/real_estate.py new file mode 100644 index 0000000..11e6882 --- /dev/null +++ b/geniusrise_text/nli/intents/real_estate.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Buying and Selling Properties": [ + "Property listings and viewings", + "Real estate market trends", + "Guidance on buying a property", + "Selling and marketing a property", + ], + "Rental and Leasing Information": [ + "Available rental properties", + "Lease agreement details", + "Tenant application process", + "Short-term and vacation rentals", + ], + "Property Management Services": [ + "Property maintenance requests", + "Rent collection and billing", + "Handling tenant complaints", + "Contractor and vendor management", + ], + "Financing and Mortgage": [ + "Mortgage application and rates", + "Refinancing options", + "Home equity loans", + "First-time homebuyer programs", + ], + "Investment and Market Analysis": [ + "Real estate investment opportunities", + "Market analysis and trends", + "Property valuation", + "Return on investment calculations", + ], + "Legal and Regulatory Advice": [ + "Property law and regulations", + "Zoning and land use", + "Legal disputes and resolutions", + "Compliance and permits", + ], + "Construction and Development": [ + "New construction projects", + "Renovation and remodeling advice", + "Sustainable and eco-friendly building", + "Development planning and permits", + ], + "Inspection and Appraisal": [ + "Scheduling property inspections", + "Home appraisal services", + "Safety and compliance checks", + "Condition assessment reports", + ], + "Neighborhood and Community": [ + "Community amenities and features", + "Schools and education", + "Safety and crime rates", + "Local businesses and services", + ], + "Home Staging and Presentation": [ + "Home staging tips", + "Interior design consultation", + "Curb appeal enhancement", + "Photography and virtual tours", + ], + "Property Marketing and Advertising": [ + "Marketing strategies for listings", + "Advertising channels and tools", + "Online listings and virtual showings", + "Social media marketing for properties", + ], + "Relocation Services": [ + "Relocation assistance", 
+ "Moving and packing services", + "Settling-in services", + "International relocation advice", + ], + "Sustainability and Green Living": [ + "Eco-friendly housing options", + "Energy-efficient homes", + "Solar panels and renewable energy", + "Green building certifications", + ], +} diff --git a/geniusrise_text/nli/intents/religion.py b/geniusrise_text/nli/intents/religion.py new file mode 100644 index 0000000..df6189a --- /dev/null +++ b/geniusrise_text/nli/intents/religion.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Worship Services and Schedule": [ + "Service times and locations", + "Special religious ceremonies", + "Holiday and festival observances", + "Online and virtual worship options", + ], + "Spiritual Guidance and Counseling": [ + "Pastoral counseling sessions", + "Spiritual guidance inquiries", + "Meditation and prayer guidance", + "Dealing with spiritual crises", + ], + "Community Programs and Groups": [ + "Religious study groups", + "Youth and family ministries", + "Community outreach and volunteer opportunities", + "Interfaith activities and dialogues", + ], + "Educational Resources and Classes": [ + "Religious education classes", + "Spiritual workshops and retreats", + "Scripture study materials", + "Interfaith education programs", + ], + "Membership and Participation": [ + "Becoming a member", + "Volunteer roles and responsibilities", + "Participation in rituals and services", + "Membership contribution and donations", + ], + "Event Planning and Coordination": [ + "Weddings and religious ceremonies", + "Baptisms, confirmations, and other rites", + "Funeral and memorial services", + "Religious event planning and support", + ], + "Facilities and Accessibility": [ + "Facility rental and usage", + "Accessibility features", + "Parking and transportation", + "Childcare during services", + ], + "Counseling and Support Services": [ + "Grief and bereavement support", + "Marriage and family counseling", + "Addiction and recovery programs", + "Support for life transitions", + ], + "Charitable and Social Justice Work": [ + "Charity drives and donation opportunities", + "Social justice initiatives", + "Community service projects", + "Support for disadvantaged groups", + ], + "Health and Wellness": [ + "Health and wellness programs", + "Faith-based healing practices", + "Support for mental and physical health", + "Dietary and lifestyle guidance", + ], + "Art, Music, and Culture": [ + "Religious music and choir", + "Religious art and exhibitions", + "Cultural 
celebration events", + "Faith-based literature and media", + ], + "Prayer and Meditation": [ + "Prayer requests and support", + "Guided meditation sessions", + "Prayer groups and circles", + "Meditation and mindfulness resources", + ], + "Outreach and Mission Work": [ + "Mission trips and programs", + "Local and international outreach projects", + "Evangelism and community engagement", + "Partnerships with other organizations", + ], +} diff --git a/geniusrise_text/nli/intents/senior_care.py b/geniusrise_text/nli/intents/senior_care.py new file mode 100644 index 0000000..bc9de79 --- /dev/null +++ b/geniusrise_text/nli/intents/senior_care.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Healthcare Management": [ + "Medical appointment scheduling", + "Medication reminders and management", + "Chronic condition monitoring", + "Access to medical specialists", + ], + "Daily Living Assistance": [ + "Personal care and hygiene assistance", + "Meal preparation and diet planning", + "Mobility and transportation services", + "Housekeeping and maintenance help", + ], + "Social and Recreational Activities": [ + "Social event planning", + "Recreational activities and clubs", + "Exercise and wellness programs", + "Hobby and interest groups", + ], + "Safety and Emergency Services": [ + "Emergency response systems", + "Fall prevention and safety checks", + "24/7 monitoring services", + "Emergency contact arrangements", + ], + "Support and Counseling": [ + "Emotional support and counseling", + "Bereavement and grief support", + "Support groups for seniors", + "Family caregiver counseling", + ], + "Financial and Legal Assistance": [ + "Financial planning for seniors", + "Estate and legal planning", + "Insurance and benefits assistance", + "Fraud protection and prevention", + ], + "Respite and Caregiver Support": [ + "Short-term respite care options", + "Caregiver resources and support", + "Training and education for caregivers", + "Caregiver wellness and self-care", + ], + "Assisted Living and Housing": [ + "Assisted living facility information", + "Senior housing options", + "Accessibility and accommodation modifications", + "Transitioning to assisted living", + ], + "Nutrition and Diet": [ + "Nutritional counseling", + "Specialized diet plans", + "Dietary supplements and vitamins", + "Meal delivery services", + ], + "Technology and Communication": [ + "Technology training for seniors", + "Assistive communication devices", + "Online safety for seniors", + "Staying connected with family", + ], + "End-of-Life Care": [ + "Hospice care information", + "End-of-life planning", + "Palliative care services", + "Bereavement support services", + ], + 
"Community Engagement": [ + "Volunteer opportunities", + "Local community events", + "Senior advocacy and activism", + "Intergenerational programs", + ], +} diff --git a/geniusrise_text/nli/intents/social_services.py b/geniusrise_text/nli/intents/social_services.py new file mode 100644 index 0000000..2980a8e --- /dev/null +++ b/geniusrise_text/nli/intents/social_services.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Social Assistance Programs": [ + "Food assistance programs", + "Housing and shelter services", + "Utility assistance", + "Emergency financial aid", + ], + "Family and Childcare Services": [ + "Childcare support services", + "Family counseling", + "Parenting resources", + "Child welfare and protection", + ], + "Youth Programs and Services": [ + "After-school programs", + "Youth mentoring and coaching", + "Educational support for youth", + "Recreational activities for teenagers", + ], + "Senior Care Services": [ + "Elderly support programs", + "Senior community centers", + "Homecare assistance for seniors", + "Activities and events for older adults", + ], + "Health and Wellness Services": [ + "Mental health support", + "Substance abuse counseling", + "Health screening and clinics", + "Nutrition and fitness programs", + ], + "Employment and Training": [ + "Job search assistance", + "Career development workshops", + "Vocational training programs", + "Resume and interview 
preparation", + ], + "Disability Services": [ + "Accessibility resources", + "Support for persons with disabilities", + "Assistive technologies", + "Advocacy for disability rights", + ], + "Community Engagement and Volunteering": [ + "Volunteer opportunities", + "Community service projects", + "Civic engagement initiatives", + "Community groups and clubs", + ], + "Legal and Advocacy Services": [ + "Legal aid and counseling", + "Human rights advocacy", + "Immigration assistance", + "Consumer advocacy services", + ], + "Crisis and Emergency Services": [ + "Crisis intervention", + "Disaster relief and recovery", + "Emergency shelter", + "Hotline and support services", + ], + "Cultural and Diversity Programs": [ + "Cultural awareness initiatives", + "Diversity and inclusion programs", + "Multicultural events", + "Language and translation services", + ], + "Environmental and Sustainability Initiatives": [ + "Community recycling programs", + "Sustainability education", + "Green living workshops", + "Environmental conservation projects", + ], + "Financial Counseling": [ + "Debt management advice", + "Financial literacy programs", + "Budgeting and financial planning", + "Credit counseling", + ], +} diff --git a/geniusrise_text/nli/intents/sports_centers.py b/geniusrise_text/nli/intents/sports_centers.py new file mode 100644 index 0000000..65c29df --- /dev/null +++ b/geniusrise_text/nli/intents/sports_centers.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Facility Information": [ + "Types of sports facilities available", + "Operating hours and location", + "Equipment rental and usage", + "Accessibility and special accommodations", + ], + "Membership and Registration": [ + "Membership options and benefits", + "Registration process for activities", + "Membership renewal and cancellation", + "Family and group membership plans", + ], + "Programs and Activities": [ + "Fitness class schedules", + "Sports leagues and tournaments", + "Recreational activities for all ages", + "Seasonal programs and camps", + ], + "Personal Training and Coaching": [ + "Personal trainer services", + "Sport-specific coaching", + "Fitness assessment and goal setting", + "Group training sessions", + ], + "Event Hosting and Parties": [ + "Facility rental for events", + "Birthday party packages", + "Corporate events and team building", + "Sporting event organization", + ], + "Health and Wellness": [ + "Nutrition and diet consultation", + "Wellness workshops and seminars", + "Health screenings and assessments", + "Mind-body fitness programs", + ], + "Youth Sports and Activities": [ + "Youth sports programs", + "After-school activities", + "Summer camps for kids", + "Youth development initiatives", + ], + "Safety and Emergency Procedures": [ + "Safety guidelines and policies", + "Emergency response procedures", + "First aid and CPR training", + "Water safety and lifeguard services", + ], + "Special Offers and Promotions": [ + "Discounts and special rates", + "Referral and loyalty programs", + "Seasonal promotions", + "Gift cards and vouchers", + ], + "Community Engagement": [ + "Community outreach events", + "Volunteer opportunities", + "Local sports and recreation partnerships", + "Charity and fundraising events", + ], + "Feedback and Improvement": [ + "Member feedback and surveys", + "Suggestions for new programs or services", 
+ "Complaint resolution process", + "Testimonials and success stories", + ], + "Outdoor and Adventure Sports": [ + "Outdoor recreation areas", + "Adventure sports programs", + "Guided outdoor excursions", + "Equipment and safety for outdoor activities", + ], + "Aquatic Programs": [ + "Swimming pool schedule", + "Aquatic fitness classes", + "Swim lessons for all ages", + "Lifeguard training and certification", + ], +} diff --git a/geniusrise_text/nli/intents/tourist_guide.py b/geniusrise_text/nli/intents/tourist_guide.py new file mode 100644 index 0000000..22f8a93 --- /dev/null +++ b/geniusrise_text/nli/intents/tourist_guide.py @@ -0,0 +1,95 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Local Attractions": [ + "Popular tourist sites", + "Hidden gems and offbeat places", + "Cultural and historical landmarks", + "Family-friendly attractions", + ], + "Accommodation Recommendations": [ + "Hotel and lodging options", + "Budget-friendly stays", + "Luxury and boutique accommodations", + "Unique lodging experiences", + ], + "Dining and Cuisine": [ + "Local food and cuisine recommendations", + "Restaurant reservations", + "Street food and local markets", + "Dietary restriction accommodations", + ], + "Transportation and Navigation": [ + "Public transportation options", + "Car rental and taxi services", + "Walking and biking routes", + "Airport transfer information", + ], + "Guided Tours and Excursions": [ + "City tour bookings", + "Adventure and outdoor activities", + "Cultural and thematic tours", + "Private and group tour options", + ], + "Event and Festival Information": [ + "Local events and festivals", + "Concerts and performances", + "Seasonal and annual events", + "Cultural and religious celebrations", + ], + "Shopping and Souvenirs": [ + "Shopping districts and streets", + "Local markets and craft fairs", + "Souvenir and gift recommendations", + "Specialty stores and boutiques", + ], + "Safety and Emergency Information": [ + "Local emergency services", + "Safety tips for tourists", + "Consulate and embassy information", + "Healthcare and medical facilities", + ], + "Language and Communication": [ + "Language translation services", + "Common phrases and language tips", + "Communication and mobile services", + "Cultural etiquette and customs", + ], + "Travel Tips and Advice": [ + "Packing and preparation tips", + "Budget travel advice", + "Best times to visit", + "Traveling with children or pets", + ], + "Outdoor and Nature Activities": [ + "Hiking and trekking information", + "Beach and water activities", + "National parks and nature reserves", + "Wildlife watching and eco-tourism", + ], + "Cultural Workshops and Learning": [ + 
"Local workshops and classes", + "Cultural immersion experiences", + "Cooking and art classes", + "Historical and educational tours", + ], + "Accessibility and Inclusive Tourism": [ + "Accessible travel information", + "Facilities for travelers with disabilities", + "Inclusive tour and activity options", + "Support services for diverse needs", + ], +} diff --git a/geniusrise_text/nli/intents/translation.py b/geniusrise_text/nli/intents/translation.py new file mode 100644 index 0000000..e106ee0 --- /dev/null +++ b/geniusrise_text/nli/intents/translation.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Translation Requests": [ + "Document translation services", + "Website and digital content translation", + "Technical and specialized translation", + "Urgent translation needs", + ], + "Interpretation Services": [ + "Simultaneous interpretation for events", + "Consecutive interpretation services", + "Over-the-phone interpreting", + "Sign language interpretation", + ], + "Localization and Cultural Adaptation": [ + "Localization of software and apps", + "Cultural adaptation of marketing materials", + "Localization consulting and strategy", + "Multilingual content localization", + ], + "Language Pairs and Expertise": [ + "Available language pairs", + "Expertise in specific languages", + "Industry-specific language services", + "Rare and less common languages", + ], + "Certified and Legal Translations": [ + "Certified document translations", + "Legal and official translations", + "Notarization of translated documents", + "Confidentiality in legal translations", + ], + "Pricing and Quotes": [ + "Translation service pricing", + "Requesting a quote for translation", + "Bulk and project-based pricing", + "Discounts for recurring clients", + ], + "Quality Assurance and Review": [ + "Translation quality standards", + "Review and proofreading services", + "Handling revisions and feedback", + "Quality control processes", + ], + "Audio and Video Translations": [ + "Subtitle and caption translation", + "Voice-over and dubbing services", + "Transcription and translation of audio", + "Multimedia localization", + ], + "Professional Qualifications": [ + "Credentials of translators", + "Training and certification", + "Experience in specific industries", + "Professional language associations", + ], + "Technological Capabilities": [ + "Use of translation software and tools", + "Machine translation and AI assistance", + "Integration with client systems", + "Data security and confidentiality", + ], + "Customer Support and Communication": [ + "Project management and 
communication", + "Customer support for translation inquiries", + "Handling specific client requests", + "Ongoing client relationship management", + ], + "Educational and Training Services": [ + "Language training and workshops", + "Translation and interpretation courses", + "Cultural competency training", + "Language skill assessments", + ], +} diff --git a/geniusrise_text/nli/intents/travel.py b/geniusrise_text/nli/intents/travel.py new file mode 100644 index 0000000..5deedaa --- /dev/null +++ b/geniusrise_text/nli/intents/travel.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Flight Reservations": [ + "Book a flight", + "Flight availability", + "Airline ticket prices", + "Group bookings", + ], + "Accommodation Booking": [ + "Hotel reservation", + "Check room availability", + "Book a vacation rental", + "Special accommodation requests", + ], + "Travel Packages": [ + "Tour packages", + "Family vacation deals", + "All-inclusive travel offers", + "Last-minute travel deals", + ], + "Itinerary Changes": [ + "Change travel dates", + "Modify flight details", + "Reschedule hotel booking", + "Cancel a booking", + ], + "Travel Documentation": [ + "Visa assistance", + "Travel insurance options", + "Passport-related queries", + "Health and safety documentation", + ], + "Transportation Services": [ + "Car rental services", + "Airport shuttle services", + "Local transportation options", + "Public transport information", + ], + "Local Attractions": [ + "Tourist attraction information", + "Local event schedule", + "Guided tour bookings", + "Cultural and recreational activities", + ], + "Travel Assistance": [ + "Emergency assistance", + "Language support services", + "Currency exchange information", + "Lost and found queries", + ], + "Special Requests": [ + "Dietary and accessibility needs", + "Pet travel accommodations", + "Special occasion arrangements", + "Custom travel requests", + ], + "Loyalty Programs": [ + "Frequent flyer benefits", + "Hotel loyalty program details", + "Redeeming loyalty points", + "Membership tier queries", + ], + "Feedback and Complaints": [ + "Submit travel feedback", + "Report a service issue", + "Complaint resolution", + "Review a travel experience", + ], + "Trip Planning Advice": [ + "Destination recommendations", + "Travel itinerary suggestions", + "Seasonal travel tips", + "Budget travel advice", + ], +} diff --git a/geniusrise_text/nli/intents/wedding.py b/geniusrise_text/nli/intents/wedding.py new file mode 100644 index 0000000..6aee388 --- /dev/null +++ b/geniusrise_text/nli/intents/wedding.py @@ -0,0 
+1,101 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +intents = { + "Venue Selection and Booking": [ + "Wedding venue options", + "Venue booking procedures", + "Outdoor vs. indoor venue suggestions", + "Venue capacity and layout details", + ], + "Wedding Themes and Styles": [ + "Theme ideas and inspiration", + "Color scheme and style guidance", + "Seasonal wedding themes", + "Cultural and traditional wedding styles", + ], + "Catering and Menu Planning": [ + "Wedding catering services", + "Menu tasting and selection", + "Dietary restrictions and custom menus", + "Catering budget and pricing", + ], + "Photography and Videography": [ + "Wedding photographer and videographer", + "Photography style options", + "Pre-wedding photo shoots", + "Videography packages and rates", + ], + "Floral and Decor": [ + "Floral arrangement choices", + "Wedding decor themes", + "Ceremony and reception decor", + "Seasonal flowers and decorations", + ], + "Entertainment and Music": [ + "Wedding music and entertainment", + "DJ and live band options", + "Ceremony and reception music", + "Dance floor and lighting setup", + ], + "Wedding Attire and Dress": [ + "Bridal gown and groom's attire", + "Bridesmaids' and groomsmen's outfits", + "Wedding dress fittings", + "Tuxedo and dress rental options", + ], + "Invitations and Guest Management": [ + "Wedding invitation design", + "Guest list management", + "RSVP tracking", + "Digital and 
traditional invitations", + ], + "Transportation and Accommodations": [ + "Wedding day transportation", + "Guest accommodation options", + "Shuttle services for guests", + "Parking arrangements", + ], + "Wedding Day Coordination": [ + "Day-of coordination services", + "Wedding timeline planning", + "Vendor coordination", + "Emergency wedding day solutions", + ], + "Budgeting and Financial Planning": [ + "Wedding budget planning", + "Cost-saving tips", + "Payment schedules for vendors", + "Financial tracking tools", + ], + "Pre-Wedding Events": [ + "Engagement party planning", + "Bachelor and bachelorette parties", + "Bridal showers", + "Rehearsal dinner coordination", + ], + "Post-Wedding Services": [ + "Honeymoon planning", + "Thank you notes and gifts", + "Wedding album and video delivery", + "Post-wedding cleanup and returns", + ], + "Ceremony and Vow Assistance": [ + "Ceremony script and structure", + "Writing personalized vows", + "Ceremony officiants", + "Cultural and religious ceremony elements", + ], +} diff --git a/geniusrise_text/nli/intents/wellness.py b/geniusrise_text/nli/intents/wellness.py new file mode 100644 index 0000000..ec42b19 --- /dev/null +++ b/geniusrise_text/nli/intents/wellness.py @@ -0,0 +1,89 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +intents = { + "Beauty Treatments and Services": [ + "Facial and skincare treatments", + "Hair styling and coloring services", + "Manicure and pedicure appointments", + "Makeup services and consultations", + ], + "Spa and Massage Services": [ + "Spa treatment options", + "Massage therapy types", + "Wellness spa packages", + "Booking spa appointments", + ], + "Wellness Programs": [ + "Detox and cleanse programs", + "Weight management plans", + "Holistic wellness therapies", + "Stress relief and relaxation sessions", + ], + "Product Recommendations": [ + "Skincare product advice", + "Haircare products", + "Wellness and health supplements", + "Organic and natural beauty products", + ], + "Appointment Scheduling": [ + "Booking appointments", + "Rescheduling or canceling services", + "Service duration and availability", + "Walk-in appointment policies", + ], + "Special Offers and Memberships": [ + "Membership benefits and tiers", + "Current promotions and discounts", + "Loyalty program details", + "Gift cards and vouchers", + ], + "Safety and Hygiene Protocols": [ + "COVID-19 safety measures", + "Salon and spa hygiene practices", + "Client safety and comfort", + "Allergy and sensitivity precautions", + ], + "Training and Workshops": [ + "Beauty and wellness training courses", + "Professional certification programs", + "Workshops and seminars", + "Employee training and development", + ], + "Event Services": [ + "Bridal and wedding services", + "Event makeup and hairstyling", + "Group bookings for special events", + "Custom beauty packages for events", + ], + "Feedback and Quality Assurance": [ + "Client feedback and reviews", + "Service quality assurance", + "Handling complaints and issues", + "Suggestions for service improvement", + ], + "Health and Safety Information": [ + "Information on contraindications", + "Client health and safety guidelines", + "Dealing with adverse reactions", + "Safety certifications", + ], + "Industry Trends and News": [ + "Latest trends in 
beauty and wellness", + "Industry news and updates", + "New treatment techniques", + "Upcoming events and conferences", + ], +} diff --git a/geniusrise_text/nli/tests/__init__.py b/geniusrise_text/nli/tests/__init__.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ b/geniusrise_text/nli/tests/__init__.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/nli/tests/test_bulk.py b/geniusrise_text/nli/tests/test_bulk.py new file mode 100644 index 0000000..84d2440 --- /dev/null +++ b/geniusrise_text/nli/tests/test_bulk.py @@ -0,0 +1,30 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7 +# google/t5_xxl_true_nli_mixture +# facebook/bart-large-mnli +# microsoft/deberta-v2-xlarge-mnli +# khalidalt/DeBERTa-v3-large-mnli +# typeform/distilbert-base-uncased-mnli +# roberta-large-mnli +# microsoft/deberta-v2-xxlarge-mnli +# sileod/deberta-v3-large-tasksource-nli +# cross-encoder/nli-deberta-v3-small +# cross-encoder/nli-deberta-v3-base +# cross-encoder/nli-deberta-v3-large +# cross-encoder/nli-roberta-base +# cross-encoder/nli-deberta-v3-large +# cross-encoder/nli-distilroberta-base diff --git a/geniusrise_text/nli/tests/test_fine_tune.py b/geniusrise_text/nli/tests/test_fine_tune.py new file mode 100644 index 0000000..68bb00e --- /dev/null +++ b/geniusrise_text/nli/tests/test_fine_tune.py @@ -0,0 +1,273 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Tests for ``NLIFineTuner``: dataset loading in every supported format, fine-tuning and metrics."""

import json
import os
import shutil
import sqlite3
import tempfile
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
import torch
import yaml  # type: ignore
from datasets import Dataset
from geniusrise.core import BatchInput, BatchOutput, InMemoryState
from pyarrow import feather
from pyarrow import parquet as pq
from transformers import EvalPrediction

from geniusrise_text.nli.fine_tune import NLIFineTuner

# NOTE(review): no test in this module passes this dict (the fine-tune parametrization
# uses ``lora_config=None``); ``task_type`` is "CAUSAL_LM" although the models under test
# are sequence classifiers - confirm before enabling LoRA cases.
lora_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}

# Loader settings shared by every test that instantiates a model.
MODEL_CLASS = "AutoModelForSequenceClassification"
TOKENIZER_CLASS = "AutoTokenizer"
DEVICE_MAP = "cuda:0"

# Files a fine-tuning run may leave in ``<output_folder>/model`` (full or adapter weights).
MODEL_ARTIFACTS = (
    "pytorch_model.bin",
    "adapter_model.bin",
    "config.json",
    "adapter_config.json",
    "training_args.bin",
)


# Helper function to create synthetic data in different formats
def create_dataset_in_format(directory, ext):
    """
    Write a 10-row synthetic NLI dataset into ``directory`` in the format named by ``ext``.

    Every row has the columns ``premise``, ``hypothesis`` and ``label`` (alternating 0/1).

    Args:
        directory: Target directory; created if it does not exist.
        ext: One of "huggingface", "csv", "jsonl", "parquet", "json", "xml", "yaml",
            "tsv", "xlsx", "db" or "feather".

    Raises:
        ValueError: If ``ext`` is not a known format (previously this silently produced
            an empty directory).
    """
    os.makedirs(directory, exist_ok=True)
    data = [{"premise": f"premise_{i}", "hypothesis": f"hypothesis_{i}", "label": i % 2} for i in range(10)]
    df = pd.DataFrame(data)

    if ext == "huggingface":
        dataset = Dataset.from_pandas(df)
        dataset.save_to_disk(directory)
    elif ext == "csv":
        df.to_csv(os.path.join(directory, "data.csv"), index=False)
    elif ext == "jsonl":
        with open(os.path.join(directory, "data.jsonl"), "w", encoding="utf-8") as f:
            for item in data:
                f.write(json.dumps(item) + "\n")
    elif ext == "parquet":
        # ``pa.Table`` is the public name; ``feather.Table`` only worked because the
        # feather module happens to import it.
        pq.write_table(pa.Table.from_pandas(df), os.path.join(directory, "data.parquet"))
    elif ext == "json":
        with open(os.path.join(directory, "data.json"), "w", encoding="utf-8") as f:
            json.dump(data, f)
    elif ext == "xml":
        root = ET.Element("root")
        for item in data:
            record = ET.SubElement(root, "record")
            ET.SubElement(record, "premise").text = item["premise"]
            ET.SubElement(record, "hypothesis").text = item["hypothesis"]
            ET.SubElement(record, "label").text = str(item["label"])
        tree = ET.ElementTree(root)
        tree.write(os.path.join(directory, "data.xml"))
    elif ext == "yaml":
        with open(os.path.join(directory, "data.yaml"), "w", encoding="utf-8") as f:
            yaml.dump(data, f)
    elif ext == "tsv":
        df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t")
    elif ext == "xlsx":
        df.to_excel(os.path.join(directory, "data.xlsx"), index=False)
    elif ext == "db":
        conn = sqlite3.connect(os.path.join(directory, "data.db"))
        try:
            df.to_sql("dataset_table", conn, if_exists="replace", index=False)
        finally:
            # Close the connection even if the insert fails so the file is not left locked.
            conn.close()
    elif ext == "feather":
        feather.write_feather(df, os.path.join(directory, "data.feather"))
    else:
        raise ValueError(f"Unsupported dataset format: {ext!r}")


# Fixtures for each file type
@pytest.fixture(
    params=[
        "huggingface",
        "csv",
        "jsonl",
        "parquet",
        "json",
        "xml",
        "yaml",
        "tsv",
        "xlsx",
        "db",
        "feather",
    ]
)
def dataset_file(request, tmpdir):
    """Create ``train`` and ``eval`` datasets under ``tmpdir`` and return ``(tmpdir, format)``."""
    ext = request.param
    # Build real sub-directory paths instead of relying on ``tmpdir + "/train"``,
    # which only worked through py.path's basename concatenation.
    create_dataset_in_format(str(tmpdir.join("train")), ext)
    create_dataset_in_format(str(tmpdir.join("eval")), ext)
    return tmpdir, ext


MODELS_TO_TEST = {
    # fmt: off
    "small": "facebook/bart-large-mnli",
    # fmt: on
}


# Fixture for models
@pytest.fixture(params=MODELS_TO_TEST.items())
def model(request):
    """Yield each ``(label, model id)`` pair from ``MODELS_TO_TEST``."""
    return request.param


@pytest.fixture
def commonsense_bolt():
    """Provide an ``NLIFineTuner`` on throw-away input/output folders, removed at teardown."""
    input_dir = tempfile.mkdtemp()
    output_dir = tempfile.mkdtemp()
    try:
        yield NLIFineTuner(
            input=BatchInput(input_dir, "geniusrise-test", "test-🤗-input"),
            output=BatchOutput(output_dir, "geniusrise-test", "test-🤗-output"),
            state=InMemoryState(),
        )
    finally:
        # The temp folders used to be leaked on every test run.
        shutil.rmtree(input_dir, ignore_errors=True)
        shutil.rmtree(output_dir, ignore_errors=True)


def _load_model(bolt, model_name):
    """Load ``model_name`` and its tokenizer (same id) into ``bolt`` with the shared settings."""
    bolt.load_models(
        model_name=model_name,
        tokenizer_name=model_name,
        model_class=MODEL_CLASS,
        tokenizer_class=TOKENIZER_CLASS,
        device_map=DEVICE_MAP,
    )


def _release_model(bolt):
    """Drop the model/tokenizer references held by ``bolt`` (if any) and free cached GPU memory."""
    for attr in ("model", "tokenizer"):
        try:
            delattr(bolt, attr)
        except AttributeError:
            # Nothing was loaded (e.g. fine_tune failed early); not an error during cleanup.
            pass
    torch.cuda.empty_cache()


def test_commonsense_bolt_init(commonsense_bolt, model):
    _, model_name = model
    _load_model(commonsense_bolt, model_name)

    assert commonsense_bolt.model is not None
    assert commonsense_bolt.tokenizer is not None
    assert commonsense_bolt.input is not None
    assert commonsense_bolt.output is not None
    assert commonsense_bolt.state is not None


def test_load_dataset_all_formats(commonsense_bolt, dataset_file, model):
    tmpdir, _ = dataset_file
    dataset_path = os.path.join(tmpdir, "train")

    _, model_name = model
    _load_model(commonsense_bolt, model_name)

    dataset = commonsense_bolt.load_dataset(dataset_path)
    assert dataset is not None
    assert len(dataset) == 10


# Models to test
models = {
    # fmt: off
    "bart": "facebook/bart-large-mnli",
    "deberta": "microsoft/deberta-v2-xlarge-mnli",
    "large": "khalidalt/DeBERTa-v3-large-mnli",
    "distill": "typeform/distilbert-base-uncased-mnli",
    "roberta": "roberta-large-mnli",
    "biggest": "microsoft/deberta-v2-xxlarge-mnli",
    "tasksource": "sileod/deberta-v3-large-tasksource-nli",
    "deberta-v3-large": "cross-encoder/nli-deberta-v3-large",
    # fmt: on
}


# Test for fine-tuning
@pytest.mark.parametrize(
    "model_name, precision, quantization, lora_config, use_accelerate",
    [
        # small
        (models["bart"], "bfloat16", None, None, False),
        (models["deberta"], "bfloat16", None, None, False),
        (models["large"], "bfloat16", None, None, False),
        (models["distill"], "bfloat16", None, None, False),
        (models["roberta"], "bfloat16", None, None, False),
        (models["biggest"], "bfloat16", None, None, False),
        (models["tasksource"], "bfloat16", None, None, False),
        (models["deberta-v3-large"], "bfloat16", None, None, False),
    ],
)
def test_commonsense_bolt_fine_tune(
    commonsense_bolt, dataset_file, model_name, precision, quantization, lora_config, use_accelerate
):
    tmpdir, _ = dataset_file
    commonsense_bolt.input.input_folder = tmpdir
    model_dir = os.path.join(commonsense_bolt.output.output_folder, "model")

    def saved(filename):
        return os.path.exists(os.path.join(model_dir, filename))

    try:
        commonsense_bolt.fine_tune(
            model_name=model_name,
            tokenizer_name=model_name,
            model_class=MODEL_CLASS,
            tokenizer_class=TOKENIZER_CLASS,
            num_train_epochs=1,
            per_device_batch_size=2,
            precision=precision,
            quantization=quantization,
            lora_config=lora_config,
            use_accelerate=use_accelerate,
            device_map=DEVICE_MAP,
            data_masked=False,
        )

        # Either full weights or LoRA adapter weights (and their config) must have been saved.
        assert saved("pytorch_model.bin") or saved("adapter_model.bin")
        assert saved("config.json") or saved("adapter_config.json")
        assert saved("training_args.bin")
    finally:
        # Runs on success and failure alike, and never masks the original exception:
        # the old ``del commonsense_bolt.model`` raised AttributeError when nothing was loaded.
        _release_model(commonsense_bolt)
        # Remove each artifact independently; previously the first missing file
        # aborted the removal of all the others.
        for filename in MODEL_ARTIFACTS:
            artifact = os.path.join(model_dir, filename)
            if os.path.exists(artifact):
                os.remove(artifact)


# Test for computing metrics
def test_commonsense_bolt_compute_metrics(commonsense_bolt):
    logits = np.array([[0.6, 0.4], [0.4, 0.6]])
    labels = np.array([0, 1])
    eval_pred = EvalPrediction(predictions=logits, label_ids=labels)
    metrics = commonsense_bolt.compute_metrics(eval_pred)
    assert "accuracy" in metrics
    assert "precision" in metrics
    assert "recall" in metrics
    assert "f1" in metrics
file mode 100644 index 0000000..6d0778e --- /dev/null +++ b/geniusrise_text/notebook/__init__.py @@ -0,0 +1,16 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .notebook import TextJupyterNotebook diff --git a/geniusrise_text/notebook/notebook.py b/geniusrise_text/notebook/notebook.py new file mode 100644 index 0000000..bc28726 --- /dev/null +++ b/geniusrise_text/notebook/notebook.py @@ -0,0 +1,239 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os
import subprocess
import sys
from typing import Any, Dict, List, Optional

import nbformat
from geniusrise import BatchInput, BatchOutput, Bolt, State
from geniusrise.logging import setup_logger
from jinja2 import Environment, FileSystemLoader, Template
from nbformat import v4 as nbf

from geniusrise_text.base.communication import send_email


class TextJupyterNotebook(Bolt):
    """
    Bolt that renders a model-specific tutorial notebook from a Jinja template and
    serves it through JupyterLab.

    ``create`` is the entry point: it renders ``<output_folder>/notebook.ipynb``,
    installs the Jupyter packages, runs the server and finally calls ``done``.
    """

    # Model classes for which ``templates/<model_class>.jinja`` exists.
    _TEMPLATED_MODEL_CLASSES = (
        "AutoModelForCausalLM",
        "AutoModelForTokenClassification",
        "AutoModelForSequenceClassification",
        "AutoModelForTableQuestionAnswering",
        "AutoModelForQuestionAnswering",
        "AutoModelForSeq2SeqLM",
    )

    def __init__(
        self,
        input: BatchInput,
        output: BatchOutput,
        state: State,
        **kwargs,
    ):
        """
        Args:
            input (BatchInput): Input data configuration.
            output (BatchOutput): Output configuration; the notebook is written to its ``output_folder``.
            state (State): State manager.
            **kwargs: Accepted for interface compatibility; not forwarded to the base class.
        """
        super().__init__(input=input, output=output, state=state)
        self.log = setup_logger(self)
        script_dir = os.path.dirname(os.path.realpath(__file__))
        templates_dir = os.path.join(script_dir, "templates")

        # Initialize Jinja2 Environment with the correct templates directory
        self.env = Environment(loader=FileSystemLoader(templates_dir))

    def create(
        self,
        model_name: str,
        model_class: str = "AutoModelForCausalLM",
        tokenizer_class: str = "AutoTokenizer",
        use_cuda: bool = False,
        precision: str = "float16",
        quantization: int = 0,
        device_map: str | Dict | None = "auto",
        torchscript: bool = False,
        compile: bool = False,
        awq_enabled: bool = False,
        flash_attention: bool = False,
        port: int = 8888,
        password: Optional[str] = None,
        notification_email: Optional[str] = None,
        **model_args: Any,
    ):
        """
        Render the notebook for ``model_class`` and serve it with JupyterLab.

        Args:
            model_name (str): Model id, optionally suffixed with ``:<revision>``. The tokenizer
                uses the same id and revision.
            model_class (str): Selects the template; must be one of ``_TEMPLATED_MODEL_CLASSES``.
            tokenizer_class (str): Tokenizer class name rendered into the notebook.
            use_cuda (bool): Rendered into the notebook's device selection.
            precision (str): Rendered into the notebook's precision configuration.
            quantization (int): Passed to the template context.
            device_map (str | Dict | None): Passed to the template context.
            torchscript (bool): Passed to the template context.
            compile (bool): Passed to the template context.
            awq_enabled (bool): Passed to the template context.
            flash_attention (bool): Passed to the template context.
            port (int): Port for the JupyterLab server.
            password (Optional[str]): Access token for the server (see ``start_jupyter_server``).
            notification_email (Optional[str]): If set, ``done`` sends a notification to this address.
            **model_args (Any): Extra values exposed to the template as ``model_args``.

        Raises:
            KeyError: If there is no template for ``model_class``.
            subprocess.CalledProcessError: If a pip install or the Jupyter server exits non-zero.
        """
        self.model_class = model_class
        self.tokenizer_class = tokenizer_class
        self.use_cuda = use_cuda
        self.precision = precision
        self.quantization = quantization
        self.device_map = device_map
        self.torchscript = torchscript
        self.compile = compile
        self.awq_enabled = awq_enabled
        self.flash_attention = flash_attention
        self.port = port
        self.password = password
        self.notification_email = notification_email
        self.model_args = model_args

        # "name:revision" pins one revision that is shared by the model and the tokenizer.
        if ":" in model_name:
            name_parts = model_name.split(":")
            model_name, model_revision = name_parts[0], name_parts[1]
        else:
            model_revision = None
        tokenizer_name = model_name
        tokenizer_revision = model_revision

        self.model_name = model_name
        self.model_revision = model_revision
        self.tokenizer_name = tokenizer_name
        self.tokenizer_revision = tokenizer_revision

        # NOTE: ``self.env`` is deliberately left as built in ``__init__``; it used to be
        # replaced here by a loader rooted at the current working directory ("./templates").

        # Context for Jinja template
        context = {
            "model_name": model_name,
            "tokenizer_name": tokenizer_name,
            "model_revision": model_revision,
            "tokenizer_revision": tokenizer_revision,
            "model_class": model_class,
            "tokenizer_class": tokenizer_class,
            "use_cuda": use_cuda,
            "precision": precision,
            "quantization": quantization,
            "device_map": device_map,
            "torchscript": torchscript,
            "compile": compile,
            "awq_enabled": awq_enabled,
            "flash_attention": flash_attention,
            "model_args": model_args,
        }

        output_path = self.output.output_folder

        if model_class not in self._TEMPLATED_MODEL_CLASSES:
            raise KeyError(
                f"No notebook template for model_class {model_class!r}; "
                f"supported: {', '.join(self._TEMPLATED_MODEL_CLASSES)}"
            )
        script_dir = os.path.dirname(os.path.abspath(__file__))
        template_name = os.path.join(script_dir, "templates", f"{model_class}.jinja")

        self.create_notebook(name=template_name, context=context, output_path=f"{output_path}/notebook.ipynb")

        self.install_packages(
            [
                "jupyterthemes",
                "jupyter==1.0.0",
                "jupyterlab_legos_ui",
                "jupyterlab_darkside_ui",
                "theme-darcula",
                "jupyterlab",
            ]
        )

        # Blocks until the Jupyter process exits; only then is ``done`` reached.
        self.start_jupyter_server(notebook_dir=output_path, port=port, password=password)
        self.done()

    def create_notebook(self, name: str, context: dict, output_path: str):
        """
        Create a Jupyter Notebook from a Jinja template.

        Args:
            name (str): Filesystem path of the Jinja template to render.
            context (dict): Context variables to render the template.
            output_path (str): Path to save the generated notebook.
        """
        # The template is read by path (not through ``self.env``) because ``name`` is a full path.
        with open(name, "r", encoding="utf-8") as file:
            template_content = file.read()

        template = Template(template_content)

        notebook_json = template.render(context)
        notebook = nbf.reads(notebook_json)

        # Notebooks are JSON documents; write them as UTF-8 regardless of the platform default.
        with open(output_path, "w", encoding="utf-8") as f:
            nbformat.write(notebook, f)
        self.log.info(f"Notebook created at {output_path}")

    def start_jupyter_server(self, notebook_dir: str, port: int = 8888, password: Optional[str] = None):
        """
        Start a JupyterLab server in the specified directory and wait for it to exit.

        Args:
            notebook_dir (str): Directory where the notebook server should start.
            port (int): Port number for the notebook server. Default is 8888.
            password (Optional[str]): Value used as the server's access token. If None, no token
                option is passed and Jupyter applies its own default (previously the literal
                string "None" was sent as the token).

        Raises:
            subprocess.CalledProcessError: If the Jupyter process exits with a non-zero status.
        """
        command = [
            "jupyter",
            "lab",
            "--ip=0.0.0.0",
            "--no-browser",
            "--port",
            str(port),
            "--ServerApp.root_dir",
            notebook_dir,
        ]
        token_flag = "--ServerApp.token="
        if password is not None:
            command.append(f"{token_flag}{password}")

        # Never write the access token to the log.
        loggable = [f"{token_flag}***" if arg.startswith(token_flag) else arg for arg in command]
        self.log.info(f"Running command {' '.join(loggable)}")

        subprocess.run(command, check=True)  # type: ignore

    def install_packages(self, packages: List[str]):
        """
        Install Python packages using pip.

        Packages are installed one at a time with the interpreter running this process.

        Args:
            packages (List[str]): List of package names to install.

        Raises:
            subprocess.CalledProcessError: If pip fails for any package.
        """
        for package in packages:
            subprocess.run([sys.executable, "-m", "pip", "install", package], check=True)
        self.log.info("Required packages installed.")

    def install_jupyter_extensions(self, extensions: List[str]):
        """
        Install Jupyter Notebook extensions.

        Args:
            extensions (List[str]): List of Jupyter extension names to install.

        Raises:
            subprocess.CalledProcessError: If installing or enabling an extension fails.
        """
        for extension in extensions:
            subprocess.run(["jupyter", "labextension", "install", extension], check=True)
            subprocess.run(["jupyter", "labextension", "enable", extension], check=True)
        self.log.info("Jupyter extensions installed and enabled.")

    def done(self):
        """Flush the output and send the completion email, if a notification address was given."""
        # ``notification_email`` is only assigned in ``create``; tolerate an earlier call.
        notification_email = getattr(self, "notification_email", None)
        if notification_email:
            self.output.flush()
            send_email(recipient=notification_email, bucket_name=self.output.bucket, prefix=self.output.s3_folder)
We'll cover several key aspects including model loading, setting up the tokenizer, configuring the model for your hardware, and performing text generation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setting Up the Environment\n", + "\n", + "Before we begin, let's set up our environment by importing the necessary libraries and checking for GPU availability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing the Transformers library and torch\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n", + "\n", + "# Checking for GPU availability\n", + "device = 'cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu'\n", + "print(f'Using device: {device}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the Model and Tokenizer\n", + "\n", + "In this section, we will load the `{{ model_class }}` and its corresponding tokenizer. We will use the `{{ model_name }}` model along with its respective tokenizer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the model\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}')\n", + "\n", + "# Configuring the device\n", + "model = model.to(device)\n", + "\n", + "# Precision configuration (optional)\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Explain the importance of each configuration..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generating Text with the Model\n", + "\n", + "Now that our model and tokenizer are set up, we can use them to generate text. This section will demonstrate how to provide a prompt to the model and generate a response." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Text generation\n", + "prompt = 'Today is a beautiful day'\n", + "inputs = tokenizer(prompt, return_tensors='pt')\n", + "inputs = inputs.to(device)\n", + "\n", + "# Generating a response\n", + "with torch.no_grad():\n", + " outputs = model.generate(**inputs)\n", + " generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + "\n", + "print('Generated Text:\\n', generated_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional Tips and Tricks\n", + "\n", + "This section provides additional tips and tricks for optimizing model performance, handling different types of inputs, and troubleshooting common issues." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "In this tutorial, we have covered the basics of using `{{ model_class }}` from Hugging Face's Transformers library. Remember to explore the library's extensive documentation for more advanced use cases and techniques." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-Tuning the Model\n", + "\n", + "In this section, we'll fine-tune `{{ model_class }}` on a specific task. Fine-tuning involves training the model on a smaller dataset for a particular task. We'll use a sample dataset for this demonstration." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preparing the Dataset\n", + "\n", + "We'll start by preparing our dataset for fine-tuning. We'll load a sample dataset, preprocess it, and prepare it in a format suitable for training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sample dataset preparation\n", + "from datasets import load_dataset\n", + "dataset = load_dataset('sample_dataset_name')\n", + "\n", + "# Preprocessing the data\n", + "def preprocess_data(example):\n", + " # Add your preprocessing steps here\n", + " return example\n", + "\n", + "dataset = dataset.map(preprocess_data)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting Up Training Parameters\n", + "\n", + "Now, let's set up our training parameters, including the optimizer, learning rate, and training epochs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AdamW\n", + "\n", + "# Optimizer\n", + "optimizer = AdamW(model.parameters(), lr=5e-5)\n", + "\n", + "# Training parameters\n", + "num_epochs = 3\n", + "batch_size = 16\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training Loop\n", + "\n", + "We'll now define and run the training loop. This loop will update the model's weights based on our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "\n", + "model.train()\n", + "for epoch in range(num_epochs):\n", + " loop = tqdm(dataset['train'], leave=True)\n", + " for batch in loop:\n", + " # Add your training steps here\n", + " pass\n", + "\n", + " # Optionally add validation and saving checkpoints" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating the Fine-Tuned Model\n", + "\n", + "After fine-tuning, let's evaluate our model on a validation set to see the improvements." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "# Add code to evaluate the model on the validation set" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "This tutorial covered how to fine-tune `{{ model_class }}` for a specific task. Remember to explore various datasets, hyperparameters, and techniques to get the best results for your use case." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/geniusrise_text/notebook/templates/AutoModelForQuestionAnswering.jinja b/geniusrise_text/notebook/templates/AutoModelForQuestionAnswering.jinja new file mode 100644 index 0000000..4066d47 --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForQuestionAnswering.jinja @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Question Answering Tutorial\n", + "\n", + "Welcome to this detailed tutorial on using the `{{ model_class }}` for question answering tasks, utilizing the Hugging Face Transformers library. We'll guide you through the process of setting up the model, preparing your data, and using the model to extract answers from text." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Setup\n", + "\n", + "First, let's import the necessary libraries and set up our environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing the Transformers library and torch\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n", + "\n", + "# Checking for GPU availability\n", + "device = 'cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu'\n", + "print(f'Using device: {device}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model and Tokenizer Loading\n", + "\n", + "Now, let's load the `{{ model_class }}` and its corresponding tokenizer. We are using the `{{ model_name }}` model for our question answering task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}')\n", + "\n", + "# Setting up the device\n", + "model = model.to(device)\n", + "\n", + "# Configuring precision if necessary\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Explanation of each configuration and its impact..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Performing Question Answering\n", + "\n", + "In this section, we'll demonstrate how to perform question answering with our model. We'll provide a context paragraph and a question, and the model will identify the answer within the context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define context and question\n", + "context = '...' # Provide the context text here\n", + "question = '...' 
# Provide the question here\n", + "\n", + "# Tokenizing and encoding the context and question\n", + "inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors='pt')\n", + "inputs = inputs.to(device)\n", + "\n", + "# Performing question answering\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits\n", + " answer_start = torch.argmax(answer_start_scores)\n", + " answer_end = torch.argmax(answer_end_scores) + 1\n", + " answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))\n", + "\n", + "# Displaying the answer\n", + "print('Extracted Answer:\\n', answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tips and Best Practices\n", + "\n", + "This section includes additional tips for optimizing the model's performance, handling various types of data, and troubleshooting common issues." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "You have now learned how to use `{{ model_class }}` for question answering tasks. Explore the Transformers library for more advanced functionalities and applications." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-Tuning the Model for Question Answering\n", + "\n", + "In this section, we'll fine-tune `{{ model_class }}` on a question answering task. Fine-tuning involves adapting the pre-trained model to a specific task with additional training. We'll use a question answering dataset for this purpose." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preparing the Question Answering Dataset\n", + "\n", + "First, we need to load and prepare a question answering dataset. We'll use the SQuAD dataset for this example." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the SQuAD dataset\n", + "from datasets import load_dataset\n", + "dataset = load_dataset('squad')\n", + "\n", + "# Preprocessing function\n", + "def preprocess_data(examples):\n", + " # Tokenize the questions and contexts\n", + " # Note: Truncate/pad based on the model's max input length\n", + " # and the specific requirements of the task\n", + " return tokenizer(examples['question'], examples['context'], truncation=True, padding='max_length')\n", + "\n", + "# Preprocess the dataset\n", + "dataset = dataset.map(preprocess_data, batched=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configuring the Training Parameters\n", + "\n", + "Now, let's set up the training parameters and the optimizer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AdamW\n", + "\n", + "# Optimizer\n", + "optimizer = AdamW(model.parameters(), lr=5e-5)\n", + "\n", + "# Training parameters\n", + "num_epochs = 3\n", + "batch_size = 8 # Adjust based on the available GPU memory" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The Training Loop\n", + "\n", + "Let's define the training loop. During training, the model will learn to predict the start and end positions of the answer in the context." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm\n", + "\n", + "model.train()\n", + "for epoch in range(num_epochs):\n", + " progress_bar = tqdm(dataset['train'], desc=f'Epoch {epoch + 1}', leave=False)\n", + " for batch in progress_bar:\n", + " # Forward pass\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " # Backward pass\n", + " loss.backward()\n", + " optimizer.step()\n", + " optimizer.zero_grad()\n", + " progress_bar.set_postfix({'loss': loss.item()})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating the Fine-Tuned Model\n", + "\n", + "After fine-tuning, it's important to evaluate the model's performance on the validation dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "# Add evaluation code to assess model performance on validation data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "You have now learned how to fine-tune `{{ model_class }}` for a question answering task. Experiment with different datasets, hyperparameters, and training techniques to optimize performance for your specific use case." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/geniusrise_text/notebook/templates/AutoModelForSeq2SeqLM.jinja b/geniusrise_text/notebook/templates/AutoModelForSeq2SeqLM.jinja new file mode 100644 index 0000000..8c948f5 --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForSeq2SeqLM.jinja @@ -0,0 +1,256 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Sequence-to-Sequence Tutorial\n", + "\n", + "Welcome to this comprehensive tutorial on using `{{ model_class }}` for sequence-to-sequence tasks, leveraging the Hugging Face Transformers library. This notebook will guide you through setting up the model, preparing your data, and using the model for tasks such as translation, summarization, and more." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Setup\n", + "\n", + "Let's start by setting up our environment and importing the necessary libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing the Transformers library and torch\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n", + "\n", + "# Checking for GPU availability\n", + "device = 'cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu'\n", + "print(f'Using device: {device}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model and Tokenizer Loading\n", + "\n", + "Next, we'll load the `{{ model_class }}` and its corresponding tokenizer, using the `{{ model_name }}` model for our sequence-to-sequence tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}')\n", + "\n", + "# Setting up the model on the right device\n", + "model = model.to(device)\n", + "\n", + "# Configuring model precision if necessary\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Explaining each configuration and its impact..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sequence-to-Sequence Task Demonstration\n", + "\n", + "In this section, we will demonstrate how to use the `{{ model_class }}` for a sequence-to-sequence task. We will provide a source sequence, and the model will generate a corresponding target sequence." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Defining a source sequence for demonstration\n", + "source_sequence = '...' 
# Add your source sequence here\n", + "inputs = tokenizer(source_sequence, return_tensors='pt')\n", + "inputs = inputs.to(device)\n", + "\n", + "# Generating the target sequence\n", + "with torch.no_grad():\n", + " outputs = model.generate(**inputs)\n", + " target_sequence = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + "\n", + "# Displaying the target sequence\n", + "print('Generated Target Sequence:\\n', target_sequence)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional Tips and Best Practices\n", + "\n", + "This section includes tips on optimizing model performance, handling different types of sequences, and troubleshooting common issues in sequence-to-sequence tasks." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "You have now learned the basics of using `{{ model_class }}` for sequence-to-sequence tasks. Don't forget to explore the Transformers library further for advanced features and applications." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-Tuning the Model for Sequence-to-Sequence Tasks\n", + "\n", + "In this section, we'll fine-tune `{{ model_class }}` for a specific sequence-to-sequence task. Fine-tuning involves adapting the pre-trained model to a specific task with additional training. We'll choose a suitable dataset for a task like translation or summarization." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preparing the Dataset\n", + "\n", + "We'll start by preparing our dataset for the sequence-to-sequence task. For this example, let's assume we're working on a translation task." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load and prepare a translation dataset\n", + "from datasets import load_dataset\n", + "dataset = load_dataset('translation_dataset_name')\n", + "\n", + "# Preprocessing function for the dataset\n", + "def preprocess_function(examples):\n", + " # Tokenize source and target texts\n", + " # Truncate or pad based on model's max length\n", + " return tokenizer(examples['source_language'], examples['target_language'], truncation=True, padding='max_length')\n", + "\n", + "# Apply the preprocessing\n", + "dataset = dataset.map(preprocess_function, batched=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting Up Training Parameters\n", + "\n", + "Next, we'll define the optimizer and training parameters for fine-tuning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AdamW\n", + "\n", + "# Optimizer\n", + "optimizer = AdamW(model.parameters(), lr=5e-5)\n", + "\n", + "# Training parameters\n", + "num_epochs = 3\n", + "batch_size = 8 # Adjust based on GPU memory" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training Loop\n", + "\n", + "Now we'll implement the training loop, where the model will learn to translate the source text to the target text." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm\n", + "\n", + "model.train()\n", + "for epoch in range(num_epochs):\n", + " progress_bar = tqdm(dataset['train'], desc=f'Epoch {epoch + 1}', leave=False)\n", + " for batch in progress_bar:\n", + " # Forward pass and loss calculation\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " # Backward pass\n", + " loss.backward()\n", + " optimizer.step()\n", + " optimizer.zero_grad()\n", + " progress_bar.set_postfix({'loss': loss.item()})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating the Fine-Tuned Model\n", + "\n", + "After fine-tuning, we should evaluate the model's performance on a validation or test set to ensure it has learned the task effectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "# Add evaluation code for the sequence-to-sequence task" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "You have now learned how to fine-tune `{{ model_class }}` for a sequence-to-sequence task. Experiment with different datasets and hyperparameters to optimize the model for your specific use case." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/geniusrise_text/notebook/templates/AutoModelForSequenceClassification.jinja b/geniusrise_text/notebook/templates/AutoModelForSequenceClassification.jinja new file mode 100644 index 0000000..05b4ad9 --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForSequenceClassification.jinja @@ -0,0 +1,255 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Sequence Classification Tutorial\n", + "\n", + "Welcome to this in-depth tutorial on using the `{{ model_class }}` for sequence classification tasks, employing the Hugging Face Transformers library. In this notebook, you'll learn how to prepare your model and data for tasks like sentiment analysis, topic classification, and more." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Setup\n", + "\n", + "Firstly, let's set up our working environment by importing necessary libraries and configuring the device settings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing Transformers and torch libraries\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n", + "\n", + "# Checking GPU availability\n", + "device = 'cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu'\n", + "print(f'Using device: {device}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model and Tokenizer Loading\n", + "\n", + "Let's proceed by loading the `{{ model_class }}` and its associated tokenizer, specifically the `{{ model_name }}` model for sequence classification." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}')\n", + "\n", + "# Configuring the model for the current device\n", + "model = model.to(device)\n", + "\n", + "# Model precision configuration\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Explanation of the configurations and their impact..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sequence Classification Demonstration\n", + "\n", + "Now, we will demonstrate how to apply the `{{ model_class }}` for sequence classification. We'll take an example sentence and classify its sentiment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example sentence for classification\n", + "sentence = 'I love using transformers for natural language processing.'\n", + "inputs = tokenizer(sentence, return_tensors='pt')\n", + "inputs = inputs.to(device)\n", + "\n", + "# Performing sequence classification\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " predictions = torch.argmax(outputs.logits, dim=-1)\n", + "\n", + "# Processing and displaying the results\n", + "print(f'Sentence sentiment: {model.config.id2label[predictions.item()]}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tips for Effective Sequence Classification\n", + "\n", + "This section provides insights and best practices for enhancing model performance, handling various data formats, and troubleshooting common challenges in sequence classification." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "You have now explored the basic functionalities of `{{ model_class }}` for sequence classification. We encourage you to delve deeper into the Transformers library for more advanced use cases and features." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-Tuning the Model for Sequence Classification\n", + "\n", + "In this section, we'll fine-tune `{{ model_class }}` for a specific sequence classification task. Fine-tuning involves training the model on a targeted dataset to improve its performance on that specific task. We'll use a dataset suitable for our classification task, such as sentiment analysis." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preparing the Dataset\n", + "\n", + "Let's start by loading and preparing our dataset. For sentiment analysis, we can use a dataset like IMDb movie reviews." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load and prepare the IMDb dataset\n", + "from datasets import load_dataset\n", + "dataset = load_dataset('imdb')\n", + "\n", + "# Preprocessing function for the dataset\n", + "def preprocess_function(examples):\n", + " # Tokenize the text and truncate/pad it for uniformity\n", + " return tokenizer(examples['text'], truncation=True, padding='max_length')\n", + "\n", + "# Apply preprocessing\n", + "dataset = dataset.map(preprocess_function, batched=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configuring Training Parameters\n", + "\n", + "Next, we need to set up the training parameters, including the optimizer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AdamW\n", + "\n", + "# Optimizer\n", + "optimizer = AdamW(model.parameters(), lr=5e-5)\n", + "\n", + "# Training parameters\n", + "num_epochs = 3\n", + "batch_size = 16 # Adjust as per GPU capacity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training Loop\n", + "\n", + "Now, we will create the training loop, where the model will learn to classify the sentiment of the text." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm\n", + "\n", + "model.train()\n", + "for epoch in range(num_epochs):\n", + " progress_bar = tqdm(dataset['train'], desc=f'Epoch {epoch + 1}', leave=False)\n", + " for batch in progress_bar:\n", + " # Forward pass and loss calculation\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " # Backward pass\n", + " loss.backward()\n", + " optimizer.step()\n", + " optimizer.zero_grad()\n", + " progress_bar.set_postfix({'loss': loss.item()})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating the Fine-Tuned Model\n", + "\n", + "It's crucial to evaluate the model on a validation or test set to assess its performance after fine-tuning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "# Add code for evaluating the model on the validation/test set" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "This tutorial covered how to fine-tune `{{ model_class }}` for a sequence classification task. Experiment with different datasets, hyperparameters, and training strategies to optimize the model for your specific requirements." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/geniusrise_text/notebook/templates/AutoModelForTableQuestionAnswering.jinja b/geniusrise_text/notebook/templates/AutoModelForTableQuestionAnswering.jinja new file mode 100644 index 0000000..ef67513 --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForTableQuestionAnswering.jinja @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Table-based Question Answering Tutorial\n", + "\n", + "Welcome to this detailed tutorial on using `{{ model_class }}` for table-based question answering tasks, utilizing the Hugging Face Transformers library. This notebook will guide you through setting up the model, preparing tabular data, and using the model to extract answers from tables." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Setup\n", + "\n", + "First, let's set up our environment by importing the necessary libraries and configuring our computation device." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing Transformers and torch libraries\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n", + "\n", + "# Checking for GPU availability\n", + "device = 'cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu'\n", + "print(f'Using device: {device}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model and Tokenizer Loading\n", + "\n", + "Let's load the `{{ model_class }}` and its corresponding tokenizer, specifically for table-based question answering using the `{{ model_name }}` model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}')\n", + "\n", + "# Configuring the model for the current device\n", + "model = model.to(device)\n", + "\n", + "# Model precision configuration\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Explanation of the configurations and their impact..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table-based Question Answering Demonstration\n", + "\n", + "In this section, we will demonstrate how to use the `{{ model_class }}` for answering questions based on tabular data. We will provide a sample table and a question, and the model will generate the answer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example table and question\n", + "table_data = '...' # Add or load your table data here\n", + "question = 'What is the ...?' 
# Add your question here\n", + "\n", + "# Tokenizing and encoding the table and question\n", + "inputs = tokenizer(table=table_data, queries=question, return_tensors='pt')\n", + "inputs = inputs.to(device)\n", + "\n", + "# Performing table-based question answering\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " answer = tokenizer.decode(outputs[0])\n", + "\n", + "# Displaying the answer\n", + "print('Answer:\\n', answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tips for Effective Table-based Question Answering\n", + "\n", + "This section includes tips for optimizing model performance, handling different types of tabular data, and troubleshooting common challenges in table-based question answering." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "You have now learned the basics of using `{{ model_class }}` for table-based question answering. Continue to explore the Transformers library for more advanced features and functionalities." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-Tuning the Model for Table-based Question Answering\n", + "\n", + "In this section, we'll fine-tune `{{ model_class }}` on a table-based question answering task. Fine-tuning is the process of further training a pre-trained model on a task-specific dataset so that it adapts to that task. We'll use a dataset suitable for table-based question answering." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preparing the Dataset\n", + "\n", + "First, let's load and preprocess a dataset that contains tables and associated questions. For this example, we assume a SQuAD-style dataset in which each example pairs a table with a question."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load and preprocess a dataset\n", + "from datasets import load_dataset\n", + "dataset = load_dataset('squad_table')\n", + "\n", + "# Preprocessing function for the dataset\n", + "def preprocess_function(examples):\n", + " # Tokenize the table data and questions\n", + " return tokenizer(table=examples['table_data'], queries=examples['question'], truncation=True, padding='max_length')\n", + "\n", + "# Apply preprocessing\n", + "dataset = dataset.map(preprocess_function, batched=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configuring Training Parameters\n", + "\n", + "Next, we'll set up our optimizer and training parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AdamW\n", + "\n", + "# Optimizer\n", + "optimizer = AdamW(model.parameters(), lr=5e-5)\n", + "\n", + "# Training parameters\n", + "num_epochs = 3\n", + "batch_size = 8 # Adjust based on the GPU's capacity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training Loop\n", + "\n", + "We will now define the training loop for fine-tuning the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm\n", + "\n", + "model.train()\n", + "for epoch in range(num_epochs):\n", + " progress_bar = tqdm(dataset['train'], desc=f'Epoch {epoch + 1}', leave=False)\n", + " for batch in progress_bar:\n", + " # Forward pass and loss calculation\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " # Backward pass\n", + " loss.backward()\n", + " optimizer.step()\n", + " optimizer.zero_grad()\n", + " progress_bar.set_postfix({'loss': loss.item()})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating the Fine-Tuned Model\n", + "\n", + "It's essential to evaluate the model on a validation set to ensure it has effectively learned to answer questions based on table data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "# Add code to evaluate the model's performance on the validation set" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "This tutorial covered how to fine-tune `{{ model_class }}` for table-based question answering tasks. Experiment with different datasets and hyperparameters to optimize the model for your specific needs." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/geniusrise_text/notebook/templates/AutoModelForTokenClassification.jinja b/geniusrise_text/notebook/templates/AutoModelForTokenClassification.jinja new file mode 100644 index 0000000..6a8206d --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForTokenClassification.jinja @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Token Classification Tutorial\n", + "\n", + "Welcome to this in-depth tutorial on using `{{ model_class }}` for token classification tasks, with the Hugging Face Transformers library. This notebook will guide you through the process of preparing your model and data for tasks such as named entity recognition, part-of-speech tagging, and more." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Setup\n", + "\n", + "Let's begin by setting up our environment, importing necessary libraries, and configuring our computation device." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing Transformers and torch libraries\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n", + "\n", + "# Checking GPU availability\n", + "device = 'cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu'\n", + "print(f'Using device: {device}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model and Tokenizer Loading\n", + "\n", + "Now, let's load the `{{ model_class }}` and its corresponding tokenizer, using the `{{ model_name }}` model for token classification tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}')\n", + "\n", + "# Configuring the model for the current device\n", + "model = model.to(device)\n", + "\n", + "# Model precision configuration\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Explanation of the configurations and their impact..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Token Classification Demonstration\n", + "\n", + "In this section, we'll demonstrate how to use `{{ model_class }}` for token classification. We'll take an example sentence and classify each token." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example sentence for token classification\n", + "sentence = 'Hugging Face is a technology company based in New York.'\n", + "inputs = tokenizer(sentence, return_tensors='pt')\n", + "inputs = inputs.to(device)\n", + "\n", + "# Performing token classification\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " predictions = torch.argmax(outputs.logits, dim=-1)\n", + "\n", + "# Processing and displaying the classification results\n", + "tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])\n", + "for token, prediction in zip(tokens, predictions[0]):\n", + " print(f'{token}: {model.config.id2label[prediction.item()]}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tips for Effective Token Classification\n", + "\n", + "Here are some tips and best practices to enhance model performance, handle different types of data, and troubleshoot common issues in token classification tasks." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "You have now learned the fundamentals of using `{{ model_class }}` for token classification. Explore the Transformers library further to discover more advanced techniques and applications." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-Tuning the Model for Token Classification\n", + "\n", + "In this section, we'll fine-tune `{{ model_class }}` for a specific token classification task. Fine-tuning involves adapting the pre-trained model to a particular dataset to improve its performance on that specific task. We'll choose an appropriate dataset for our token classification task." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preparing the Dataset\n", + "\n", + "We'll start by preparing our dataset for the token classification task. 
For example, we can use a dataset for named entity recognition." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load and prepare a token classification dataset\n", + "from datasets import load_dataset\n", + "dataset = load_dataset('token_classification_dataset_name')\n", + "\n", + "# Preprocessing function for the dataset\n", + "def preprocess_function(examples):\n", + " # Tokenize the sentences and align the labels\n", + " # Note: Adjust the tokenization and label alignment as per your dataset\n", + " return tokenizer(examples['sentences'], truncation=True, padding='max_length')\n", + "\n", + "# Apply preprocessing\n", + "dataset = dataset.map(preprocess_function, batched=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configuring Training Parameters\n", + "\n", + "Next, let's set up the optimizer and other training parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AdamW\n", + "\n", + "# Optimizer\n", + "optimizer = AdamW(model.parameters(), lr=5e-5)\n", + "\n", + "# Training parameters\n", + "num_epochs = 3\n", + "batch_size = 16 # Adjust based on your GPU's capacity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training Loop\n", + "\n", + "Now, let's implement the training loop for fine-tuning the model on our dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm\n", + "\n", + "model.train()\n", + "for epoch in range(num_epochs):\n", + " progress_bar = tqdm(dataset['train'], desc=f'Epoch {epoch + 1}', leave=False)\n", + " for batch in progress_bar:\n", + " # Forward pass and loss calculation\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " # Backward pass\n", + " loss.backward()\n", + " optimizer.step()\n", + " optimizer.zero_grad()\n", + " progress_bar.set_postfix({'loss': loss.item()})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating the Fine-Tuned Model\n", + "\n", + "After fine-tuning, it's important to evaluate the model on a validation set to assess its performance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "# Add evaluation code to assess the model's performance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "This tutorial covered the process of fine-tuning `{{ model_class }}` for a token classification task. Experiment with different datasets, hyperparameters, and training strategies to optimize the model for your specific requirements." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/geniusrise_text/operations/__init__.py b/geniusrise_text/operations/__init__.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ b/geniusrise_text/operations/__init__.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/operations/merge.py b/geniusrise_text/operations/merge.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ b/geniusrise_text/operations/merge.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/operations/quantization.py b/geniusrise_text/operations/quantization.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ b/geniusrise_text/operations/quantization.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/operations/torchscript.py b/geniusrise_text/operations/torchscript.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ b/geniusrise_text/operations/torchscript.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/geniusrise_text/qa/__init__.py b/geniusrise_text/qa/__init__.py new file mode 100644 index 0000000..1c634ee --- /dev/null +++ b/geniusrise_text/qa/__init__.py @@ -0,0 +1,19 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from .api import QAAPI +from .bulk import QABulk +from .fine_tune import QAFineTuner diff --git a/geniusrise_text/qa/api.py b/geniusrise_text/qa/api.py new file mode 100644 index 0000000..94a675d --- /dev/null +++ b/geniusrise_text/qa/api.py @@ -0,0 +1,410 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, Dict, List + +import cherrypy +import pandas as pd +import torch +from geniusrise import BatchInput, BatchOutput, State +from geniusrise.logging import setup_logger +from transformers import AutoModelForQuestionAnswering, AutoModelForTableQuestionAnswering, AutoTokenizer, pipeline + +from geniusrise_text.base import TextAPI + + +class QAAPI(TextAPI): + model: AutoModelForQuestionAnswering | AutoModelForTableQuestionAnswering + tokenizer: AutoTokenizer + + r""" + A class for handling different types of QA models, including traditional QA, TAPAS (Table-based QA), and TAPEX. + It utilizes the Hugging Face transformers library to provide state-of-the-art question answering capabilities + across various formats of data including plain text and tabular data. + + Attributes: + model (AutoModelForQuestionAnswering | AutoModelForTableQuestionAnswering): + The pre-trained QA model (traditional, TAPAS, or TAPEX). + tokenizer (AutoTokenizer): The tokenizer used to preprocess input text. + + Methods: + answer(self, **kwargs: Any) -> Dict[str, Any]: + Answers questions based on the provided context (text or table). 
+ + CLI Usage Example: + ```bash + genius QAAPI rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id distilbert-base-uncased-distilled-squad-lol \ + listen \ + --args \ + model_name="distilbert-base-uncased-distilled-squad" \ + model_class="AutoModelForQuestionAnswering" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="float" \ + quantization=0 \ + device_map="cuda:0" \ + max_memory=None \ + torchscript=False \ + endpoint="*" \ + port=3000 \ + cors_domain="http://localhost:3000" \ + username="user" \ + password="password" + ``` + + ```bash + genius QAAPI rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id google/tapas-base-finetuned-wtq-lol \ + listen \ + --args \ + model_name="google/tapas-base-finetuned-wtq" \ + model_class="AutoModelForTableQuestionAnswering" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="float" \ + quantization=0 \ + device_map="cuda:0" \ + max_memory=None \ + torchscript=False \ + endpoint="*" \ + port=3000 \ + cors_domain="http://localhost:3000" \ + username="user" \ + password="password" + ``` + + ```bash + genius QAAPI rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id microsoft/tapex-large-finetuned-wtq-lol \ + listen \ + --args \ + model_name="microsoft/tapex-large-finetuned-wtq" \ + model_class="AutoModelForSeq2SeqLM" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="float" \ + quantization=0 \ + device_map="cuda:0" \ + max_memory=None \ + torchscript=False \ + endpoint="*" \ + port=3000 \ + cors_domain="http://localhost:3000" \ + username="user" \ + password="password" + ``` + """ + + def __init__( + self, + input: BatchInput, + output: BatchOutput, + state: State, + **kwargs: Any, + ): + """ + Initializes the QAAPI with configurations for input, output, and state management. 
+ + Args: + input (BatchInput): Configuration for the input data. + output (BatchOutput): Configuration for the output data. + state (State): State management for the API. + **kwargs (Any): Additional keyword arguments; accepted for interface compatibility and not forwarded to the base class. + """ + super().__init__(input=input, output=output, state=state) + self.log = setup_logger(self) + self.hf_pipeline = None + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def answer(self, **kwargs: Any) -> Dict[str, Any]: + r""" + Answers questions based on the provided context (text or table). It adapts to the model type (traditional, TAPAS, TAPEX) + and provides answers accordingly. + + Args: + **kwargs (Any): Not read by this method; the 'question' and 'data' (context or table) fields are taken from the JSON request body. + + Returns: + Dict[str, Any]: A dictionary containing the question, context/table, and answer(s). + + Example CURL Request for Text-based QA: + ```bash + curl -X POST localhost:3000/api/v1/answer \ + -H "Content-Type: application/json" \ + -d '{"question": "What is the capital of France?", "data": "France is a country in Europe. Its capital is Paris."}' + ``` + + Further example CURL requests (long text context, row-oriented table, column-oriented table): + ```bash + /usr/bin/curl -X POST localhost:3000/api/v1/answer \ + -H "Content-Type: application/json" \ + -d '{ + "data": "Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. 
What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me.", + "question": "What is the common wisdom about RNNs?" + + }' | jq + ``` + + ```bash + /usr/bin/curl -X POST localhost:3000/api/v1/answer \ + -H "Content-Type: application/json" \ + -d '{ + "data": [ + {"Name": "Alice", "Age": "30"}, + {"Name": "Bob", "Age": "25"} + ], + "question": "what is their total age?" + } + ' | jq + ``` + + ```bash + /usr/bin/curl -X POST localhost:3000/api/v1/answer \ + -H "Content-Type: application/json" \ + -d '{ + "data": {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}, + "question": "how many movies does Leonardo Di Caprio have?" + } + ' | jq + ``` + """ + data = cherrypy.request.json + question = data.get("question") + + model_type = "traditional" + if "tapas" in self.model_name.lower(): + model_type = "tapas" + elif "tapex" in self.model_name.lower(): + model_type = "tapex" + + if model_type in ["tapas", "tapex"]: + table = data.get("data") + return { + "data": table, + "question": question, + "answer": self.answer_table_question(table, question, model_type), + } + else: + context = data.get("data") + return { + "data": context, + "question": question, + "answer": self.answer_text_question(context, question), + } + + def answer_table_question(self, data: Dict[str, Any], question: str, model_type: str) -> dict: + """ + Answers a question based on the provided table. + + Args: + data (Dict[str, Any]): The table data; it is converted to a DataFrame with `pd.DataFrame.from_dict`. + question (str): The question to be answered. + model_type (str): The type of the model ('tapas' or 'tapex'). + + Returns: + dict: The predicted answers under 'answers' and the aggregation operation under 'aggregation'.
+ """ + + table = pd.DataFrame.from_dict(data) + if model_type == "tapas": + inputs = self.tokenizer(table=table, queries=[question], padding="max_length", return_tensors="pt") + + if next(self.model.parameters()).is_cuda: + inputs = {k: v.cuda() for k, v in inputs.items()} + outputs = self.model(**inputs) + + # Decode the predicted tokens + if hasattr(outputs, "logits_aggregation") and outputs.logits_aggregation is not None: + ( + predicted_answer_coordinates, + predicted_aggregation_indices, + ) = self.tokenizer.convert_logits_to_predictions( + {k: v.cpu() for k, v in inputs.items()}, + outputs.logits.detach().cpu(), + outputs.logits_aggregation.detach().cpu(), + ) + else: + predicted_answer_coordinates = self.tokenizer.convert_logits_to_predictions( + {k: v.cpu() for k, v in inputs.items()}, + outputs.logits.detach().cpu(), + ) + predicted_aggregation_indices = None + + cell_answers = [self._convert_coordinates_to_answer(table, x) for x in predicted_answer_coordinates[0]] + if type(cell_answers[0]) is list: + cell_answers = [y for x in cell_answers for y in x] # type: ignore + + if predicted_aggregation_indices: + aggregation_answer = self._convert_aggregation_to_answer(predicted_aggregation_indices[0]) + else: + aggregation_answer = "NONE" + return { + "answers": cell_answers, + "aggregation": aggregation_answer, + } + + elif model_type == "tapex": + encoding = self.tokenizer(table, question, return_tensors="pt") + if next(self.model.parameters()).is_cuda: + encoding = {k: v.cuda() for k, v in encoding.items()} + + outputs = self.model.generate(**encoding) + answers = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) + return { + "answers": answers, + "aggregation": "NONE", + } + else: + raise ValueError("Unsupported model type for table-based QA.") + + def _convert_aggregation_to_answer(self, aggregation_index: int) -> str: + """ + Converts the aggregation index predicted by TAPAS into an aggregation operation. 
+ + Args: + aggregation_index (int): The index of the aggregation operation. + + Returns: + str: The string representation of the aggregation operation. + """ + aggregation_operations = { + 0: "NONE", + 1: "SUM", + 2: "AVERAGE", + 3: "COUNT", + 4: "MIN", + 5: "MAX", + 6: "OR", + 7: "AND", + 8: "CONCAT", + 9: "FIRST", + 10: "LAST", + } + return aggregation_operations.get(aggregation_index, "NONE") + + def _convert_coordinates_to_answer(self, table: pd.DataFrame, coordinates: Any) -> List[str]: + """ + Converts the coordinates predicted by TAPAS into an answer string. + + Args: + table (pd.DataFrame): The table used for the QA. + coordinates (Any): The coordinates of the cells predicted as part of the answer. + + Returns: + List[str]: The answer strings. + """ + if type(coordinates) is tuple: + coordinates = [coordinates] + return [table.iat[coord] for coord in coordinates] + + def answer_text_question(self, context: str, question: str) -> dict: + inputs = self.tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt") + input_ids = inputs["input_ids"].tolist()[0] + + if next(self.model.parameters()).is_cuda: + inputs = {k: v.cuda() for k, v in inputs.items()} + + outputs = self.model(**inputs) + answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits + + answer_start = int(torch.argmax(answer_start_scores)) + answer_end = int(torch.argmax(answer_end_scores) + 1) + + answer = self.tokenizer.convert_tokens_to_string( + self.tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]) + ) + + return { + "answers": [answer], + "aggregation": "NONE", + } + + def initialize_pipeline(self): + """ + Lazy initialization of the QA Hugging Face pipeline. 
+ """ + self.model_type = "traditional" # Default model type + + if "tapas" in self.model_name.lower(): + self.model_type = "tapas" + elif "tapex" in self.model_name.lower(): + self.model_type = "tapex" + + if not self.hf_pipeline: + if self.model_type == "tapas" or self.model_type == "tapex": + model = AutoModelForTableQuestionAnswering.from_pretrained(self.model_name) + else: + model = AutoModelForQuestionAnswering.from_pretrained(self.model_name) + + tokenizer = AutoTokenizer.from_pretrained(self.model_name) + + if self.model_type == "tapas" or self.model_type == "tapex": + self.hf_pipeline = pipeline("table-question-answering", model=model, tokenizer=tokenizer) + else: + self.hf_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer) + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def answer_pipeline(self, **kwargs: Any) -> Dict[str, Any]: + """ + Answers questions using the Hugging Face pipeline based on the provided context. + + Args: + **kwargs (Any): Arbitrary keyword arguments, typically containing 'question' and 'data'. + + Returns: + Dict[str, Any]: A dictionary containing the question, context, and the answer. 
+ + Example CURL Request for QA: + ```bash + curl -X POST localhost:3000/api/v1/answer_pipeline \ + -H "Content-Type: application/json" \ + -d '{"question": "Who is the CEO of Tesla?", "data": "Elon Musk is the CEO of Tesla."}' + ``` + """ + self.initialize_pipeline() # Initialize the pipeline on first API hit + + data = cherrypy.request.json + question = data.get("question") + + if self.model_type in ["tapas", "tapex"]: + table = data.get("data") + result = self.hf_pipeline(table=table, query=question) # type: ignore + else: + context = data.get("data") + result = self.hf_pipeline(question=question, context=context) # type: ignore + + return {"question": question, "data": data.get("data"), "answer": result} diff --git a/geniusrise_text/qa/api.yml b/geniusrise_text/qa/api.yml new file mode 100644 index 0000000..4c1efbd --- /dev/null +++ b/geniusrise_text/qa/api.yml @@ -0,0 +1,122 @@ +openapi: 3.0.0 +info: + title: Question Answering API + description: API for performing various question answering tasks using pre-trained models, including text-based and table-based QA. + version: "1.0" +servers: + - url: http://localhost:3000/api/v1 + description: API server for question answering model processing +paths: + /answer: + post: + summary: Answers questions based on provided context or table + operationId: answerQuestion + tags: + - Question Answering + requestBody: + required: true + content: + application/json: + schema: + oneOf: + - title: Text-based QA + type: object + properties: + question: + type: string + description: The question to be answered based on the context. + data: + type: string + description: The textual context to answer the question from. + required: + - question + - data + - title: Table-based QA + type: object + properties: + question: + type: string + description: The question to be answered based on the table. + data: + type: array + items: + type: object + description: The tabular data to answer the question from, each object represents a row. 
+ required: + - question + - data + responses: + 200: + description: Successful response with the answer + content: + application/json: + schema: + type: object + properties: + question: + type: string + description: The question that was asked. + data: + type: object + description: The context or table provided for the question. + answer: + type: object + properties: + answers: + type: array + items: + type: string + description: The answers derived from the context or table. + aggregation: + type: string + description: The aggregation operation applied for table-based QA, if any. + 400: + description: Bad request, e.g., missing required fields + 500: + description: Internal server error, e.g., model failure + /answer_pipeline: + post: + summary: Answers questions using the Hugging Face pipeline + operationId: answerQuestionPipeline + tags: + - Question Answering + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + question: + type: string + description: The question to be answered. + data: + type: string + description: The textual context to answer the question from. + required: + - question + - data + responses: + 200: + description: Successful response with the answer + content: + application/json: + schema: + type: object + properties: + question: + type: string + description: The question that was asked. + data: + type: string + description: The context provided for the question. + answer: + type: object + properties: + answer: + type: string + description: The answer derived from the context. 
+ 400: + description: Bad request, e.g., missing required fields + 500: + description: Internal server error, e.g., model failure diff --git a/geniusrise_text/qa/bulk.py b/geniusrise_text/qa/bulk.py new file mode 100644 index 0000000..af19f38 --- /dev/null +++ b/geniusrise_text/qa/bulk.py @@ -0,0 +1,538 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import json +import os +import sqlite3 +import uuid +import xml.etree.ElementTree as ET +from typing import Any, Dict, List, Optional + +import pandas as pd +import torch +import yaml # type: ignore +from datasets import Dataset, load_from_disk +from geniusrise import BatchInput, BatchOutput, State +from pyarrow import feather +from pyarrow import parquet as pq + +from geniusrise_text.base import TextBulk + + +class QABulk(TextBulk): + r""" + QABulk is a class designed for managing bulk question-answering tasks using Hugging Face models. It is + capable of handling both traditional text-based QA and table-based QA (using TAPAS and TAPEX models), + providing a versatile solution for automated question answering at scale. + + Args: + input (BatchInput): Configuration and data inputs for batch processing. + output (BatchOutput): Configurations for output data handling. + state (State): State management for the bulk QA task. + **kwargs: Arbitrary keyword arguments for extended functionality. 
+ + Example CLI Usage: + ```bash + # For traditional text-based QA: + genius QABulk rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/qa-traditional \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/qa-traditional \ + postgres \ + --postgres_host 127.0.0.1 \ + --postgres_port 5432 \ + --postgres_user postgres \ + --postgres_password postgres \ + --postgres_database geniusrise\ + --postgres_table state \ + --id distilbert-base-uncased-distilled-squad-lol \ + answer_questions \ + --args \ + model_name="distilbert-base-uncased-distilled-squad" \ + model_class="AutoModelForQuestionAnswering" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="bfloat16" \ + quantization=0 \ + device_map="cuda:0" \ + max_memory=None \ + torchscript=False + + # For table-based QA using TAPAS: + genius QABulk rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/qa-table \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/qa-table \ + postgres \ + --postgres_host 127.0.0.1 \ + --postgres_port 5432 \ + --postgres_user postgres \ + --postgres_password postgres \ + --postgres_database geniusrise\ + --postgres_table state \ + --id google/tapas-base-finetuned-wtq-lol \ + answer_questions \ + --args \ + model_name="google/tapas-base-finetuned-wtq" \ + model_class="AutoModelForTableQuestionAnswering" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="float" \ + quantization=0 \ + device_map="cuda:0" \ + max_memory=None \ + torchscript=False + + # For table-based QA using TAPEX: + genius QABulk rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/qa-table \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/qa-table \ + postgres \ + --postgres_host 127.0.0.1 \ + --postgres_port 5432 \ + --postgres_user postgres \ + --postgres_password postgres \ + --postgres_database geniusrise\ + --postgres_table 
state \ + --id microsoft/tapex-large-finetuned-wtq-lol \ + answer_questions \ + --args \ + model_name="microsoft/tapex-large-finetuned-wtq" \ + model_class="AutoModelForSeq2SeqLM" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="float" \ + quantization=0 \ + device_map="cuda:0" \ + max_memory=None \ + torchscript=False + ``` + """ + + def __init__(self, input: BatchInput, output: BatchOutput, state: State, **kwargs) -> None: + """ + Initializes the QABulk class with configurations for input, output, and state. + + Args: + input (BatchInput): Configuration for the input data. + output (BatchOutput): Configuration for the output data. + state (State): State management for the QA task. + **kwargs (Any): Additional keyword arguments for extended functionality. + """ + super().__init__(input, output, state, **kwargs) + + def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs) -> Optional[Dataset]: + r""" + Load a dataset from a directory. + + ## Supported Data Formats and Structures: + + ### JSONL + Each line is a JSON object representing an example. + ```json + {"context": "The context content", "question": "The question"} + ``` + + ### CSV + Should contain 'context' and 'question' columns. + ```csv + context,question + "The context content","The question" + ``` + + ### Parquet + Should contain 'context' and 'question' columns. + + ### JSON + An array of dictionaries with 'context' and 'question' keys. + ```json + [{"context": "The context content", "question": "The question"}] + ``` + + ### XML + Each 'record' element should contain 'context' and 'question' elements. + ```xml + + The context content + The question + + ``` + + ### YAML + Each document should be a dictionary with 'context' and 'question' keys. + ```yaml + - context: "The context content" + question: "The question" + ``` + + ### TSV + Should contain 'context' and 'question' columns separated by tabs. 
+ + ### Excel (.xls, .xlsx) + Should contain 'context' and 'question' columns. + + ### SQLite (.db) + Should contain a table with 'context' and 'question' columns. + + ### Feather + Should contain 'context' and 'question' columns. + + Args: + dataset_path (str): The path to the dataset directory. + pad_on_right (bool): Whether to pad on the right. + max_length (int): The maximum length of the sequences. + doc_stride (int): The document stride. + evaluate_squadv2 (bool): Whether to evaluate using SQuAD v2 metrics. + + Returns: + Dataset: The loaded dataset. + """ + + self.max_length = max_length + + try: + self.log.info(f"Loading dataset from {dataset_path}") + if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): + return load_from_disk(dataset_path) + else: + data = [] + for filename in glob.glob(f"{dataset_path}/**/*", recursive=True): + filepath = os.path.join(dataset_path, filename) + if filename.endswith(".jsonl"): + with open(filepath, "r") as f: + for line in f: + example = json.loads(line) + data.append(example) + + elif filename.endswith(".csv"): + df = pd.read_csv(filepath) + data.extend(df.to_dict("records")) + + elif filename.endswith(".parquet"): + df = pq.read_table(filepath).to_pandas() + data.extend(df.to_dict("records")) + + elif filename.endswith(".json"): + with open(filepath, "r") as f: + json_data = json.load(f) + data.extend(json_data) + + elif filename.endswith(".xml"): + tree = ET.parse(filepath) + root = tree.getroot() + for record in root.findall("record"): + context = record.find("data").text # type: ignore + question = record.find("question").text # type: ignore + data.append({"data": context, "question": question}) + + elif filename.endswith(".yaml") or filename.endswith(".yml"): + with open(filepath, "r") as f: + yaml_data = yaml.safe_load(f) + data.extend(yaml_data) + + elif filename.endswith(".tsv"): + df = pd.read_csv(filepath, sep="\t") + data.extend(df.to_dict("records")) + + elif filename.endswith((".xls", 
".xlsx")): + df = pd.read_excel(filepath) + data.extend(df.to_dict("records")) + + elif filename.endswith(".db"): + conn = sqlite3.connect(filepath) + query = "SELECT data, question FROM dataset_table;" + df = pd.read_sql_query(query, conn) + data.extend(df.to_dict("records")) + + elif filename.endswith(".feather"): + df = feather.read_feather(filepath) + data.extend(df.to_dict("records")) + + return Dataset.from_pandas(pd.DataFrame(data)) + except Exception as e: + self.log.exception(f"Error occurred when loading dataset from {dataset_path}. Error: {e}") + raise + + def answer_questions( + self, + model_name: str, + model_class: str = "AutoModelForQuestionAnswering", + tokenizer_class: str = "AutoTokenizer", + use_cuda: bool = False, + precision: str = "float16", + quantization: int = 0, + device_map: str | Dict | None = "auto", + max_memory={0: "24GB"}, + torchscript: bool = False, + compile: bool = False, + awq_enabled: bool = False, + flash_attention: bool = False, + batch_size: int = 32, + notification_email: Optional[str] = None, + **kwargs: Any, + ) -> None: + """ + Perform bulk question-answering using the specified model and tokenizer. This method can handle various types + of QA models including traditional, TAPAS, and TAPEX. + + Args: + model_name (str): Name or path of the question-answering model. + model_class (str, optional): Class name of the model (e.g., "AutoModelForQuestionAnswering"). + tokenizer_class (str, optional): Class name of the tokenizer (e.g., "AutoTokenizer"). + use_cuda (bool, optional): Whether to use CUDA for model inference. Defaults to False. + precision (str, optional): Precision for model computation. Defaults to "float16". + quantization (int, optional): Level of quantization for optimizing model size and speed. Defaults to 0. + device_map (str | Dict | None, optional): Specific device to use for computation. Defaults to "auto". + max_memory (Dict, optional): Maximum memory configuration for devices. Defaults to {0: "24GB"}. 
+ torchscript (bool, optional): Whether to use a TorchScript-optimized version of the pre-trained language model. Defaults to False. + compile (bool, optional): Whether to compile the model before fine-tuning. Defaults to True. + awq_enabled (bool, optional): Whether to enable AWQ optimization. Defaults to False. + flash_attention (bool, optional): Whether to use flash attention optimization. Defaults to False. + batch_size (int, optional): Number of questions to process simultaneously. Defaults to 32. + **kwargs: Arbitrary keyword arguments for model and generation configurations. + + Processing: + The method processes the data in batches, utilizing the appropriate model based on the model name + and generating answers for the questions provided in the dataset. + """ + if ":" in model_name: + model_revision = model_name.split(":")[1] + tokenizer_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + tokenizer_name = model_name + else: + model_revision = None + tokenizer_revision = None + tokenizer_name = model_name + + self.model_name = model_name + self.tokenizer_name = tokenizer_name + self.model_revision = model_revision + self.tokenizer_revision = tokenizer_revision + self.model_class = model_class + self.tokenizer_class = tokenizer_class + self.use_cuda = use_cuda + self.precision = precision + self.quantization = quantization + self.device_map = device_map + self.max_memory = max_memory + self.torchscript = torchscript + self.awq_enabled = awq_enabled + self.flash_attention = flash_attention + self.batch_size = batch_size + self.notification_email = notification_email + self.compile = compile + + model_args = {k.replace("model_", ""): v for k, v in kwargs.items() if "model_" in k} + self.model_args = model_args + + self.model, self.tokenizer = self.load_models( + model_name=self.model_name, + tokenizer_name=self.tokenizer_name, + model_revision=self.model_revision, + tokenizer_revision=self.tokenizer_revision, + 
model_class=self.model_class, + tokenizer_class=self.tokenizer_class, + use_cuda=self.use_cuda, + precision=self.precision, + quantization=self.quantization, + device_map=self.device_map, + max_memory=self.max_memory, + torchscript=self.torchscript, + awq_enabled=self.awq_enabled, + flash_attention=self.flash_attention, + compile=self.compile, + **self.model_args, + ) + + dataset_path = self.input.input_folder + output_path = self.output.output_folder + + # Load dataset + dataset = self.load_dataset(dataset_path) + if dataset is None: + self.log.error("Failed to load dataset.") + return + + model_type = "traditional" + if "tapas" in self.model_name.lower(): + model_type = "tapas" + elif "tapex" in self.model_name.lower(): + model_type = "tapex" + + output_data = [] + for batch in range(0, len(dataset), self.batch_size): + batch_data = dataset[batch : batch + self.batch_size] + + if model_type == "traditional": + questions = batch_data["question"] + contexts = batch_data["data"] + + inputs = self.tokenizer( + questions, + contexts, + add_special_tokens=True, + return_tensors="pt", + truncation="only_second", + max_length=self.max_length, + ) + + # Move inputs to GPU if CUDA is available + if self.use_cuda and torch.cuda.is_available(): + inputs = {k: v.to("cuda") for k, v in inputs.items()} + + outputs = self.model(**inputs) + + answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits + answer_start = torch.argmax(answer_start_scores, dim=1) + answer_end = torch.argmax(answer_end_scores, dim=1) + 1 + + for i in range(outputs.start_logits.shape[0]): + answer = self.tokenizer.convert_tokens_to_string( + self.tokenizer.convert_ids_to_tokens( + inputs["input_ids"][i][int(answer_start[i]) : int(answer_end[i])] + ) + ) + output_data.append( + { + "data": contexts[i], + "question": questions[i], + "answer": answer, + } + ) + elif model_type == "tapas": + questions = batch_data["question"] + tables = [pd.DataFrame.from_dict(json.loads(x)) for x in 
batch_data["data"]] + + for table, question in zip(tables, questions): + inputs = self.tokenizer(table=table, queries=[question], padding="max_length", return_tensors="pt") + + if next(self.model.parameters()).is_cuda: + inputs = {k: v.cuda() for k, v in inputs.items()} + outputs = self.model(**inputs) + + # Decode the predicted tokens + if hasattr(outputs, "logits_aggregation") and outputs.logits_aggregation is not None: + ( + predicted_answer_coordinates, + predicted_aggregation_indices, + ) = self.tokenizer.convert_logits_to_predictions( + {k: v.cpu() for k, v in inputs.items()}, + outputs.logits.detach().cpu(), + outputs.logits_aggregation.detach().cpu(), + ) + else: + predicted_answer_coordinates = self.tokenizer.convert_logits_to_predictions( + {k: v.cpu() for k, v in inputs.items()}, + outputs.logits.detach().cpu(), + ) + predicted_aggregation_indices = None + + cell_answers = [ + self._convert_coordinates_to_answer(table, x) for x in predicted_answer_coordinates[0] + ] + if type(cell_answers[0]) is list: + cell_answers = [y for x in cell_answers for y in x] # type: ignore + + if predicted_aggregation_indices: + aggregation_answer = self._convert_aggregation_to_answer(predicted_aggregation_indices[0]) + else: + aggregation_answer = "NONE" + output_data.append( + { + "data": table.to_dict("records"), + "question": question, + "answers": cell_answers, + "aggregation": aggregation_answer, + } + ) + + elif model_type == "tapex": + questions = batch_data["question"] + tables = [pd.DataFrame.from_dict(json.loads(x)) for x in batch_data["data"]] + + for table, question in zip(tables, questions): + encoding = self.tokenizer(table, question, return_tensors="pt") + if next(self.model.parameters()).is_cuda: + encoding = {k: v.cuda() for k, v in encoding.items()} + + outputs = self.model.generate(**encoding) + answers = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) + output_data.append( + { + "data": table.to_dict("records"), + "question": question, + 
"answers": answers, + "aggregation": "NONE", + } + ) + + # Save the results + output_file = os.path.join(output_path, f"qa_results-{str(uuid.uuid4())}.json") + with open(output_file, "w") as file: + json.dump(output_data, file) + self.done() + self.log.info(f"Results saved to {output_file}") + + def _convert_aggregation_to_answer(self, aggregation_index: int) -> str: + """ + Converts the aggregation index predicted by TAPAS into an aggregation operation. + + Args: + aggregation_index (int): The index of the aggregation operation. + + Returns: + str: The string representation of the aggregation operation. + """ + aggregation_operations = { + 0: "NONE", + 1: "SUM", + 2: "AVERAGE", + 3: "COUNT", + 4: "MIN", + 5: "MAX", + 6: "OR", + 7: "AND", + 8: "CONCAT", + 9: "FIRST", + 10: "LAST", + } + return aggregation_operations.get(aggregation_index, "NONE") + + def _convert_coordinates_to_answer(self, table: pd.DataFrame, coordinates: Any) -> List[str]: + """ + Converts the coordinates predicted by TAPAS into an answer string. + + Args: + table (pd.DataFrame): The table used for the QA. + coordinates (Any): The coordinates of the cells predicted as part of the answer. + + Returns: + List[str]: The answer strings. + """ + if type(coordinates) is tuple: + coordinates = [coordinates] + return [table.iat[coord] for coord in coordinates] diff --git a/huggingface/question_answering.py b/geniusrise_text/qa/fine_tune.py similarity index 82% rename from huggingface/question_answering.py rename to geniusrise_text/qa/fine_tune.py index 53e0060..8694a57 100644 --- a/huggingface/question_answering.py +++ b/geniusrise_text/qa/fine_tune.py @@ -1,18 +1,17 @@ # 🧠 Geniusrise # Copyright (C) 2023 geniusrise.ai # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. +# http://www.apache.org/licenses/LICENSE-2.0 # -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import ast import json @@ -25,17 +24,17 @@ import numpy as np import pandas as pd import yaml # type: ignore -from datasets import Dataset, load_from_disk +from datasets import Dataset, load_dataset, load_from_disk from geniusrise.core import BatchInput, BatchOutput, State from pyarrow import feather from pyarrow import parquet as pq from sklearn.metrics import accuracy_score from transformers import EvalPrediction -from .base import HuggingFaceFineTuner +from geniusrise_text.base import TextFineTuner -class HuggingFaceQuestionAnsweringFineTuner(HuggingFaceFineTuner): +class QAFineTuner(TextFineTuner): r""" A bolt for fine-tuning Hugging Face models on question answering tasks. @@ -44,62 +43,22 @@ class HuggingFaceQuestionAnsweringFineTuner(HuggingFaceFineTuner): output (OutputConfig): The output data. state (State): The state manager. 
- ## Using geniusrise to invoke via command line - ```bash - genius HuggingFaceQuestionAnsweringFineTuner rise \ - batch \ - --input_bucket my_bucket \ - --input_folder my_folder \ - batch \ - --output_bucket my_output_bucket \ - --output_folder my_output_folder \ - postgres \ - --postgres_host 127.0.0.1 \ - --postgres_port 5432 \ - --postgres_user postgres \ - --postgres_password postgres \ - --postgres_database geniusrise \ - --postgres_table state \ - load_dataset \ - --args dataset_path=my_dataset_path max_length=512 - ``` + CLI Usage: - ## Using geniusrise to invoke via YAML file - ```yaml - version: "1" - bolts: - my_huggingface_bolt: - name: "HuggingFaceQuestionAnsweringFineTuner" - method: "load_dataset" - args: - dataset_path: "my_dataset_path" - max_length: 512 - input: - type: "batch" - args: - bucket: "my_bucket" - folder: "my_folder" - output: - type: "batch" - args: - bucket: "my_output_bucket" - folder: "my_output_folder" - state: - type: "postgres" - args: - postgres_host: "127.0.0.1" - postgres_port: 5432 - postgres_user: "postgres" - postgres_password: "postgres" - postgres_database: "geniusrise" - postgres_table: "state" - deploy: - type: "k8s" - args: - name: "my_huggingface_bolt" - namespace: "default" - image: "my_huggingface_bolt_image" - replicas: 1 + ```bash + genius QAFineTuner rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id microsoft/tapex-large-finetuned-wtq-lol \ + fine_tune \ + --args \ + model_name=my_model \ + tokenizer_name=my_tokenizer \ + num_train_epochs=3 \ + per_device_train_batch_size=8 ``` """ @@ -130,7 +89,7 @@ def __init__( input=input, output=output, state=state, - eval=eval, + evaluate=evaluate, **kwargs, ) @@ -221,7 +180,9 @@ def load_dataset( # Load the dataset from the directory try: - if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): + if self.use_huggingface_dataset: + dataset = load_dataset(self.huggingface_dataset) + elif 
os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): dataset = load_from_disk(dataset_path) else: data = [] @@ -274,11 +235,18 @@ def load_dataset( elif filename.endswith(".feather"): df = feather.read_feather(filepath) data.extend(df.to_dict("records")) + dataset = Dataset.from_pandas(pd.DataFrame(data)) except Exception as e: self.log.exception(f"Error occurred when loading dataset from {dataset_path}. Error: {e}") return None + if hasattr(self, "map_data") and self.map_data: + fn = eval(self.map_data) # type: ignore + dataset = dataset.map(fn) + else: + dataset = dataset + # Preprocess the dataset try: tokenized_dataset = dataset.map( @@ -293,7 +261,7 @@ def load_dataset( return tokenized_dataset def prepare_train_features( - self, examples: Dict[str, Union[str, List[str]]] + self, examples: Dict[str, Union[str, List[str]]], cls_token_id: Optional[int] = None ) -> Optional[Dict[str, Union[List[int], List[List[int]]]]]: """ Tokenize our examples with truncation and padding, but keep the overflows using a stride. @@ -311,6 +279,9 @@ def prepare_train_features( ast.literal_eval(example) if type(example) is str else example for example in examples["answers"] ] + if not cls_token_id and not self.tokenizer.cls_token_id: + self.tokenizer.cls_token_id = self.tokenizer.eos_token_id + # Tokenize the examples try: tokenized_examples = self.tokenizer( @@ -335,7 +306,7 @@ def prepare_train_features( # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") - # Let's label those examples! 
+ # Let's label those examples tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] diff --git a/geniusrise_text/qa/tests/__init__.py b/geniusrise_text/qa/tests/__init__.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ b/geniusrise_text/qa/tests/__init__.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/qa/tests/test_fine_tune.py b/geniusrise_text/qa/tests/test_fine_tune.py new file mode 100644 index 0000000..03b934d --- /dev/null +++ b/geniusrise_text/qa/tests/test_fine_tune.py @@ -0,0 +1,281 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import sqlite3 +import tempfile +import xml.etree.ElementTree as ET + +import numpy as np +import pandas as pd +import pytest +import torch +import yaml # type: ignore +from datasets import Dataset +from geniusrise.core import BatchInput, BatchOutput, InMemoryState +from pyarrow import feather +from pyarrow import parquet as pq +from transformers import EvalPrediction + +from geniusrise_text.qa.fine_tune import QAFineTuner + +lora_config = { + "r": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "bias": "none", + "task_type": "CAUSAL_LM", +} + + +# Helper function to create synthetic data in different formats +def create_dataset_in_format(directory, ext): + os.makedirs(directory, exist_ok=True) + data = [ + { + "context": f"context_{i}", + "question": f"question_{i}", + "answers": [{"answer_start": [0], "text": [f"answer_{i}"]}], + } + for i in range(10) + ] + df = pd.DataFrame(data) + + if ext == "huggingface": + dataset = Dataset.from_pandas(df) + dataset.save_to_disk(directory) + elif ext == "csv": + df.to_csv(os.path.join(directory, "data.csv"), index=False) + elif ext == "jsonl": + with open(os.path.join(directory, "data.jsonl"), "w") as f: + for item in data: + f.write(json.dumps(item) + "\n") + elif ext == "parquet": + pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) + elif ext == "json": + with open(os.path.join(directory, "data.json"), "w") as f: + json.dump(data, f) + elif ext == "xml": + root = ET.Element("root") + for item in data: + record = ET.SubElement(root, "record") + ET.SubElement(record, "context").text = item["context"] + ET.SubElement(record, "question").text = item["question"] + ET.SubElement(record, "answers").text = str(item["answers"]) + tree = ET.ElementTree(root) + tree.write(os.path.join(directory, "data.xml")) + elif ext == "yaml": + with open(os.path.join(directory, "data.yaml"), "w") as f: + yaml.dump(data, f) + elif ext == "tsv": + df.to_csv(os.path.join(directory, 
"data.tsv"), index=False, sep="\t") + elif ext == "xlsx": + df.to_excel(os.path.join(directory, "data.xlsx"), index=False) + elif ext == "db": + conn = sqlite3.connect(os.path.join(directory, "data.db")) + df["answers"] = df["answers"].apply(str) + df.to_sql("dataset_table", conn, if_exists="replace", index=False) + conn.close() + elif ext == "feather": + feather.write_feather(df, os.path.join(directory, "data.feather")) + + +# Fixtures for each file type +@pytest.fixture( + params=[ + "huggingface", + "csv", + # "db", + # "xml", + # "jsonl", + # "parquet", + # "json", + # "yaml", + # "tsv", + # "xlsx", + # "feather", + ] +) +def dataset_file(request, tmpdir): + ext = request.param + create_dataset_in_format(tmpdir + "/train", ext) + create_dataset_in_format(tmpdir + "/eval", ext) + return tmpdir, ext + + +MODELS_TO_TEST = { + # fmt: off + "small": "facebook/bart-base", + # fmt: on +} + + +# Fixture for models +@pytest.fixture(params=MODELS_TO_TEST.items()) +def model(request): + return request.param + + +@pytest.fixture +def question_answering_bolt(): + input_dir = tempfile.mkdtemp() + output_dir = tempfile.mkdtemp() + input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input") + output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output") + state = InMemoryState() + klass = QAFineTuner( + input=input, + output=output, + state=state, + ) + return klass + + +# def test_question_answering_bolt_init(question_answering_bolt, model): +# name, model_name = model +# tokenizer_name = model_name +# model_class = "AutoModelForQuestionAnswering" +# tokenizer_class = "AutoTokenizer" + +# question_answering_bolt.load_models( +# model_name=model_name, +# tokenizer_name=tokenizer_name, +# model_class=model_class, +# tokenizer_class=tokenizer_class, +# device_map=None, +# ) + +# assert question_answering_bolt.model is not None +# assert question_answering_bolt.tokenizer is not None +# assert question_answering_bolt.input is not None +# assert 
# Models to test
models = {
    # fmt: off
    "bart": "facebook/bart-large-cnn",
    "distillbart": "sshleifer/distilbart-cnn-12-6",
    "flan": "google/flan-t5-xl",
    "t5": "google/t5-v1_1-xl",
    "mt5": "google/umt5-xl",
    "pegasus": "google/pegasus-x-large",
    "pegasus-news": "google/pegasus-multi_news",
    "bigbird-arxiv": "google/bigbird-pegasus-large-arxiv",
    "bigbird-bigpatent": "google/bigbird-pegasus-large-bigpatent",
    "bigbird-pubmed": "google/bigbird-pegasus-large-pubmed",
    # fmt: on
}

# Files that fine-tuning may leave in "<output_folder>/model". A full fine-tune
# and a LoRA/adapter run write different subsets, so each one is optional.
_MODEL_ARTIFACTS = (
    "pytorch_model.bin",
    "adapter_model.bin",
    "config.json",
    "adapter_config.json",
    "training_args.bin",
)


def _release_model(bolt):
    """Drop the bolt's model/tokenizer references (if any) and free cached CUDA memory."""
    for attr in ("model", "tokenizer"):
        try:
            delattr(bolt, attr)
        except AttributeError:
            # fine_tune() may have failed before the attribute was created;
            # cleanup must never mask the original failure.
            pass
    torch.cuda.empty_cache()


# Test for fine-tuning
# NOTE(review): `lora_config` in the parameter list below refers to a module-level
# name that is expected to be defined earlier in this file (not visible here).
@pytest.mark.parametrize(
    "model_name, precision, quantization, lora_config, use_accelerate",
    [
        # small
        (models["bart"], "bfloat16", None, None, False),
        (models["distillbart"], "bfloat16", None, None, False),
        (models["flan"], "bfloat16", 8, lora_config, False),
        (models["t5"], "bfloat16", 8, lora_config, False),
        (models["mt5"], "bfloat16", 8, lora_config, False),
        (models["pegasus"], "bfloat16", 8, lora_config, False),
        (models["pegasus-news"], "bfloat16", None, None, False),
        (models["bigbird-arxiv"], "bfloat16", None, None, False),
        (models["bigbird-bigpatent"], "bfloat16", None, None, False),
        (models["bigbird-pubmed"], "bfloat16", None, None, False),
    ],
)
def test_question_answering_bolt_fine_tune(
    question_answering_bolt, dataset_file, model_name, precision, quantization, lora_config, use_accelerate
):
    """Fine-tune for one epoch and check that the expected model artifacts are written.

    Cleanup (releasing the model, emptying the CUDA cache, deleting artifacts)
    runs in ``finally`` so it happens on success and on failure alike, and so a
    cleanup problem can never replace the real test failure.
    """
    tmpdir, _ext = dataset_file
    question_answering_bolt.input.input_folder = tmpdir
    model_dir = os.path.join(question_answering_bolt.output.output_folder, "model")

    try:
        question_answering_bolt.fine_tune(
            model_name=model_name,
            tokenizer_name=model_name,
            model_class="AutoModelForQuestionAnswering",
            tokenizer_class="AutoTokenizer",
            num_train_epochs=1,
            per_device_batch_size=2,
            precision=precision,
            quantization=quantization,
            lora_config=lora_config,
            use_accelerate=use_accelerate,
            device_map=None,
        )

        # Either the full weights/config or their adapter counterparts must exist.
        assert os.path.exists(os.path.join(model_dir, "pytorch_model.bin")) or os.path.exists(
            os.path.join(model_dir, "adapter_model.bin")
        )
        assert os.path.exists(os.path.join(model_dir, "config.json")) or os.path.exists(
            os.path.join(model_dir, "adapter_config.json")
        )
        assert os.path.exists(os.path.join(model_dir, "training_args.bin"))
    finally:
        _release_model(question_answering_bolt)
        # Remove every artifact independently: a missing file must not stop the
        # remaining ones from being deleted.
        for artifact in _MODEL_ARTIFACTS:
            try:
                os.remove(os.path.join(model_dir, artifact))
            except OSError:
                pass


# Test for computing metrics
def test_question_answering_bolt_compute_metrics(question_answering_bolt):
    """compute_metrics must report an ``accuracy`` entry for simple two-class logits."""
    logits = np.array([[0.6, 0.4], [0.4, 0.6]])
    labels = np.array([0, 1])
    eval_pred = EvalPrediction(predictions=logits, label_ids=labels)
    metrics = question_answering_bolt.compute_metrics(eval_pred)
    assert "accuracy" in metrics
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Any, Dict

import cherrypy
from geniusrise import BatchInput, BatchOutput, State
from geniusrise.logging import setup_logger
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

from geniusrise_text.base import TextAPI

log = logging.getLogger(__name__)


class SummarizationAPI(TextAPI):
    r"""
    A class for serving a Hugging Face-based summarization model. This API provides an interface to
    submit text and receive a summarized version, utilizing state-of-the-art machine learning models for
    text summarization.

    Attributes:
        model (AutoModelForSeq2SeqLM): The loaded Hugging Face model for summarization.
        tokenizer (AutoTokenizer): The tokenizer for preprocessing text.
        hf_pipeline: Lazily created Hugging Face ``summarization`` pipeline (``None`` until first use).

    Methods:
        summarize(self, **kwargs: Any) -> Dict[str, Any]:
            Summarizes the input text based on the given parameters.
        summarize_pipeline(self, **kwargs: Any) -> Dict[str, Any]:
            Summarizes the input text through the Hugging Face pipeline.

    CLI Usage:
    ```bash
    genius SummarizationAPI rise \
        batch \
            --input_folder ./input \
        batch \
            --output_folder ./output \
        none \
        --id facebook/bart-large-cnn-lol \
        listen \
            --args \
                model_name="facebook/bart-large-cnn" \
                model_class="AutoModelForSeq2SeqLM" \
                tokenizer_class="AutoTokenizer" \
                use_cuda=True \
                precision="float" \
                quantization=0 \
                device_map="cuda:0" \
                max_memory=None \
                torchscript=False \
                endpoint="*" \
                port=3000 \
                cors_domain="http://localhost:3000" \
                username="user" \
                password="password"
    ```
    """

    def __init__(
        self,
        input: BatchInput,
        output: BatchOutput,
        state: State,
        **kwargs: Any,
    ) -> None:
        """
        Initializes the SummarizationAPI class with input, output, and state configurations.

        Args:
            input (BatchInput): Configuration for input data.
            output (BatchOutput): Configuration for output data.
            state (State): State management for API.
            **kwargs (Any): Additional keyword arguments for extended functionality.
        """
        super().__init__(input=input, output=output, state=state)
        self.log = setup_logger(self)
        # `initialize_pipeline` / `summarize_pipeline` read `hf_pipeline`, so that is
        # the attribute that has to exist before the first request arrives.
        self.hf_pipeline = None
        # Kept so that any code still reading the old attribute name keeps working.
        self.pipeline = None

    @cherrypy.expose
    @cherrypy.tools.json_in()
    @cherrypy.tools.json_out()
    @cherrypy.tools.allow(methods=["POST"])
    def summarize(self, **kwargs: Any) -> Dict[str, Any]:
        r"""
        Summarizes the input text based on the given parameters using a machine learning model. The method
        accepts parameters via a POST request and returns the summarized text.

        The JSON body must contain ``text``. ``decoding_strategy`` (default ``"generate"``) selects the
        decoding routine; every other key is forwarded unchanged to ``self.generate`` as a generation
        parameter.

        Args:
            **kwargs (Any): Arbitrary keyword arguments. Expected to receive these from the POST request's JSON body.

        Returns:
            Dict[str, Any]: A dictionary containing the input text and its summary.

        Raises:
            cherrypy.HTTPError: 400 when the request body has no (or an empty) ``text`` field.

        Example CURL Requests:
        ```bash
        /usr/bin/curl -X POST localhost:3000/api/v1/summarize \
            -H "Content-Type: application/json" \
            -d '{
                "text": "Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me.",
                "decoding_strategy": "generate",
                "bos_token_id": 0,
                "decoder_start_token_id": 2,
                "early_stopping": true,
                "eos_token_id": 2,
                "forced_bos_token_id": 0,
                "forced_eos_token_id": 2,
                "length_penalty": 2.0,
                "max_length": 142,
                "min_length": 56,
                "no_repeat_ngram_size": 3,
                "num_beams": 4,
                "pad_token_id": 1,
                "do_sample": false
            }' | jq
        ```

        ```bash
        /usr/bin/curl -X POST localhost:3000/api/v1/summarize \
            -H "Content-Type: application/json" \
            -d '{
                "text": "Theres something magical about Recurrent Neural Networks (RNNs). I still remember when I trained my first recurrent network for Image Captioning. Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience Ive in fact reached the opposite conclusion). Fast forward about a year: Im training RNNs all the time and Ive witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me.",
                "decoding_strategy": "generate",
                "early_stopping": true,
                "length_penalty": 2.0,
                "max_length": 142,
                "min_length": 56,
                "no_repeat_ngram_size": 3,
                "num_beams": 4
            }' | jq
        ```
        """
        data = cherrypy.request.json
        text = data.get("text")
        if not text:
            raise cherrypy.HTTPError(400, "The 'text' field is required.")
        decoding_strategy = data.get("decoding_strategy", "generate")

        # Build a filtered copy instead of mutating the request body: "text" is
        # already passed as `prompt` and "decoding_strategy" as its own argument,
        # so neither may leak into the generation keyword arguments.
        generation_params = {k: v for k, v in data.items() if k not in ("text", "decoding_strategy")}

        summary = self.generate(prompt=text, decoding_strategy=decoding_strategy, **generation_params)

        return {"input": text, "summary": summary}

    def initialize_pipeline(self):
        """
        Lazy initialization of the summarization Hugging Face pipeline.

        Loads the model and tokenizer named by ``self.model_name`` on the first call and caches the
        resulting pipeline in ``self.hf_pipeline``; later calls are no-ops.
        """
        if self.hf_pipeline is None:
            model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.hf_pipeline = pipeline("summarization", model=model, tokenizer=tokenizer)

    @cherrypy.expose
    @cherrypy.tools.json_in()
    @cherrypy.tools.json_out()
    @cherrypy.tools.allow(methods=["POST"])
    def summarize_pipeline(self, **kwargs: Any) -> Dict[str, Any]:
        """
        Summarizes the input text using the Hugging Face pipeline based on given parameters.

        The JSON body must contain ``text``; every other key is forwarded to the pipeline call as a
        keyword argument.

        Args:
            **kwargs: Keyword arguments containing parameters for summarization.

        Returns:
            A dictionary containing the input text and the pipeline's result under ``summary``.

        Raises:
            cherrypy.HTTPError: 400 when the request body has no (or an empty) ``text`` field.

        Example CURL Request for summarization:
        `curl -X POST localhost:3000/api/v1/summarize_pipeline -H "Content-Type: application/json" -d '{"text": "Your long text here"}'`
        """
        data = cherrypy.request.json
        text = data.get("text")
        if not text:
            raise cherrypy.HTTPError(400, "The 'text' field is required.")

        self.initialize_pipeline()  # Initialize the pipeline on first API hit

        generation_params = {k: v for k, v in data.items() if k != "text"}

        result = self.hf_pipeline(text, **generation_params)  # type: ignore

        return {"input": text, "summary": result}
+ 400: + description: Invalid input provided + 500: + description: Error in processing the request + + /summarize_pipeline: + post: + summary: Summarizes provided text using the Hugging Face summarization pipeline + operationId: summarizeTextPipeline + tags: + - Summarization Pipeline + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: The text to be summarized. + otherParams: + type: object + description: Additional parameters for the pipeline, specified as key-value pairs. + required: + - text + responses: + 200: + description: Text successfully summarized using the pipeline + content: + application/json: + schema: + type: object + properties: + input: + type: string + description: The original input text. + summary: + type: string + description: The summarized text. + 400: + description: Invalid input provided + 500: + description: Error in processing the request diff --git a/geniusrise_text/summarization/bulk.py b/geniusrise_text/summarization/bulk.py new file mode 100644 index 0000000..7fbea6a --- /dev/null +++ b/geniusrise_text/summarization/bulk.py @@ -0,0 +1,358 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import glob
import json
import os
import sqlite3
import uuid
import xml.etree.ElementTree as ET
from contextlib import closing
from typing import Any, Dict, List, Optional

import pandas as pd
import yaml  # type: ignore
from datasets import Dataset, load_from_disk
from geniusrise import BatchInput, BatchOutput, State
from pyarrow import feather
from pyarrow import parquet as pq

from geniusrise_text.base import TextBulk


class SummarizationBulk(TextBulk):
    r"""
    SummarizationBulk is a class for managing bulk text summarization tasks using Hugging Face models. It is
    designed to handle large-scale summarization tasks efficiently and effectively, utilizing state-of-the-art
    machine learning models to provide high-quality summaries.

    The class provides methods to load datasets, configure summarization models, and execute bulk summarization tasks.

    Example CLI Usage:
    ```bash
    genius SummarizationBulk rise \
        batch \
            --input_s3_bucket geniusrise-test \
            --input_s3_folder input/summz \
        batch \
            --output_s3_bucket geniusrise-test \
            --output_s3_folder output/summz \
        postgres \
            --postgres_host 127.0.0.1 \
            --postgres_port 5432 \
            --postgres_user postgres \
            --postgres_password postgres \
            --postgres_database geniusrise \
            --postgres_table state \
        --id facebook/bart-large-cnn-lol \
        summarize \
            --args \
                model_name="facebook/bart-large-cnn" \
                model_class="AutoModelForSeq2SeqLM" \
                tokenizer_class="AutoTokenizer" \
                use_cuda=True \
                precision="float" \
                quantization=0 \
                device_map="cuda:0" \
                max_memory=None \
                torchscript=False \
                generation_bos_token_id=0 \
                generation_decoder_start_token_id=2 \
                generation_early_stopping=true \
                generation_eos_token_id=2 \
                generation_forced_bos_token_id=0 \
                generation_forced_eos_token_id=2 \
                generation_length_penalty=2.0 \
                generation_max_length=142 \
                generation_min_length=56 \
                generation_no_repeat_ngram_size=3 \
                generation_num_beams=4 \
                generation_pad_token_id=1 \
                generation_do_sample=false
    ```
    """

    def __init__(self, input: BatchInput, output: BatchOutput, state: State, **kwargs) -> None:
        """
        Initializes the SummarizationBulk class.

        Args:
            input (BatchInput): The input data configuration.
            output (BatchOutput): The output data configuration.
            state (State): The state configuration.
            **kwargs: Additional keyword arguments.
        """
        super().__init__(input, output, state, **kwargs)

    def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs) -> Optional[Dataset]:
        r"""
        Load a dataset from a directory.

        If the directory holds a saved Hugging Face dataset (it contains ``dataset_info.json``) it is
        loaded with ``load_from_disk``. Otherwise every file below the directory is read according to
        its extension and all records are combined into a single dataset. When the instance has a
        truthy ``map_data`` attribute, it is evaluated as a Python expression and applied to every
        record before the dataset is built.

        Args:
            dataset_path (str): The path to the dataset directory.
            max_length (int): Accepted for interface compatibility; not used while loading.
            **kwargs: Additional keyword arguments (ignored).

        Returns:
            Dataset: The loaded dataset.

        Raises:
            Exception: Any error raised while reading or parsing is logged and re-raised.

        ## Supported Data Formats and Structures:

        ### JSONL
        Each line is a JSON object representing an example.
        ```json
        {"text": "The text content"}
        ```

        ### CSV
        Should contain a 'text' column.
        ```csv
        text
        "The text content"
        ```

        ### Parquet
        Should contain a 'text' column.

        ### JSON
        An array of dictionaries with a 'text' key.
        ```json
        [{"text": "The text content"}]
        ```

        ### XML
        Each 'record' element should contain 'text' child element.
        ```xml
        <record>
            <text>The text content</text>
        </record>
        ```

        ### YAML
        Each document should be a dictionary with a 'text' key.
        ```yaml
        - text: "The text content"
        ```

        ### TSV
        Should contain a 'text' column.

        ### Excel (.xls, .xlsx)
        Should contain a 'text' column.

        ### SQLite (.db)
        Should contain a table named 'dataset_table' with a 'text' column.

        ### Feather
        Should contain a 'text' column.
        """
        try:
            if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")):
                dataset = load_from_disk(dataset_path)
            else:
                data = []
                # glob already returns paths that include `dataset_path`; joining the
                # directory onto them again breaks every relative input folder.
                for filepath in glob.glob(f"{dataset_path}/**/*", recursive=True):
                    if filepath.endswith(".jsonl"):
                        with open(filepath, "r") as f:
                            data.extend(json.loads(line) for line in f)
                    elif filepath.endswith(".csv"):
                        df = pd.read_csv(filepath)
                        data.extend(df.to_dict("records"))
                    elif filepath.endswith(".parquet"):
                        df = pq.read_table(filepath).to_pandas()
                        data.extend(df.to_dict("records"))
                    elif filepath.endswith(".json"):
                        with open(filepath, "r") as f:
                            data.extend(json.load(f))
                    elif filepath.endswith(".xml"):
                        root = ET.parse(filepath).getroot()
                        for record in root.findall("record"):
                            text = record.find("text").text  # type: ignore
                            data.append({"text": text})
                    elif filepath.endswith((".yaml", ".yml")):
                        with open(filepath, "r") as f:
                            data.extend(yaml.safe_load(f))
                    elif filepath.endswith(".tsv"):
                        df = pd.read_csv(filepath, sep="\t")
                        data.extend(df.to_dict("records"))
                    elif filepath.endswith((".xls", ".xlsx")):
                        df = pd.read_excel(filepath)
                        data.extend(df.to_dict("records"))
                    elif filepath.endswith(".db"):
                        # sqlite3's own context manager only manages transactions, so
                        # `closing` is what actually releases the connection.
                        with closing(sqlite3.connect(filepath)) as conn:
                            df = pd.read_sql_query("SELECT text FROM dataset_table;", conn)
                        data.extend(df.to_dict("records"))
                    elif filepath.endswith(".feather"):
                        df = feather.read_feather(filepath)
                        data.extend(df.to_dict("records"))

                if hasattr(self, "map_data") and self.map_data:
                    # NOTE(security): `map_data` is evaluated as arbitrary Python code.
                    # It must only ever come from trusted configuration.
                    fn = eval(self.map_data)  # type: ignore
                    data = [fn(d) for d in data]

                dataset = Dataset.from_pandas(pd.DataFrame(data))

            return dataset

        except Exception as e:
            self.log.exception(f"Error occurred when loading dataset from {dataset_path}. Error: {e}")
            raise

    def summarize(
        self,
        model_name: str,
        model_class: str = "AutoModelForSeq2SeqLM",
        tokenizer_class: str = "AutoTokenizer",
        use_cuda: bool = False,
        precision: str = "float16",
        quantization: int = 0,
        device_map: str | Dict | None = "auto",
        max_memory={0: "24GB"},
        torchscript: bool = False,
        compile: bool = False,
        awq_enabled: bool = False,
        flash_attention: bool = False,
        batch_size: int = 32,
        max_length: int = 512,
        notification_email: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Perform bulk summarization using the specified model and tokenizer. This method handles the entire summarization
        process including loading the model, processing input data, generating summaries, and saving the results.

        The dataset is read from ``self.input.input_folder``; one JSON file per batch is written to
        ``self.output.output_folder``.

        Args:
            model_name (str): Name or path of the summarization model. An optional ``:<revision>`` suffix
                selects the model and tokenizer revision.
            model_class (str): Class name of the model (default "AutoModelForSeq2SeqLM").
            tokenizer_class (str): Class name of the tokenizer (default "AutoTokenizer").
            use_cuda (bool): Whether to use CUDA for model inference (default False).
            precision (str): Precision for model computation (default "float16").
            quantization (int): Level of quantization for optimizing model size and speed (default 0).
            device_map (str | Dict | None): Specific device to use for computation (default "auto").
            max_memory (Dict): Maximum memory configuration for devices.
            torchscript (bool): Whether to use a TorchScript-optimized version of the pre-trained language model (default False).
            compile (bool): Whether to compile the model after loading (default False).
            awq_enabled (bool): Whether to enable AWQ optimization (default False).
            flash_attention (bool): Whether to use flash attention optimization (default False).
            batch_size (int): Number of texts to summarize per batch (default 32).
            max_length (int): Forwarded to ``load_dataset`` (default 512).
            notification_email (Optional[str]): Stored on the instance; not otherwise used by this method.
            **kwargs: Keys starting with ``model_`` are passed to ``load_models`` and keys starting with
                ``generation_`` are passed to ``model.generate``, in both cases with the prefix removed.
        """
        # "name:revision" selects a specific revision for both model and tokenizer.
        if ":" in model_name:
            name_parts = model_name.split(":")
            model_name = name_parts[0]
            model_revision = tokenizer_revision = name_parts[1]
        else:
            model_revision = None
            tokenizer_revision = None
        tokenizer_name = model_name

        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.model_revision = model_revision
        self.tokenizer_revision = tokenizer_revision
        self.model_class = model_class
        self.tokenizer_class = tokenizer_class
        self.use_cuda = use_cuda
        self.precision = precision
        self.quantization = quantization
        self.device_map = device_map
        self.max_memory = max_memory
        self.torchscript = torchscript
        self.awq_enabled = awq_enabled
        self.flash_attention = flash_attention
        self.batch_size = batch_size
        self.notification_email = notification_email
        self.compile = compile

        # Match on the prefix only: a substring test would also pick up unrelated keys
        # that merely contain "model_" / "generation_" and mangle their names.
        self.model_args = {k.removeprefix("model_"): v for k, v in kwargs.items() if k.startswith("model_")}
        self.generation_args = {
            k.removeprefix("generation_"): v for k, v in kwargs.items() if k.startswith("generation_")
        }

        self.model, self.tokenizer = self.load_models(
            model_name=self.model_name,
            tokenizer_name=self.tokenizer_name,
            model_revision=self.model_revision,
            tokenizer_revision=self.tokenizer_revision,
            model_class=self.model_class,
            tokenizer_class=self.tokenizer_class,
            use_cuda=self.use_cuda,
            precision=self.precision,
            quantization=self.quantization,
            device_map=self.device_map,
            max_memory=self.max_memory,
            torchscript=self.torchscript,
            awq_enabled=self.awq_enabled,
            flash_attention=self.flash_attention,
            compile=self.compile,
            **self.model_args,
        )

        dataset_path = self.input.input_folder
        output_path = self.output.output_folder

        # Load dataset
        _dataset = self.load_dataset(dataset_path, max_length=max_length)
        if _dataset is None:
            self.log.error("Failed to load dataset.")
            return
        dataset = _dataset["text"]

        # The model does not move between batches, so check its device once.
        on_cuda = next(self.model.parameters()).is_cuda

        # Process data in batches
        for i in range(0, len(dataset), batch_size):
            batch = dataset[i : i + batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

            if on_cuda:
                inputs = {k: v.cuda() for k, v in inputs.items()}

            # Generate summaries
            summaries = self.model.generate(**inputs, **self.generation_args)
            if on_cuda:
                summaries = summaries.cpu()

            decoded_summaries = [self.tokenizer.decode(s, skip_special_tokens=True) for s in summaries]

            self._save_summaries(decoded_summaries, batch, output_path, i)
        self.done()

    def _save_summaries(self, summaries: List[str], input_batch: List[str], output_path: str, batch_idx: int) -> None:
        """
        Saves the generated summaries to a specified output path. This method is called internally by the summarize method
        to persist the summarization results.

        Each call writes one JSON file holding a list of ``{"input": ..., "summary": ...}`` objects; the
        file name contains the batch index and a random UUID so that files never collide.

        Args:
            summaries (List[str]): List of generated summaries.
            input_batch (List[str]): List of original texts that were summarized.
            output_path (str): Path to save the summaries.
            batch_idx (int): Index of the current batch (for naming files).
        """
        data_to_save = [
            {"input": input_text, "summary": summary} for input_text, summary in zip(input_batch, summaries)
        ]
        with open(os.path.join(output_path, f"summaries-{batch_idx}-{str(uuid.uuid4())}.json"), "w") as f:
            json.dump(data_to_save, f)
import json import os @@ -23,15 +22,15 @@ import numpy as np import pandas as pd import yaml # type: ignore -from datasets import DatasetDict, load_from_disk, load_metric, Dataset +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric from pyarrow import feather from pyarrow import parquet as pq from transformers import DataCollatorForSeq2Seq, EvalPrediction -from .base import HuggingFaceFineTuner +from geniusrise_text.base import TextFineTuner -class HuggingFaceSummarizationFineTuner(HuggingFaceFineTuner): +class SummarizationFineTuner(TextFineTuner): r""" A bolt for fine-tuning Hugging Face models on summarization tasks. @@ -40,63 +39,21 @@ class HuggingFaceSummarizationFineTuner(HuggingFaceFineTuner): output (OutputConfig): The output data. state (State): The state manager. - ## Using geniusrise to invoke via command line - ```bash - genius HuggingFaceSummarizationFineTuner rise \ - streaming \ - --input_kafka_topic summarization_data \ - --input_kafka_cluster_connection_string localhost:9094 \ - --input_kafka_consumer_group_id geniusrise \ - streaming \ - --output_kafka_topic summarization_results \ - --output_kafka_cluster_connection_string localhost:9094 \ - postgres \ - --postgres_host 127.0.0.1 \ - --postgres_port 5432 \ - --postgres_user postgres \ - --postgres_password postgres \ - --postgres_database geniusrise \ - --postgres_table state \ - load_dataset \ - --args dataset_path=my_dataset_path - ``` + CLI Usage: - ## Using geniusrise to invoke via YAML file - ```yaml - version: "1" - bolts: - my_summarization_bolt: - name: "HuggingFaceSummarizationFineTuner" - method: "load_dataset" - args: - dataset_path: "my_dataset_path" - input: - type: "streaming" - args: - input_topic: "summarization_data" - kafka_servers: "localhost:9094" - group_id: "geniusrise" - output: - type: "streaming" - args: - output_topic: "summarization_results" - kafka_servers: "localhost:9094" - state: - type: "postgres" - args: - postgres_host: 
"127.0.0.1" - postgres_port: 5432 - postgres_user: "postgres" - postgres_password: "postgres" - postgres_database: "geniusrise" - postgres_table: "state" - deploy: - type: "k8s" - args: - name: "my_summarization_bolt" - namespace: "default" - image: "my_summarization_bolt_image" - replicas: 1 + ```bash + genius SummarizationFineTuner rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + fine_tune \ + --args \ + model_name=my_model \ + tokenizer_name=my_tokenizer \ + num_train_epochs=3 \ + per_device_train_batch_size=8 ``` """ @@ -165,7 +122,9 @@ def load_dataset(self, dataset_path: str, **kwargs: Any) -> Optional[DatasetDict """ try: - if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): + if self.use_huggingface_dataset: + dataset = load_dataset(self.huggingface_dataset) + elif os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): dataset = load_from_disk(dataset_path) else: data = [] @@ -211,8 +170,15 @@ def load_dataset(self, dataset_path: str, **kwargs: Any) -> Optional[DatasetDict elif filename.endswith(".feather"): df = feather.read_feather(filepath) data.extend(df.to_dict("records")) + dataset = Dataset.from_pandas(pd.DataFrame(data)) + if hasattr(self, "map_data") and self.map_data: + fn = eval(self.map_data) # type: ignore + dataset = dataset.map(fn) + else: + dataset = dataset + tokenized_dataset = dataset.map( self.prepare_train_features, batched=True, diff --git a/geniusrise_text/summarization/tests/__init__.py b/geniusrise_text/summarization/tests/__init__.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ b/geniusrise_text/summarization/tests/__init__.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/summarization/tests/test_bulk.py b/geniusrise_text/summarization/tests/test_bulk.py new file mode 100644 index 0000000..9537007 --- /dev/null +++ b/geniusrise_text/summarization/tests/test_bulk.py @@ -0,0 +1,60 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +models = { + # fmt: off + "bart": "facebook/bart-large-cnn", + "distillbart": "sshleifer/distilbart-cnn-12-6", + "flan": "philschmid/flan-t5-base-samsum", + "pegasus": "google/pegasus-cnn_dailymail", + "led": "pszemraj/led-large-book-summary", + "food": "Dizex/FoodBaseBERT-NER", + "disease": "pruas/BENT-PubMedBERT-NER-Disease", + # fmt: on +} + + +# facebook/bart-large-cnn +# sshleifer/distilbart-cnn-12-6 +# philschmid/flan-t5-base-samsum +# google/pegasus-xsum +# pszemraj/led-large-book-summary +# knkarthick/MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM-AMI +# ainize/kobart-news +# human-centered-summarization/financial-summarization-pegasus +# Callidior/bert2bert-base-arxiv-titlegen +# pszemraj/long-t5-tglobal-base-16384-book-summary +# Quake24/easyTermsSummerizer +# facebook/bart-large-cnn + + +# google/bigbird-pegasus-large-arxiv +# google/bigbird-pegasus-large-bigpatent +# google/bigbird-pegasus-large-pubmed +# google/flan-t5-base +# google/flan-t5-large +# google/flan-t5-xl +# google/flan-t5-xxl +# google/t5-v1_1-base +# google/t5-v1_1-large +# google/t5-v1_1-xl +# google/t5-v1_1-xxl +# google/umt5-base +# google/umt5-xl +# google/umt5-xxl +# google/pegasus-x-base +# google/pegasus-large +# google/pegasus-x-large +# google/pegasus-multi_news diff --git a/geniusrise_text/summarization/tests/test_fine_tune.py b/geniusrise_text/summarization/tests/test_fine_tune.py new file mode 100644 index 0000000..3ff2f43 --- /dev/null +++ b/geniusrise_text/summarization/tests/test_fine_tune.py @@ -0,0 +1,287 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import sqlite3 +import tempfile +import xml.etree.ElementTree as ET + +import numpy as np +import pandas as pd +import pytest +import torch +import yaml # type: ignore +from datasets import Dataset +from geniusrise.core import BatchInput, BatchOutput, InMemoryState +from pyarrow import feather +from pyarrow import parquet as pq +from transformers import EvalPrediction + +from geniusrise_text.summarization.fine_tune import SummarizationFineTuner + +lora_config = { + "r": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "bias": "none", + "task_type": "CAUSAL_LM", +} + + +# Helper function to create synthetic data in different formats +def create_dataset_in_format(directory, ext): + os.makedirs(directory, exist_ok=True) + data = [{"document": f"document_{i}", "summary": f"summary_{i}"} for i in range(10)] + df = pd.DataFrame(data) + + if ext == "huggingface": + dataset = Dataset.from_pandas(df) + dataset.save_to_disk(directory) + elif ext == "csv": + df.to_csv(os.path.join(directory, "data.csv"), index=False) + elif ext == "jsonl": + with open(os.path.join(directory, "data.jsonl"), "w") as f: + for item in data: + f.write(json.dumps(item) + "\n") + elif ext == "parquet": + pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) + elif ext == "json": + with open(os.path.join(directory, "data.json"), "w") as f: + json.dump(data, f) + elif ext == "xml": + root = ET.Element("root") + for item in data: + record = ET.SubElement(root, "record") + ET.SubElement(record, "document").text = item["document"] + 
ET.SubElement(record, "summary").text = item["summary"] + tree = ET.ElementTree(root) + tree.write(os.path.join(directory, "data.xml")) + elif ext == "yaml": + with open(os.path.join(directory, "data.yaml"), "w") as f: + yaml.dump(data, f) + elif ext == "tsv": + df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") + elif ext == "xlsx": + df.to_excel(os.path.join(directory, "data.xlsx"), index=False) + elif ext == "db": + conn = sqlite3.connect(os.path.join(directory, "data.db")) + df.to_sql("dataset_table", conn, if_exists="replace", index=False) + conn.close() + elif ext == "feather": + feather.write_feather(df, os.path.join(directory, "data.feather")) + + +# Fixtures for each file type +@pytest.fixture( + params=[ + "huggingface", + "csv", + "json", + "jsonl", + "parquet", + "xml", + "yaml", + "tsv", + "xlsx", + "db", + "feather", + ] +) +def dataset_file(request, tmpdir): + ext = request.param + create_dataset_in_format(tmpdir + "/train", ext) + create_dataset_in_format(tmpdir + "/eval", ext) + return tmpdir, ext + + +MODELS_TO_TEST = { + # fmt: off + "small": "facebook/bart-base", + # fmt: on +} + + +# Fixture for models +@pytest.fixture(params=MODELS_TO_TEST.items()) +def model(request): + return request.param + + +@pytest.fixture +def summarization_bolt(): + input_dir = tempfile.mkdtemp() + output_dir = tempfile.mkdtemp() + input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input") + output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output") + state = InMemoryState() + klass = SummarizationFineTuner( + input=input, + output=output, + state=state, + ) + + return klass + + +def test_summarization_bolt_init(summarization_bolt, model): + name, model_name = model + tokenizer_name = model_name + model_class = "AutoModelForSeq2SeqLM" + tokenizer_class = "AutoTokenizer" + + summarization_bolt.load_models( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + 
device_map=None, + ) + + assert summarization_bolt.model is not None + assert summarization_bolt.tokenizer is not None + assert summarization_bolt.input is not None + assert summarization_bolt.output is not None + assert summarization_bolt.state is not None + + +def test_load_dataset_all_formats(summarization_bolt, dataset_file, model): + tmpdir, ext = dataset_file + dataset_path = os.path.join(tmpdir, "train") + + name, model_name = model + tokenizer_name = model_name + model_class = "AutoModelForSeq2SeqLM" + tokenizer_class = "AutoTokenizer" + + summarization_bolt.load_models( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map=None, + ) + + dataset = summarization_bolt.load_dataset(dataset_path) + assert dataset is not None + assert len(dataset) == 10 + + +# Models to test +models = { + # fmt: off + "bart": "facebook/bart-large-cnn", + "distillbart": "sshleifer/distilbart-cnn-12-6", + "flan": "google/flan-t5-xl", + "t5": "google/t5-v1_1-xl", + "mt5": "google/umt5-xl", + "pegasus": "google/pegasus-x-large", + "pegasus-news": "google/pegasus-multi_news", + "bigbird-arxiv": "google/bigbird-pegasus-large-arxiv", + "bigbird-bigpatent": "google/bigbird-pegasus-large-bigpatent", + "bigbird-pubmed": "google/bigbird-pegasus-large-pubmed", + # fmt: on +} + + +# Test for fine-tuning +@pytest.mark.parametrize( + "model_name, precision, quantization, lora_config, use_accelerate", + [ + # small + (models["bart"], "bfloat16", None, None, False), + (models["distillbart"], "bfloat16", None, None, False), + (models["flan"], "bfloat16", 8, lora_config, False), + (models["t5"], "bfloat16", 8, lora_config, False), + (models["mt5"], "bfloat16", 8, lora_config, False), + (models["pegasus"], "bfloat16", 8, lora_config, False), + (models["pegasus-news"], "bfloat16", None, None, False), + (models["bigbird-arxiv"], "bfloat16", None, None, False), + (models["bigbird-bigpatent"], "bfloat16", None, None, False), 
+ (models["bigbird-pubmed"], "bfloat16", None, None, False), + ], +) +def test_summarization_bolt_fine_tune( + summarization_bolt, dataset_file, model_name, precision, quantization, lora_config, use_accelerate +): + try: + tokenizer_name = model_name + + tmpdir, ext = dataset_file + summarization_bolt.input.input_folder = tmpdir + + summarization_bolt.fine_tune( + model_name=model_name, + tokenizer_name=model_name, + model_class="AutoModelForSeq2SeqLM", + tokenizer_class="AutoTokenizer", + num_train_epochs=1, + per_device_batch_size=2, + precision=precision, + quantization=quantization, + lora_config=lora_config, + use_accelerate=use_accelerate, + device_map=None, + ) + output_dir = summarization_bolt.output.output_folder + assert os.path.exists( + os.path.join(summarization_bolt.output.output_folder, "model", "pytorch_model.bin") + ) or os.path.exists(os.path.join(summarization_bolt.output.output_folder, "model", "adapter_model.bin")) + assert os.path.exists( + os.path.join(summarization_bolt.output.output_folder, "model", "config.json") + ) or os.path.exists(os.path.join(summarization_bolt.output.output_folder, "model", "adapter_config.json")) + assert os.path.exists(os.path.join(summarization_bolt.output.output_folder, "model", "training_args.bin")) + + del summarization_bolt.model + del summarization_bolt.tokenizer + torch.cuda.empty_cache() + + try: + os.remove(os.path.join(summarization_bolt.output.output_folder, "model", "pytorch_model.bin")) + os.remove(os.path.join(summarization_bolt.output.output_folder, "model", "adapter_model.bin")) + os.remove(os.path.join(summarization_bolt.output.output_folder, "model", "config.json")) + os.remove(os.path.join(summarization_bolt.output.output_folder, "model", "adapter_config.json")) + os.remove(os.path.join(summarization_bolt.output.output_folder, "model", "training_args.bin")) + except Exception as _: + pass + + except Exception as e: + del summarization_bolt.model + del summarization_bolt.tokenizer + 
torch.cuda.empty_cache() + raise + + +def test_summarization_bolt_compute_metrics(summarization_bolt, model): + name, model_name = model + tokenizer_name = model_name + model_class = "AutoModelForSeq2SeqLM" + tokenizer_class = "AutoTokenizer" + + summarization_bolt.load_models( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map=None, + ) + + logits = np.array([[0.6, 0.4], [0.4, 0.6]]) + labels = np.array([[0, 1], [1, 0]]) + eval_pred = EvalPrediction(predictions=logits, label_ids=labels) + metrics = summarization_bolt.compute_metrics(eval_pred) + assert "rouge1" in metrics + assert "rouge2" in metrics + assert "rougeL" in metrics diff --git a/geniusrise_text/translation/__init__.py b/geniusrise_text/translation/__init__.py new file mode 100644 index 0000000..16dcce4 --- /dev/null +++ b/geniusrise_text/translation/__init__.py @@ -0,0 +1,18 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .api import TranslationAPI +from .bulk import TranslationBulk +from .fine_tune import TranslationFineTuner diff --git a/geniusrise_text/translation/api.py b/geniusrise_text/translation/api.py new file mode 100644 index 0000000..c9febd7 --- /dev/null +++ b/geniusrise_text/translation/api.py @@ -0,0 +1,237 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Any, Dict + +import cherrypy +from geniusrise import BatchInput, BatchOutput, State +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline + +from geniusrise_text.base import TextAPI + +log = logging.getLogger(__name__) + + +class TranslationAPI(TextAPI): + r""" + A class for serving a Hugging Face-based translation model as a web API. + This API allows users to submit text for translation and receive translated text + in the specified target language using advanced machine learning models. + + Args: + input (BatchInput): Configurations and data inputs for the batch process. + output (BatchOutput): Configurations for output data handling. + state (State): State management for the translation task. + **kwargs: Additional keyword arguments for extended configurations. 
+ + Example CLI Usage for interacting with the API: + + To start the API server: + ```bash + genius TranslationAPI rise \ + batch \ + --input_folder ./input \ + batch \ + --output_folder ./output \ + none \ + --id facebook/mbart-large-50-many-to-many-mmt-lol \ + listen \ + --args \ + model_name="facebook/mbart-large-50-many-to-many-mmt" \ + model_class="AutoModelForSeq2SeqLM" \ + tokenizer_class="AutoTokenizer" \ + use_cuda=True \ + precision="float" \ + quantization=0 \ + device_map="cuda:0" \ + max_memory=None \ + torchscript=False \ + endpoint="*" \ + port=3000 \ + cors_domain="http://localhost:3000" \ + username="user" \ + password="password" + ``` + + To translate text using the API: + ```bash + curl -X POST localhost:8080/translate \ + -H "Content-Type: application/json" \ + -d '{ + "text": "Hello, world!", + "source_lang": "en", + "target_lang": "fr", + "decoding_strategy": "beam_search", + "num_beams": 5 + }' + ``` + """ + + def __init__( + self, + input: BatchInput, + output: BatchOutput, + state: State, + **kwargs: Any, + ) -> None: + super().__init__(input=input, output=output, state=state) + log.info("Loading Hugging Face translation API server") + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def translate(self, **kwargs: Any) -> Dict[str, Any]: + r""" + Translates text to a specified target language using the underlying Hugging Face model. + + This endpoint accepts JSON data with the text and language details, + processes it through the machine learning model, and returns the translated text. + + Args: + **kwargs: Arbitrary keyword arguments, usually empty as parameters are in the POST body. + + POST body parameters: + text (str): The text to be translated. + decoding_strategy (str): Strategy to use for decoding text; e.g., 'beam_search', 'greedy'. Default is 'generate'. + source_lang (str): Source language code. + target_lang (str): Target language code. Default is 'en'. 
+ additional_params (dict): Other model-specific parameters for translation. + + Returns: + Dict[str, Any]: A dictionary with the original text, target language, and translated text. + + Example CURL requests: + + To translate text from English to French: + ```bash + curl -X POST localhost:8080/translate \ + -H "Content-Type: application/json" \ + -d '{ + "text": "Hello, world!", + "source_lang": "en", + "target_lang": "fr", + "decoding_strategy": "beam_search", + "num_beams": 5 + }' + ``` + + To translate text from Hindi to English: + ```bash + /usr/bin/curl -X POST localhost:3000/api/v1/translate \ + -H "Content-Type: application/json" \ + -d '{ + "text": "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है", + "source_lang": "hi_IN", + "target_lang": "en_XX", + "decoding_strategy": "generate", + "decoder_start_token_id": 2, + "early_stopping": true, + "eos_token_id": 2, + "forced_eos_token_id": 2, + "max_length": 200, + "num_beams": 5, + "pad_token_id": 1 + }' | jq + ``` + """ + + data = cherrypy.request.json + text = data.get("text") + decoding_strategy = data.get("decoding_strategy", "generate") + src_lang = data.get("source_lang") + target_lang = data.get("target_lang", "en") + + generation_params = data + if "decoding_strategy" in generation_params: + del generation_params["decoding_strategy"] + if "source_lang" in generation_params: + del generation_params["source_lang"] + if "target_lang" in generation_params: + del generation_params["target_lang"] + if "text" in generation_params: + del generation_params["text"] + + # Tokenize the text + if src_lang: + self.tokenizer.src_lang = src_lang + if target_lang != "en": + generation_params = { + **generation_params, + **{"forced_bos_token_id": self.tokenizer.lang_code_to_id[target_lang]}, + } + + translated_text = self.generate(prompt=text, decoding_strategy=decoding_strategy, **generation_params) + + return { + "text": text, + "target_language": target_lang, + "translated_text": 
translated_text, + } + + def initialize_pipeline(self): + """ + Lazy initialization of the translation Hugging Face pipeline. + """ + if not self.hf_pipeline: + model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name) + tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.hf_pipeline = pipeline("translation", model=model, tokenizer=tokenizer) + + @cherrypy.expose + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.allow(methods=["POST"]) + def translate_pipeline(self, **kwargs: Any) -> Dict[str, Any]: + """ + Endpoint for translating text using a pre-initialized Hugging Face translation pipeline. + This method is designed to handle translation requests more efficiently by utilizing + a preloaded model and tokenizer, reducing the overhead of loading these components for each request. + + Args: + None - Expects input through the POST request's JSON body. + + Returns: + Dict[str, Any]: A dictionary containing the original text, source language, target language, + and the translated text. 
+ + Example CURL Request for translation: + ```bash + curl -X POST localhost:8080/translate_pipeline \ + -H "Content-Type: application/json" \ + -d '{ + "text": "Hello, world!", + "source_lang": "en", + "target_lang": "fr" + }' + ``` + """ + self.initialize_pipeline() # Initialize the pipeline on first API hit + + data = cherrypy.request.json + text = data.get("text") + src_lang = data.get("source_lang") + target_lang = data.get("target_lang", "en") + generation_params = {k: v for k, v in data.items() if k not in ["text", "source_lang", "target_lang"]} + + # Set the source and target language for the tokenizer + self.tokenizer.src_lang = src_lang + if target_lang != "en": + generation_params["forced_bos_token_id"] = self.tokenizer.lang_code_to_id[target_lang] + + result = self.hf_pipeline(text, **generation_params) # type: ignore + + return {"text": text, "source_language": src_lang, "target_language": target_lang, "translated_text": result} diff --git a/geniusrise_text/translation/api.yml b/geniusrise_text/translation/api.yml new file mode 100644 index 0000000..a2e655c --- /dev/null +++ b/geniusrise_text/translation/api.yml @@ -0,0 +1,122 @@ +openapi: 3.0.0 +info: + title: Translation API + description: Provides APIs for translating text using Hugging Face models. + version: "1.0" +servers: + - url: http://localhost:3000/api/v1 + description: Translation API server +paths: + /translate: + post: + summary: Translates text to the specified target language + operationId: translateText + tags: + - Translation + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: The text to be translated. + source_lang: + type: string + description: Source language code. + target_lang: + type: string + description: Target language code. + decoding_strategy: + type: string + description: Strategy to use for decoding. Defaults to 'generate'. 
+ default: "generate" + num_beams: + type: integer + description: Number of beams for beam search. Applicable if decoding_strategy is 'beam_search'. + additional_params: + type: object + additionalProperties: true + description: Other model-specific parameters for translation. + required: + - text + - source_lang + - target_lang + responses: + 200: + description: Text successfully translated + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: The original text. + target_language: + type: string + description: The target language code. + translated_text: + type: string + description: The translated text. + 400: + description: Invalid input provided + 500: + description: Error in processing the request + + /translate_pipeline: + post: + summary: Translates text using a pre-initialized translation pipeline + operationId: translateTextPipeline + tags: + - Translation Pipeline + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: The text to be translated. + source_lang: + type: string + description: Source language code. + target_lang: + type: string + description: Target language code. + additional_params: + type: object + additionalProperties: true + description: Other parameters for translation, specified as key-value pairs. + required: + - text + - source_lang + - target_lang + responses: + 200: + description: Text successfully translated using the pipeline + content: + application/json: + schema: + type: object + properties: + text: + type: string + description: The original text. + source_language: + type: string + description: The source language code. + target_language: + type: string + description: The target language code. + translated_text: + type: string + description: The translated text. 
+ 400: + description: Invalid input provided + 500: + description: Error in processing the request diff --git a/geniusrise_text/translation/bulk.py b/geniusrise_text/translation/bulk.py new file mode 100644 index 0000000..b879472 --- /dev/null +++ b/geniusrise_text/translation/bulk.py @@ -0,0 +1,381 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import json +import os +import sqlite3 +import uuid +import xml.etree.ElementTree as ET +from typing import Any, Dict, List, Optional + +import pandas as pd +import yaml # type: ignore +from datasets import Dataset, load_from_disk +from geniusrise import BatchInput, BatchOutput, State +from pyarrow import feather +from pyarrow import parquet as pq + +from geniusrise_text.base import TextBulk + + +class TranslationBulk(TextBulk): + r""" + TranslationBulk is a class for managing bulk translations using Hugging Face models. It is designed to + handle large-scale translation tasks efficiently and effectively, using state-of-the-art machine learning models + to provide high-quality translations for various language pairs. + + This class provides methods for loading datasets, configuring translation models, and executing bulk translation tasks. + + Args: + input (BatchInput): Configuration and data inputs for batch processing. + output (BatchOutput): Configuration for output data handling. + state (State): State management for translation tasks. 
+ **kwargs: Arbitrary keyword arguments for extended functionality. + + Example CLI Usage for Bulk Translation Task: + + ```bash + genius TranslationBulk rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/trans \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/trans \ + postgres \ + --postgres_host 127.0.0.1 \ + --postgres_port 5432 \ + --postgres_user postgres \ + --postgres_password postgres \ + --postgres_database geniusrise\ + --postgres_table state \ + --id facebook/mbart-large-50-many-to-many-mmt-lol \ + translate \ + --args \ + model_name="facebook/mbart-large-50-many-to-many-mmt" \ + model_class="AutoModelForSeq2SeqLM" \ + tokenizer_class="AutoTokenizer" \ + origin="hi_IN" \ + target="en_XX" \ + use_cuda=True \ + precision="float" \ + quantization=0 \ + device_map="cuda:0" \ + max_memory=None \ + torchscript=False \ + generate_decoder_start_token_id=2 \ + generate_early_stopping=true \ + generate_eos_token_id=2 \ + generate_forced_eos_token_id=2 \ + generate_max_length=200 \ + generate_num_beams=5 \ + generate_pad_token_id=1 + ``` + """ + + def __init__(self, input: BatchInput, output: BatchOutput, state: State, **kwargs) -> None: + super().__init__(input, output, state, **kwargs) + + def load_dataset( + self, + dataset_path: str, + max_length: int = 512, + origin: str = "en", + target: str = "hi", + **kwargs, + ) -> Optional[Dataset]: + r""" + Load a dataset from a directory. + + ## Supported Data Formats and Structures for Translation Tasks: + + Note: All examples are assuming the source as "en", refer to the specific model for this parameter. + + ### JSONL + Each line is a JSON object representing an example. + ```json + { + "translation": { + "en": "English text" + } + } + ``` + + ### CSV + Should contain 'en' column. + ```csv + en + "English text" + ``` + + ### Parquet + Should contain 'en' column. + + ### JSON + An array of dictionaries with 'en' key. 
+ ```json + [ + { + "en": "English text" + } + ] + ``` + + ### XML + Each 'record' element should contain 'en' child elements. + ```xml + <record> + <en>English text</en> + </record> + ``` + + ### YAML + Each document should be a dictionary with 'en' key. + ```yaml + - en: "English text" + ``` + + ### TSV + Should contain 'en' column separated by tabs. + + ### Excel (.xls, .xlsx) + Should contain 'en' column. + + ### SQLite (.db) + Should contain a table with 'en' column. + + ### Feather + Should contain 'en' column. + + Args: + dataset_path (str): The path to the directory containing the dataset files. + max_length (int, optional): The maximum length for tokenization. Defaults to 512. + origin (str, optional): The origin language. Defaults to 'en'. + target (str, optional): The target language. Defaults to 'hi'. + **kwargs: Additional keyword arguments. + + Returns: + DatasetDict: The loaded dataset. + + """ + + self.max_length = max_length + self.origin = origin + self.target = target + if self.origin: + self.tokenizer.src_lang = self.origin + + try: + if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): + dataset = load_from_disk(dataset_path) + else: + data = [] + for filename in glob.glob(f"{dataset_path}/**/*", recursive=True): + filepath = os.path.join(dataset_path, filename) + if filename.endswith(".jsonl"): + with open(filepath, "r") as f: + for line in f: + example = json.loads(line) + data.append(example) + elif filename.endswith(".csv"): + df = pd.read_csv(filepath) + data.extend(df.to_dict("records")) + elif filename.endswith(".parquet"): + df = pq.read_table(filepath).to_pandas() + data.extend(df.to_dict("records")) + elif filename.endswith(".json"): + with open(filepath, "r") as f: + json_data = json.load(f) + data.extend(json_data) + elif filename.endswith(".xml"): + tree = ET.parse(filepath) + root = tree.getroot() + for record in root.findall("record"): + origin = record.find(self.origin).text # type: ignore + data.append({"translation": {self.origin: origin}}) 
+ elif filename.endswith(".yaml") or filename.endswith(".yml"): + with open(filepath, "r") as f: + yaml_data = yaml.safe_load(f) + data.extend(yaml_data) + elif filename.endswith(".tsv"): + df = pd.read_csv(filepath, sep="\t") + data.extend(df.to_dict("records")) + elif filename.endswith((".xls", ".xlsx")): + df = pd.read_excel(filepath) + data.extend(df.to_dict("records")) + elif filename.endswith(".db"): + conn = sqlite3.connect(filepath) + query = f"SELECT {self.origin} FROM dataset_table;" + df = pd.read_sql_query(query, conn) + data.extend(df.to_dict("records")) + elif filename.endswith(".feather"): + df = feather.read_feather(filepath) + data.extend(df.to_dict("records")) + + if hasattr(self, "map_data") and self.map_data: + fn = eval(self.map_data) # type: ignore + data = [fn(d) for d in data] + else: + data = data + + dataset = Dataset.from_pandas(pd.DataFrame(data)) + + return dataset + except Exception as e: + self.log.exception(f"Error occurred when loading dataset from {dataset_path}. Error: {e}") + raise + + def translate( + self, + model_name: str, + origin: str, + target: str, + max_length: int = 512, + model_class: str = "AutoModelForSeq2SeqLM", + tokenizer_class: str = "AutoTokenizer", + use_cuda: bool = False, + precision: str = "float16", + quantization: int = 0, + device_map: str | Dict | None = "auto", + max_memory={0: "24GB"}, + torchscript: bool = False, + compile: bool = False, + awq_enabled: bool = False, + flash_attention: bool = False, + batch_size: int = 32, + notification_email: Optional[str] = None, + **kwargs: Any, + ) -> None: + """ + Perform bulk translation using the specified model and tokenizer. This method handles the entire translation + process including loading the model, processing input data, generating translations, and saving the results. + + Args: + model_name (str): Name or path of the translation model. + origin (str): Source language ISO code. + target (str): Target language ISO code. 
+ max_length (int): Maximum length of the tokens (default 512). + model_class (str): Class name of the model (default "AutoModelForSeq2SeqLM"). + tokenizer_class (str): Class name of the tokenizer (default "AutoTokenizer"). + use_cuda (bool): Whether to use CUDA for model inference (default False). + precision (str): Precision for model computation (default "float16"). + quantization (int): Level of quantization for optimizing model size and speed (default 0). + device_map (str | Dict | None): Specific device to use for computation (default "auto"). + max_memory (Dict): Maximum memory configuration for devices. + torchscript (bool, optional): Whether to use a TorchScript-optimized version of the pre-trained language model. Defaults to False. + compile (bool, optional): Whether to compile the model before fine-tuning. Defaults to True. + awq_enabled (bool): Whether to enable AWQ optimization (default False). + flash_attention (bool): Whether to use flash attention optimization (default False). + batch_size (int): Number of translations to process simultaneously (default 32). + **kwargs: Arbitrary keyword arguments for model and generation configurations. 
+ """ + + if ":" in model_name: + model_revision = model_name.split(":")[1] + tokenizer_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + tokenizer_name = model_name + else: + model_revision = None + tokenizer_revision = None + tokenizer_name = model_name + + self.model_name = model_name + self.tokenizer_name = tokenizer_name + self.model_revision = model_revision + self.tokenizer_revision = tokenizer_revision + self.model_class = model_class + self.tokenizer_class = tokenizer_class + self.use_cuda = use_cuda + self.precision = precision + self.quantization = quantization + self.device_map = device_map + self.max_memory = max_memory + self.torchscript = torchscript + self.awq_enabled = awq_enabled + self.flash_attention = flash_attention + self.batch_size = batch_size + self.notification_email = notification_email + self.compile = compile + + model_args = {k.replace("model_", ""): v for k, v in kwargs.items() if "model_" in k} + self.model_args = model_args + + generation_args = {k.replace("generation_", ""): v for k, v in kwargs.items() if "generation_" in k} + self.generation_args = generation_args + + self.model, self.tokenizer = self.load_models( + model_name=self.model_name, + tokenizer_name=self.tokenizer_name, + model_revision=self.model_revision, + tokenizer_revision=self.tokenizer_revision, + model_class=self.model_class, + tokenizer_class=self.tokenizer_class, + use_cuda=self.use_cuda, + precision=self.precision, + quantization=self.quantization, + device_map=self.device_map, + max_memory=self.max_memory, + torchscript=self.torchscript, + awq_enabled=self.awq_enabled, + flash_attention=self.flash_attention, + compile=self.compile, + **self.model_args, + ) + + dataset_path = self.input.input_folder + output_path = self.output.output_folder + + # Load dataset + _dataset = self.load_dataset(dataset_path, origin=origin, target=target, max_length=max_length) + if _dataset is None: + self.log.error("Failed to load dataset.") + return 
+ dataset = _dataset[self.origin] + + # Process data in batches + for i in range(0, len(dataset), batch_size): + batch = dataset[i : i + batch_size] + inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True) + + if next(self.model.parameters()).is_cuda: + inputs = {k: v.cuda() for k, v in inputs.items()} + + outputs = self.model.generate(**inputs, **self.generation_args) + translations = [self.tokenizer.decode(t, skip_special_tokens=True) for t in outputs] + + self._save_translations(translations, batch, output_path, i) + self.done() + + def _save_translations( + self, translations: List[str], input_batch: List[str], output_path: str, batch_idx: int + ) -> None: + r""" + Saves the translated texts to a specified output path. This method is called internally by the translate method + to persist the translation results. + + Args: + translations (List[str]): List of translated texts. + input_batch (List[str]): List of original texts that were translated. + output_path (str): Path to save the translated texts. + batch_idx (int): Index of the current batch (for naming files). 
+ """ + + data_to_save = [ + {"input": input_text, "translation": translation} + for input_text, translation in zip(input_batch, translations) + ] + with open(os.path.join(output_path, f"translations-{batch_idx}-{str(uuid.uuid4())}.json"), "w") as f: + json.dump(data_to_save, f) diff --git a/huggingface/translation.py b/geniusrise_text/translation/fine_tune.py similarity index 72% rename from huggingface/translation.py rename to geniusrise_text/translation/fine_tune.py index 11d5885..477c557 100644 --- a/huggingface/translation.py +++ b/geniusrise_text/translation/fine_tune.py @@ -1,18 +1,17 @@ # 🧠 Geniusrise # Copyright (C) 2023 geniusrise.ai # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. +# http://www.apache.org/licenses/LICENSE-2.0 # -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import ast import json @@ -23,15 +22,15 @@ import pandas as pd import yaml # type: ignore -from datasets import Dataset, DatasetDict, load_from_disk +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk from pyarrow import feather from pyarrow import parquet as pq from transformers import DataCollatorForSeq2Seq -from .base import HuggingFaceFineTuner +from geniusrise_text.base import TextFineTuner -class HuggingFaceTranslationFineTuner(HuggingFaceFineTuner): +class TranslationFineTuner(TextFineTuner): r""" A bolt for fine-tuning Hugging Face models on translation tasks. @@ -40,58 +39,34 @@ class HuggingFaceTranslationFineTuner(HuggingFaceFineTuner): input (BatchInput): The batch input data. output (OutputConfig): The output data. state (State): The state manager. + **kwargs: Arbitrary keyword arguments for extended functionality. ``` - ## Using geniusrise to invoke via command line - ```bash - genius HuggingFaceTranslationFineTuner rise \ - streaming \ - --output_kafka_topic translation_test \ - --output_kafka_cluster_connection_string localhost:9094 \ - postgres \ - --postgres_host 127.0.0.1 \ - --postgres_port 5432 \ - --postgres_user postgres \ - --postgres_password postgres \ - --postgres_database geniusrise \ - --postgres_table state \ - load_dataset \ - --args dataset_path=my_dataset max_length=512 origin=en target=fr - ``` + CLI Usage: - ## Using geniusrise to invoke via YAML file - ```yaml - version: "1" - bolts: - my_translation_bolt: - name: "HuggingFaceTranslationFineTuner" - method: "load_dataset" - args: - dataset_path: "my_dataset" - max_length: 512 - origin: "en" - target: "fr" - output: - type: "streaming" - args: - output_topic: "translation_test" - kafka_servers: "localhost:9094" - state: - type: "postgres" - args: - postgres_host: "127.0.0.1" - postgres_port: 5432 - postgres_user: "postgres" - postgres_password: "postgres" - postgres_database: "geniusrise" - postgres_table: "state" - deploy: - type: "k8s" - args: - name: 
"my_translation_bolt" - namespace: "default" - image: "my_translation_bolt_image" - replicas: 1 + ```bash + genius TranslationFineTuner rise \ + batch \ + --input_s3_bucket geniusrise-test \ + --input_s3_folder input/trans \ + batch \ + --output_s3_bucket geniusrise-test \ + --output_s3_folder output/trans \ + postgres \ + --postgres_host 127.0.0.1 \ + --postgres_port 5432 \ + --postgres_user postgres \ + --postgres_password postgres \ + --postgres_database geniusrise\ + --postgres_table state \ + --id facebook/mbart-large-50-many-to-many-mmt-lol \ + fine_tune \ + --args \ + model_name=my_model \ + tokenizer_name=my_tokenizer \ + num_train_epochs=3 \ + per_device_train_batch_size=8 \ + data_max_length=512 ``` """ @@ -181,9 +156,12 @@ def load_dataset( self.max_length = max_length self.origin = origin self.target = target + self.tokenizer.src_lang = self.origin try: - if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): + if self.use_huggingface_dataset: + dataset = load_dataset(self.huggingface_dataset) + elif os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): dataset = load_from_disk(dataset_path) else: data = [] @@ -208,9 +186,9 @@ def load_dataset( tree = ET.parse(filepath) root = tree.getroot() for record in root.findall("record"): - en = record.find(self.origin).text # type: ignore - fr = record.find(self.target).text # type: ignore - data.append({"translation": {self.origin: en, self.target: fr}}) + origin = record.find(self.origin).text # type: ignore + target = record.find(self.target).text # type: ignore + data.append({"translation": {self.origin: origin, self.target: target}}) elif filename.endswith(".yaml") or filename.endswith(".yml"): with open(filepath, "r") as f: yaml_data = yaml.safe_load(f) @@ -229,8 +207,15 @@ def load_dataset( elif filename.endswith(".feather"): df = feather.read_feather(filepath) data.extend(df.to_dict("records")) + dataset = Dataset.from_pandas(pd.DataFrame(data)) + if hasattr(self, "map_data") and 
self.map_data: + fn = eval(self.map_data) # type: ignore + dataset = dataset.map(fn) + else: + dataset = dataset + tokenized_dataset = dataset.map( self.prepare_train_features, batched=True, diff --git a/geniusrise_text/translation/tests/__init__.py b/geniusrise_text/translation/tests/__init__.py new file mode 100644 index 0000000..bffa8ca --- /dev/null +++ b/geniusrise_text/translation/tests/__init__.py @@ -0,0 +1,14 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/geniusrise_text/translation/tests/test_fine_tune.py b/geniusrise_text/translation/tests/test_fine_tune.py new file mode 100644 index 0000000..48d4aa9 --- /dev/null +++ b/geniusrise_text/translation/tests/test_fine_tune.py @@ -0,0 +1,275 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import sqlite3 +import tempfile +import xml.etree.ElementTree as ET + +import numpy as np +import pandas as pd +import pytest +import torch +import yaml # type: ignore +from datasets import Dataset +from geniusrise.core import BatchInput, BatchOutput, InMemoryState +from pyarrow import feather +from pyarrow import parquet as pq +from transformers import EvalPrediction + +from geniusrise_text.translation.fine_tune import TranslationFineTuner + +lora_config = { + "r": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "bias": "none", + "task_type": "CAUSAL_LM", +} + + +# Helper function to create synthetic data in different formats +def create_dataset_in_format(directory, ext): + os.makedirs(directory, exist_ok=True) + data = [ + { + "translation": { + "en": f"This is a synthetic text example {i}", + "fr": f"C'est un exemple de texte synthétique {i}", + } + } + for i in range(10) + ] + df = pd.DataFrame(data) + + if ext == "huggingface": + dataset = Dataset.from_pandas(df) + dataset.save_to_disk(directory) + elif ext == "csv": + df.to_csv(os.path.join(directory, "data.csv"), index=False) + elif ext == "jsonl": + with open(os.path.join(directory, "data.jsonl"), "w") as f: + for item in data: + f.write(json.dumps(item) + "\n") + elif ext == "parquet": + pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) + elif ext == "json": + with open(os.path.join(directory, "data.json"), "w") as f: + json.dump(data, f) + elif ext == "xml": + root = ET.Element("root") + for item in data: + record = ET.SubElement(root, "record") + ET.SubElement(record, "en").text = str(item["translation"]["en"]) + ET.SubElement(record, "fr").text = str(item["translation"]["fr"]) + tree = ET.ElementTree(root) + tree.write(os.path.join(directory, "data.xml")) + elif ext == "yaml": + with open(os.path.join(directory, "data.yaml"), "w") as f: + yaml.dump(data, f) + elif ext == "tsv": + df.to_csv(os.path.join(directory, "data.tsv"), index=False, 
sep="\t") + elif ext == "xlsx": + df.to_excel(os.path.join(directory, "data.xlsx"), index=False) + elif ext == "db": + conn = sqlite3.connect(os.path.join(directory, "data.db")) + ens = [x["translation"]["en"] for x in data] + frs = [x["translation"]["fr"] for x in data] + pd.DataFrame({"en": ens, "fr": frs}).to_sql("dataset_table", conn, if_exists="replace", index=False) + conn.close() + elif ext == "feather": + feather.write_feather(df, os.path.join(directory, "data.feather")) + else: + raise ValueError(f"Unsupported file extension: {ext}") + + +# Fixtures for each file type +@pytest.fixture( + params=[ + "huggingface", + "csv", + "json", + "jsonl", + "parquet", + "xml", + "yaml", + "tsv", + "xlsx", + "db", + "feather", + ] +) +def dataset_file(request, tmpdir): + ext = request.param + create_dataset_in_format(tmpdir + "/train", ext) + create_dataset_in_format(tmpdir + "/eval", ext) + return tmpdir, ext + + +MODELS_TO_TEST = { + # fmt: off + "many": "Helsinki-NLP/opus-mt-en-hi", + # fmt: on +} + + +# Fixture for models +@pytest.fixture(params=MODELS_TO_TEST.items()) +def model(request): + return request.param + + +@pytest.fixture +def translation_bolt(): + input_dir = tempfile.mkdtemp() + output_dir = tempfile.mkdtemp() + + input = BatchInput(input_dir, "geniusrise-test", "test-🤗-input") + output = BatchOutput(output_dir, "geniusrise-test", "test-🤗-output") + state = InMemoryState() + + klass = TranslationFineTuner( + input=input, + output=output, + state=state, + evaluate=True, + ) + return klass + + +def test_translation_bolt_init(translation_bolt, model): + name, model_name = model + tokenizer_name = model_name + model_class = "AutoModelForSeq2SeqLM" + tokenizer_class = "AutoTokenizer" + + translation_bolt.load_models( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map=None, + precision="float16", + ) + + assert translation_bolt.model is not None + assert 
translation_bolt.tokenizer is not None + assert translation_bolt.input is not None + assert translation_bolt.output is not None + assert translation_bolt.state is not None + + +def test_load_dataset_all_formats(translation_bolt, dataset_file, model): + tmpdir, ext = dataset_file + dataset_path = os.path.join(tmpdir, "train") + + name, model_name = model + tokenizer_name = model_name + model_class = "AutoModelForSeq2SeqLM" + tokenizer_class = "AutoTokenizer" + + translation_bolt.load_models( + model_name=model_name, + tokenizer_name=tokenizer_name, + model_class=model_class, + tokenizer_class=tokenizer_class, + device_map=None, + ) + + dataset = translation_bolt.load_dataset(dataset_path) + assert dataset is not None + assert len(dataset) == 10 + + +# Models to test +models = { + # fmt: off + "hi": "Helsinki-NLP/opus-mt-en-hi", + "en": "Helsinki-NLP/opus-mt-en-hi", + # fmt: on +} + + +# Test for fine-tuning +@pytest.mark.parametrize( + "model_name, precision, quantization, lora_config, use_accelerate", + [ + # small + (models["hi"], "float16", None, None, False), + (models["en"], "float16", None, None, False), + ], +) +def test_translation_bolt_fine_tune( + translation_bolt, dataset_file, model_name, precision, quantization, lora_config, use_accelerate +): + try: + tokenizer_name = model_name + + tmpdir, ext = dataset_file + translation_bolt.input.input_folder = tmpdir + + translation_bolt.fine_tune( + model_name=model_name, + tokenizer_name=model_name, + model_class="AutoModelForSeq2SeqLM", + tokenizer_class="AutoTokenizer", + num_train_epochs=1, + per_device_batch_size=2, + precision=precision, + quantization=quantization, + lora_config=lora_config, + use_accelerate=use_accelerate, + device_map=None, + ) + output_dir = translation_bolt.output.output_folder + assert os.path.exists( + os.path.join(translation_bolt.output.output_folder, "model", "pytorch_model.bin") + ) or os.path.exists(os.path.join(translation_bolt.output.output_folder, "model", 
"adapter_model.bin")) + assert os.path.exists( + os.path.join(translation_bolt.output.output_folder, "model", "config.json") + ) or os.path.exists(os.path.join(translation_bolt.output.output_folder, "model", "adapter_config.json")) + assert os.path.exists(os.path.join(translation_bolt.output.output_folder, "model", "training_args.bin")) + + del translation_bolt.model + del translation_bolt.tokenizer + torch.cuda.empty_cache() + + try: + os.remove(os.path.join(translation_bolt.output.output_folder, "model", "pytorch_model.bin")) + os.remove(os.path.join(translation_bolt.output.output_folder, "model", "adapter_model.bin")) + os.remove(os.path.join(translation_bolt.output.output_folder, "model", "config.json")) + os.remove(os.path.join(translation_bolt.output.output_folder, "model", "adapter_config.json")) + os.remove(os.path.join(translation_bolt.output.output_folder, "model", "training_args.bin")) + except Exception as _: + pass + + except Exception as e: + del translation_bolt.model + del translation_bolt.tokenizer + torch.cuda.empty_cache() + raise + + +# Test for computing metrics +def test_translation_bolt_compute_metrics(translation_bolt): + logits = np.array([[0.6, 0.4], [0.4, 0.6]]) + labels = np.array([0, 1]) + eval_pred = EvalPrediction(predictions=logits, label_ids=labels) + metrics = translation_bolt.compute_metrics(eval_pred) + assert "accuracy" in metrics + assert "precision" in metrics + assert "recall" in metrics + assert "f1" in metrics diff --git a/geniusrise_text/translation/tests/tests/test_bulk.py b/geniusrise_text/translation/tests/tests/test_bulk.py new file mode 100644 index 0000000..0873c0c --- /dev/null +++ b/geniusrise_text/translation/tests/tests/test_bulk.py @@ -0,0 +1,24 @@ +# 🧠 Geniusrise +# Copyright (C) 2023 geniusrise.ai +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# # Models to test +# models = { +# # fmt: off +# "bart": "dslim/bert-large-NER", +# "50": "facebook/mbart-large-50-many-to-many-mmt", +# "to-en": "facebook/wmt21-dense-24-wide-x-en", +# "from-en": "facebook/wmt21-dense-24-wide-en-x", +# # fmt: on +# } diff --git a/huggingface/__init__.py b/huggingface/__init__.py deleted file mode 100644 index a1ec3a9..0000000 --- a/huggingface/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .base import HuggingFaceFineTuner -from .classification import HuggingFaceClassificationFineTuner -from .commonsense_reasoning import HuggingFaceCommonsenseReasoningFineTuner -from .instruction_tuning import HuggingFaceInstructionTuningFineTuner -from .language_model import HuggingFaceLanguageModelingFineTuner -from .ner import HuggingFaceNamedEntityRecognitionFineTuner -from .question_answering import HuggingFaceQuestionAnsweringFineTuner -from .sentiment_analysis import HuggingFaceSentimentAnalysisFineTuner -from .summarization import HuggingFaceSummarizationFineTuner -from .translation import HuggingFaceTranslationFineTuner diff --git a/huggingface/base.py b/huggingface/base.py deleted file mode 100644 index c444389..0000000 --- a/huggingface/base.py +++ /dev/null @@ -1,277 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -import logging -import os -from abc import abstractmethod -from typing import Dict, Optional - -import numpy as np -from datasets import Dataset, DatasetDict -from geniusrise import BatchInput, BatchOutput, Bolt, State -from sklearn.metrics import accuracy_score, precision_recall_fscore_support -from transformers import EvalPrediction, Trainer, TrainingArguments - - -class HuggingFaceFineTuner(Bolt): - """ - A bolt for fine-tuning Hugging Face models. - - This bolt uses the Hugging Face Transformers library to fine-tune a pre-trained model. - It uses the `Trainer` class from the Transformers library to handle the training. - """ - - def __init__( - self, - input: BatchInput, - output: BatchOutput, - state: State, - **kwargs, - ) -> None: - """ - Initialize the bolt. - - Args: - input (BatchInput): The batch input data. - output (OutputConfig): The output data. - state (State): The state manager. - **kwargs: Additional keyword arguments. 
- """ - super().__init__( - input=input, - output=output, - state=state, - ) - super().__init__(input=input, output=output, state=state) - self.input = input - self.output = output - self.state = state - - self.model_name: Optional[str] = None - self.tokenizer_name: Optional[str] = None - self.model_class: Optional[str] = None - self.tokenizer_class: Optional[str] = None - self.eval: bool = False - - self.tokenizer = None - self.model = None - self.train_dataset = None - self.eval_dataset = None - - self.log = logging.getLogger(self.__class__.__name__) - - @abstractmethod - def load_dataset(self, dataset_path: str, **kwargs) -> Dataset | DatasetDict | Optional[Dataset]: - """ - Load a dataset from a file. - - Args: - dataset_path (str): The path to the dataset file. - **kwargs: Additional keyword arguments to pass to the `load_dataset` method. - - Returns: - Dataset: The loaded dataset. - - Raises: - NotImplementedError: This method should be overridden by subclasses. - """ - raise NotImplementedError("Subclasses should implement this!") - - def preprocess_data(self, **kwargs): - """Load and preprocess the dataset""" - try: - self.input.copy_from_remote() - train_dataset_path = os.path.join(self.input.get(), "train") - eval_dataset_path = os.path.join(self.input.get(), "eval") - self.train_dataset = self.load_dataset(train_dataset_path, **kwargs) - if self.eval: - self.eval_dataset = self.load_dataset(eval_dataset_path, **kwargs) - except Exception as e: - self.log.exception(f"Failed to preprocess data: {e}") - raise - - def load_models(self): - """Load the model and tokenizer""" - try: - # TODO: also use autoconfig to load configs - if self.model_name.lower() == "local": - self.model = getattr(__import__("transformers"), str(self.model_class)).from_pretrained( - os.path.join(self.input.get(), "/model") - ) - else: - self.model = getattr(__import__("transformers"), str(self.model_class)).from_pretrained(self.model_name) - - if self.tokenizer_name.lower() == 
"local": - self.tokenizer = getattr(__import__("transformers"), str(self.tokenizer_class)).from_pretrained( - os.path.join(self.input.get(), "/model") - ) - else: - self.tokenizer = getattr(__import__("transformers"), str(self.tokenizer_class)).from_pretrained( - self.tokenizer_name - ) - except Exception as e: - self.log.exception(f"Failed to load model: {e}") - raise - - def upload_to_hf_hub(self): - """Upload the model and tokenizer to Hugging Face Hub.""" - try: - if self.model: - self.model.push_to_hub( - repo_id=self.hf_repo_id, - commit_message=self.hf_commit_message, - token=self.hf_token, - private=self.hf_private, - create_pr=self.hf_create_pr, - ) - if self.tokenizer: - self.tokenizer.push_to_hub( - repo_id=self.hf_repo_id, - commit_message=self.hf_commit_message, - token=self.hf_token, - private=self.hf_private, - create_pr=self.hf_create_pr, - ) - except Exception as e: - self.log.exception(f"Failed to upload model to huggingface hub: {e}") - raise - - def compute_metrics(self, eval_pred: EvalPrediction) -> Optional[Dict[str, float]] | Dict[str, float]: - """ - Compute metrics for evaluation. This class implements a simple classification evaluation, tasks should ideally override this. - - Args: - eval_pred (EvalPrediction): The evaluation predictions. - - Returns: - dict: The computed metrics. 
- """ - predictions, labels = eval_pred - predictions = predictions[0] if isinstance(predictions, tuple) else predictions - labels = labels[0] if isinstance(labels, tuple) else labels - predictions = np.argmax(predictions, axis=1) - - return { - "accuracy": accuracy_score(labels, predictions), - "precision": precision_recall_fscore_support(labels, predictions, average="binary")[0], - "recall": precision_recall_fscore_support(labels, predictions, average="binary")[1], - "f1": precision_recall_fscore_support(labels, predictions, average="binary")[2], - } - - def fine_tune( - self, - model_name: str, - tokenizer_name: str, - num_train_epochs: int, - per_device_train_batch_size: int, - model_class: str = "AutoModel", - tokenizer_class: str = "AutoTokenizer", - eval: bool = False, - hf_repo_id: Optional[str] = None, - hf_commit_message: Optional[str] = None, - hf_token: Optional[str] = None, - hf_private: bool = True, - hf_create_pr: bool = False, - **kwargs, - ): - """ - Fine-tune the model. - - Args: - model_name (str): The pre-trained model name. - tokenizer_name (str): The pre-trained tokenizer name. - num_train_epochs (int): Total number of training epochs to perform. - per_device_train_batch_size (int): Batch size per device during training. - model_class (str, optional): The model class to use. Defaults to "AutoModel". - tokenizer_class (str, optional): The tokenizer class to use. Defaults to "AutoTokenizer". - eval (bool, optional): Whether to evaluate the model after training. Defaults to False. - hf_repo_id (str, optional): The Hugging Face repo ID. Defaults to None. - hf_commit_message (str, optional): The Hugging Face commit message. Defaults to None. - hf_token (str, optional): The Hugging Face token. Defaults to None. - hf_private (bool, optional): Whether to make the repo private. Defaults to True. - hf_create_pr (bool, optional): Whether to create a pull request. Defaults to False. - **kwargs: Additional keyword arguments for training. 
- - Raises: - Exception: If any step in the fine-tuning process fails. - """ - try: - # Save everything - self.model_name = model_name - self.tokenizer_name = tokenizer_name - self.output_dir = self.output.output_folder - self.num_train_epochs = num_train_epochs - self.per_device_train_batch_size = per_device_train_batch_size - self.model_class = model_class - self.tokenizer_class = tokenizer_class - self.eval = eval - self.hf_repo_id = hf_repo_id - self.hf_commit_message = hf_commit_message - self.hf_token = hf_token - self.hf_private = hf_private - self.hf_create_pr = hf_create_pr - - # Load model and tokenizer - self.load_models() - - # Load dataset - dataset_kwargs = {k.replace("data_", ""): v for k, v in kwargs.items() if "data_" in k} - self.preprocess_data(**dataset_kwargs) - - # Separate training and evaluation arguments - trainer_kwargs = {k.replace("trainer_", ""): v for k, v in kwargs.items() if "trainer_" in k} - training_kwargs = { - k.replace("data_", ""): v for k, v in kwargs.items() if "data_" not in k and "trainer" not in k - } - - # Create training arguments - training_args = TrainingArguments( - output_dir=os.path.join(self.output_dir, "model"), - num_train_epochs=num_train_epochs, - per_device_train_batch_size=per_device_train_batch_size, - **training_kwargs, - ) - - # Create trainer - trainer = Trainer( - model=self.model, - args=training_args, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset if self.eval else None, - tokenizer=self.tokenizer, - compute_metrics=self.compute_metrics, - data_collator=self.data_collator if hasattr(self, "data_collator") else None, - **trainer_kwargs, - ) - - # if self.tokenizer and trainer.tokenizer.pad_token is None: - # trainer.tokenizer.pad_token = trainer.tokenizer.eos_token - - # Lets go! 
- trainer.train() - trainer.save_model() - - if self.eval: - eval_result = trainer.evaluate() - self.log.info(f"Evaluation results: {eval_result}") - - if self.hf_repo_id: - self.upload_to_hf_hub() - except Exception as e: - self.log.exception(f"Failed to fine tune model: {e}") - self.state.set_state(self.id, {"success": False, "exception": str(e)}) - raise - self.state.set_state(self.id, {"success": True}) diff --git a/huggingface/classification.py b/huggingface/classification.py deleted file mode 100644 index e08f33a..0000000 --- a/huggingface/classification.py +++ /dev/null @@ -1,253 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -import json -import logging -import os -import sqlite3 -import xml.etree.ElementTree as ET -from typing import Optional - -import pandas as pd -import yaml # type: ignore -from datasets import Dataset, load_from_disk -from pyarrow import feather -from pyarrow import parquet as pq -from transformers import DataCollatorWithPadding - -from .base import HuggingFaceFineTuner - - -class HuggingFaceClassificationFineTuner(HuggingFaceFineTuner): - r""" - A bolt for fine-tuning Hugging Face models for text classification tasks. - - Args: - input (BatchInput): The batch input data. - output (OutputConfig): The output data. - state (State): The state manager. 
- - ## Using geniusrise to invoke via command line - ```bash - genius HuggingFaceClassificationFineTuner rise \ - batch \ - --input_folder my_dataset \ - streaming \ - --output_kafka_topic my_topic \ - --output_kafka_cluster_connection_string localhost:9094 \ - postgres \ - --postgres_host 127.0.0.1 \ - --postgres_port 5432 \ - --postgres_user postgres \ - --postgres_password postgres \ - --postgres_database geniusrise \ - --postgres_table state \ - load_dataset \ - --args dataset_path=my_dataset max_length=512 - ``` - - ## Using geniusrise to invoke via YAML file - ```yaml - version: "1" - bolts: - my_fine_tuner: - name: "HuggingFaceClassificationFineTuner" - method: "load_dataset" - args: - dataset_path: "my_dataset" - max_length: 512 - input: - type: "batch" - args: - folder: "my_dataset" - output: - type: "streaming" - args: - output_topic: "my_topic" - kafka_servers: "localhost:9094" - state: - type: "postgres" - args: - postgres_host: "127.0.0.1" - postgres_port: 5432 - postgres_user: "postgres" - postgres_password: "postgres" - postgres_database: "geniusrise" - postgres_table: "state" - deploy: - type: "k8s" - args: - name: "my_fine_tuner" - namespace: "default" - image: "my_fine_tuner_image" - replicas: 1 - ``` - """ - - def load_dataset(self, dataset_path: str, max_length: int = 512, **kwargs) -> Optional[Dataset]: - r""" - Load a classification dataset from a directory. - - Args: - dataset_path (str): The path to the dataset directory. - max_length (int, optional): The maximum length for tokenization. Defaults to 512. - - Returns: - Dataset: The loaded dataset. - - Raises: - Exception: If there was an error loading the dataset. - - ## Supported Data Formats and Structures: - - ### JSONL - Each line is a JSON object representing an example. - ```json - {"text": "The text content", "label": "The label"} - ``` - - ### CSV - Should contain 'text' and 'label' columns. 
- ```csv - text,label - "The text content","The label" - ``` - - ### Parquet - Should contain 'text' and 'label' columns. - - ### JSON - An array of dictionaries with 'text' and 'label' keys. - ```json - [{"text": "The text content", "label": "The label"}] - ``` - - ### XML - Each 'record' element should contain 'text' and 'label' child elements. - ```xml - - The text content - - - ``` - - ### YAML - Each document should be a dictionary with 'text' and 'label' keys. - ```yaml - - text: "The text content" - label: "The label" - ``` - - ### TSV - Should contain 'text' and 'label' columns separated by tabs. - - ### Excel (.xls, .xlsx) - Should contain 'text' and 'label' columns. - - ### SQLite (.db) - Should contain a table with 'text' and 'label' columns. - - ### Feather - Should contain 'text' and 'label' columns. - """ - - self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) - self.max_length = max_length - - self.label_to_id = self.model.config.label2id if self.model and self.model.config.label2id else None # type: ignore - - def tokenize_function(examples): - tokenized_data = self.tokenizer( - examples["text"], - padding="max_length", - truncation=True, - max_length=self.max_length, - ) - tokenized_data["label"] = [self.label_to_id[label] for label in examples["label"]] - return tokenized_data - - try: - logging.info(f"Loading dataset from {dataset_path}") - if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): - # Load dataset saved by Hugging Face datasets library - return load_from_disk(dataset_path).map(tokenize_function, batched=True) - else: - data = [] - for filename in os.listdir(dataset_path): - filepath = os.path.join(dataset_path, filename) - if filename.endswith(".jsonl"): - with open(filepath, "r") as f: - for line in f: - example = json.loads(line) - data.append(example) - - elif filename.endswith(".csv"): - df = pd.read_csv(filepath) - data.extend(df.to_dict("records")) - - elif filename.endswith(".parquet"): - df = 
pq.read_table(filepath).to_pandas() - data.extend(df.to_dict("records")) - - elif filename.endswith(".json"): - with open(filepath, "r") as f: - json_data = json.load(f) - data.extend(json_data) - - elif filename.endswith(".xml"): - tree = ET.parse(filepath) - root = tree.getroot() - for record in root.findall("record"): - text = record.find("text").text # type: ignore - label = record.find("label").text # type: ignore - data.append({"text": text, "label": label}) - - elif filename.endswith(".yaml") or filename.endswith(".yml"): - with open(filepath, "r") as f: - yaml_data = yaml.safe_load(f) - data.extend(yaml_data) - - elif filename.endswith(".tsv"): - df = pd.read_csv(filepath, sep="\t") - data.extend(df.to_dict("records")) - - elif filename.endswith((".xls", ".xlsx")): - df = pd.read_excel(filepath) - data.extend(df.to_dict("records")) - - elif filename.endswith(".db"): - conn = sqlite3.connect(filepath) - query = "SELECT text, label FROM dataset_table;" - df = pd.read_sql_query(query, conn) - data.extend(df.to_dict("records")) - - elif filename.endswith(".feather"): - df = feather.read_feather(filepath) - data.extend(df.to_dict("records")) - - # Create label_to_id mapping and save it in model config - unique_labels = (example["label"] for example in data) - self.label_to_id = {label: i for i, label in enumerate(unique_labels)} - if self.model: - if self.model.config.label2id != self.label_to_id: - self.log.warning("New labels detected, ignore if fine-tuning") - self.model.config.label2id = self.label_to_id - self.model.config.id2label = {i: label for label, i in self.label_to_id.items()} - - return Dataset.from_pandas(pd.DataFrame(data)).map(tokenize_function, batched=True) - except Exception as e: - logging.error(f"Error occurred when loading dataset from {dataset_path}. 
Error: {e}") - raise diff --git a/huggingface/sentiment_analysis.py b/huggingface/sentiment_analysis.py deleted file mode 100644 index 889cb13..0000000 --- a/huggingface/sentiment_analysis.py +++ /dev/null @@ -1,256 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -import json -import os -import sqlite3 -import xml.etree.ElementTree as ET -from typing import Any, Dict, List, Union - -import pandas as pd -import torch -import yaml # type: ignore -from datasets import Dataset, DatasetDict, load_from_disk -from pyarrow import feather -from pyarrow import parquet as pq -from transformers import DataCollatorWithPadding - -from .base import HuggingFaceFineTuner - - -class HuggingFaceSentimentAnalysisFineTuner(HuggingFaceFineTuner): - r""" - A bolt for fine-tuning Hugging Face models on sentiment analysis tasks. - - Args: - input (BatchInput): The batch input data. - output (OutputConfig): The output data. - state (State): The state manager. 
- - ## Using Command Line - ```bash - genius HuggingFaceSentimentAnalysisFineTuner rise \ - streaming \ - --input_kafka_topic webhook_test \ - --input_kafka_cluster_connection_string localhost:9094 \ - --input_kafka_consumer_group_id geniusrise \ - streaming \ - --output_kafka_topic webhook_test \ - --output_kafka_cluster_connection_string localhost:9094 \ - postgres \ - --postgres_host 127.0.0.1 \ - --postgres_port 5432 \ - --postgres_user postgres \ - --postgres_password postgres \ - --postgres_database geniusrise \ - --postgres_table state \ - listen \ - --args various=30 arguments=40 that=50 this=70 bolt=63 may=lol have='{"lol": "lel"}' - ``` - - ## Using YAML File - ```yaml - version: "1" - bolts: - my_fine_tuner: - name: "HuggingFaceSentimentAnalysisFineTuner" - method: "load_dataset" - args: - dataset_path: "/path/to/dataset" - input: - type: "batch" - args: - bucket: "my-bucket" - folder: "my-folder" - output: - type: "streaming" - args: - output_topic: "webhook_test" - kafka_servers: "localhost:9094" - state: - type: "postgres" - args: - postgres_host: "127.0.0.1" - postgres_port: 5432 - postgres_user: "postgres" - postgres_password: "postgres" - postgres_database: "geniusrise" - postgres_table: "state" - deploy: - type: "k8s" - args: - name: "my_fine_tuner" - namespace: "default" - image: "my_fine_tuner_image" - replicas: 1 - ``` - - Args: - model: The pre-trained model to fine-tune. - tokenizer: The tokenizer associated with the model. - input (BatchInput): The batch input data. - output (OutputConfig): The output data. - state (State): The state manager. - """ - - def load_dataset(self, dataset_path: str, **kwargs: Any) -> Dataset | DatasetDict: - r""" - Load a dataset from a directory. - - Args: - dataset_path (str): The path to the dataset directory. - **kwargs: Additional keyword arguments. - - Returns: - Dataset | DatasetDict: The loaded dataset. 
- - ## Supported Data Formats and Structures: - - ### JSONL - Each line is a JSON object representing an example. - ```json - {"text": "The text content", "label": "The label"} - ``` - - ### CSV - Should contain 'text' and 'label' columns. - ```csv - text,label - "The text content","The label" - ``` - - ### Parquet - Should contain 'text' and 'label' columns. - - ### JSON - An array of dictionaries with 'text' and 'label' keys. - ```json - [{"text": "The text content", "label": "The label"}] - ``` - - ### XML - Each 'record' element should contain 'text' and 'label' child elements. - ```xml - - The text content - - - ``` - - ### YAML - Each document should be a dictionary with 'text' and 'label' keys. - ```yaml - - text: "The text content" - label: "The label" - ``` - - ### TSV - Should contain 'text' and 'label' columns separated by tabs. - - ### Excel (.xls, .xlsx) - Should contain 'text' and 'label' columns. - - ### SQLite (.db) - Should contain a table with 'text' and 'label' columns. - - ### Feather - Should contain 'text' and 'label' columns. 
- """ - if os.path.isfile(os.path.join(dataset_path, "dataset_info.json")): - dataset = load_from_disk(dataset_path) - else: - data = [] - for filename in os.listdir(dataset_path): - filepath = os.path.join(dataset_path, filename) - if filename.endswith(".jsonl"): - with open(filepath, "r") as f: - for line in f: - example = json.loads(line) - data.append(example) - elif filename.endswith(".csv"): - df = pd.read_csv(filepath) - data.extend(df.to_dict("records")) - elif filename.endswith(".parquet"): - df = pq.read_table(filepath).to_pandas() - data.extend(df.to_dict("records")) - elif filename.endswith(".json"): - with open(filepath, "r") as f: - json_data = json.load(f) - data.extend(json_data) - elif filename.endswith(".xml"): - tree = ET.parse(filepath) - root = tree.getroot() - for record in root.findall("record"): - text = record.find("text").text # type: ignore - label = record.find("label").text # type: ignore - data.append({"text": text, "label": label}) - elif filename.endswith(".yaml") or filename.endswith(".yml"): - with open(filepath, "r") as f: - yaml_data = yaml.safe_load(f) - data.extend(yaml_data) - elif filename.endswith(".tsv"): - df = pd.read_csv(filepath, sep="\t") - data.extend(df.to_dict("records")) - elif filename.endswith((".xls", ".xlsx")): - df = pd.read_excel(filepath) - data.extend(df.to_dict("records")) - elif filename.endswith(".db"): - conn = sqlite3.connect(filepath) - query = "SELECT text, label FROM dataset_table;" - df = pd.read_sql_query(query, conn) - data.extend(df.to_dict("records")) - elif filename.endswith(".feather"): - df = feather.read_feather(filepath) - data.extend(df.to_dict("records")) - dataset = Dataset.from_pandas(pd.DataFrame(data)) - - tokenized_dataset = dataset.map( - self.prepare_train_features, - batched=True, - remove_columns=dataset.column_names, - ) - return tokenized_dataset - - def prepare_train_features(self, examples: Dict[str, Union[str, int]]) -> Dict[str, Union[List[int], int]]: - """ - Tokenize the 
examples and prepare the features for training. - - Args: - examples (Dict[str, Union[str, int]]): A dictionary of examples. - - Returns: - Dict[str, Union[List[int], int]]: The processed features. - """ - if not self.tokenizer: - raise Exception("No tokenizer found, please call load_models first.") - - tokenized_inputs = self.tokenizer(examples["text"], truncation=True, padding=False) - tokenized_inputs["labels"] = examples["label"] - return tokenized_inputs - - def data_collator( - self, examples: List[Dict[str, Union[List[int], int]]] - ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: - """ - Customize the data collator. - - Args: - examples (List[Dict[str, Union[List[int], int]]]): The examples to collate. - - Returns: - Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: The collated data. - """ - return DataCollatorWithPadding(self.tokenizer)(examples) diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..83bf3d1 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,334 @@ +absl-py==2.1.0 +accelerate==0.26.1 +aiohttp==3.8.6 +aiosignal==1.3.1 +alembic==1.13.0 +annotated-types==0.6.0 +ansicolors==1.1.8 +anyio==4.1.0 +apache-airflow==2.8.0 +apache-airflow-providers-common-sql==1.9.0 +apache-airflow-providers-docker==3.8.2 +apache-airflow-providers-ftp==3.7.0 +apache-airflow-providers-http==4.8.0 +apache-airflow-providers-imap==3.5.0 +apache-airflow-providers-sqlite==3.6.0 +apache-beam==2.48.0 +apache-flink==1.18.0 +apache-flink-libraries==1.18.0 +apispec==6.3.0 +argcomplete==3.2.1 +argparse-color-formatter==1.2.2.post2 +argparse-manpage==4.4 +asgiref==3.7.2 +async-timeout==4.0.3 +attributedict==0.3.0 +attrs==23.1.0 +auto-gptq==0.6.0 +autoawq==0.1.8 +autocommand==2.2.2 +avro-python3==1.9.2.1 +Babel==2.14.0 +backoff==2.2.1 +bitsandbytes==0.42.0 +black==23.12.1 +bleach==6.0.0 +blessings==1.7 +blinker==1.7.0 +boto3==1.34.23 +botocore==1.34.23 +build==0.10.0 +cachelib==0.9.0 +cachetools==5.3.2 +certifi==2023.11.17 
+cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.2 +cheroot==10.0.0 +CherryPy==18.9.0 +click==8.1.7 +clickclick==20.10.2 +cloudpickle==2.2.1 +codecov==2.1.13 +colorama==0.4.6 +coloredlogs==15.0.1 +colorlog==4.8.0 +colour-runner==0.1.1 +ConfigUpdater==3.2 +connexion==2.14.2 +coverage==7.3.4 +crcmod==1.7 +cron-descriptor==1.4.0 +croniter==2.0.1 +cryptography==41.0.7 +DataProperty==1.0.1 +datasets==2.16.1 +deepdiff==6.7.1 +Deprecated==1.2.14 +dill==0.3.7 +direnv==2020.12.3 +distlib==0.3.8 +dnspython==2.4.2 +docker==7.0.0 +docopt==0.6.2 +docstring-parser==0.15 +docutils==0.20.1 +einops==0.7.0 +email-validator==1.3.1 +emoji==2.7.0 +env-file==2020.12.3 +et-xmlfile==1.1.0 +evaluate==0.4.1 +exceptiongroup==1.2.0 +fastavro==1.4.7 +fasteners==0.19 +fastjsonschema==2.19.1 +filelock==3.13.1 +find-libpython==0.3.0 +flake8==6.1.0 +flash-attn==2.4.2 +Flask==2.2.5 +Flask-AppBuilder==4.3.10 +Flask-Babel==2.0.0 +Flask-Caching==2.1.0 +Flask-JWT-Extended==4.6.0 +Flask-Limiter==3.5.0 +Flask-Login==0.6.3 +Flask-Session==0.5.0 +Flask-SQLAlchemy==2.5.1 +Flask-WTF==1.2.1 +frozenlist==1.4.1 +fsspec==2023.10.0 +gekko==1.0.6 +geniusrise==0.0.33 +google-auth==2.17.3 +google-re2==1.1 +googleapis-common-protos==1.62.0 +GPUtil==1.4.0 +graphviz==0.20.1 +greenlet==3.0.2 +grpcio==1.60.0 +gunicorn==21.2.0 +h11==0.14.0 +hdfs==2.7.2 +httpcore==0.16.3 +httplib2==0.20.4 +httpx==0.23.3 +huggingface-hub==0.20.2 +humanfriendly==10.0 +idna==3.6 +importlib-metadata==6.11.0 +importlib-resources==6.1.1 +inflect==7.0.0 +inflection==0.5.1 +iniconfig==2.0.0 +inspecta==0.1.3 +itsdangerous==2.1.2 +jaraco.classes==3.3.0 +jaraco.collections==5.0.0 +jaraco.context==4.3.0 +jaraco.functools==4.0.0 +jaraco.text==3.12.0 +jeepney==0.8.0 +Jinja2==3.1.2 +jmespath==0.10.0 +joblib==1.3.2 +jsonlines==4.0.0 +jsonpickle==3.0.1 +jsonschema==4.20.0 +jsonschema-specifications==2023.11.2 +jupyter_core==5.7.1 +kafka-python==2.0.2 +keyring==24.2.0 +kubernetes==28.1.0 +lazy-object-proxy==1.10.0 +limits==3.7.0 +linkify-it-py==2.0.2 
+lm_eval==0.4.0 +lockfile==0.12.2 +lxml==5.1.0 +Mako==1.3.0 +Markdown==3.5.1 +markdown-it-py==3.0.0 +MarkupSafe==2.1.3 +marshmallow==3.20.1 +marshmallow-oneofschema==3.0.1 +marshmallow-sqlalchemy==0.26.1 +mbstrdecoder==1.1.3 +mccabe==0.7.0 +mdit-py-plugins==0.4.0 +mdurl==0.1.2 +more-itertools==10.1.0 +mpmath==1.3.0 +multidict==6.0.4 +multiprocess==0.70.15 +mypy==1.8.0 +mypy-extensions==1.0.0 +nbformat==5.9.2 +networkx==3.2.1 +ninja==1.11.1.1 +nltk==3.8.1 +numexpr==2.8.8 +numpy==1.26.3 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.18.1 +nvidia-nvjitlink-cu12==12.3.101 +nvidia-nvtx-cu12==12.1.105 +oauthlib==3.2.2 +objsize==0.6.1 +openpyxl==3.1.2 +opentelemetry-api==1.22.0 +opentelemetry-exporter-otlp==1.22.0 +opentelemetry-exporter-otlp-proto-common==1.22.0 +opentelemetry-exporter-otlp-proto-grpc==1.22.0 +opentelemetry-exporter-otlp-proto-http==1.22.0 +opentelemetry-proto==1.22.0 +opentelemetry-sdk==1.22.0 +opentelemetry-semantic-conventions==0.43b0 +ordered-set==4.1.0 +orjson==3.9.7 +packaging==23.2 +pandas==1.3.5 +pathspec==0.12.1 +pathvalidate==3.2.0 +peft==0.7.1 +pemja==0.3.0 +pendulum==2.1.2 +pillow==10.2.0 +pip-autoremove==0.10.0 +pipdeptree==2.13.1 +pkginfo==1.9.6 +platformdirs==4.1.0 +pluggy==1.3.0 +portalocker==2.8.2 +portend==3.2.0 +prettytable==3.9.0 +prison==0.2.1 +prometheus-client==0.19.0 +proto-plus==1.22.3 +protobuf==4.23.4 +psutil==5.9.6 +psycopg2==2.9.9 +py4j==0.10.9.7 +pyarrow==8.0.0 +pyarrow-hotfix==0.6 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pybind11==2.11.1 +pycodestyle==2.11.0 +pycparser==2.21 +pydantic==2.5.2 +pydantic_core==2.14.5 +pydot==1.4.2 +pyflakes==3.1.0 +Pygments==2.17.2 +PyJWT==2.8.0 +pymongo==3.13.0 +pyparsing==3.1.1 +pyproject-api==1.6.1 +pyproject_hooks==1.0.0 
+pyspark==3.5.0 +pytablewriter==1.2.0 +pytest==7.4.3 +pytest-asyncio==0.21.1 +python-daemon==3.0.1 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +python-nvd3==0.15.0 +python-slugify==8.0.1 +pytz==2023.3.post1 +pytzdata==2020.1 +PyYAML==6.0.1 +readme-renderer==40.0 +redis==5.0.1 +referencing==0.32.0 +regex==2023.8.8 +requests==2.31.0 +requests-oauthlib==1.3.1 +requests-toolbelt==1.0.0 +responses==0.18.0 +retrying==1.3.4 +rfc3339-validator==0.1.4 +rfc3986==1.5.0 +rich==13.7.0 +rich-argparse==1.4.0 +rootpath==0.1.1 +rouge==1.0.1 +rouge-score==0.1.2 +rpds-py==0.13.2 +rsa==4.9 +s3transfer==0.10.0 +sacrebleu==2.4.0 +sacremoses==0.1.1 +safetensors==0.4.1 +scikit-learn==1.4.0 +scipy==1.12.0 +SecretStorage==3.3.3 +sentence-transformers==2.2.2 +sentencepiece==0.1.99 +setproctitle==1.3.3 +shortuuid==1.0.11 +shtab==1.6.5 +six==1.16.0 +sniffio==1.3.0 +SQLAlchemy==1.4.50 +SQLAlchemy-JSONField==1.0.2 +SQLAlchemy-Utils==0.41.1 +sqlitedict==2.1.0 +sqlparse==0.4.4 +streamz==0.6.4 +sympy==1.12 +tabledata==1.3.3 +tabulate==0.9.0 +tcolorpy==0.1.4 +tempora==5.5.0 +tenacity==8.2.3 +termcolor==2.4.0 +text-unidecode==1.3 +texttable==1.7.0 +threadpoolctl==3.2.0 +tokenizers==0.15.0 +toml==0.10.2 +tomli==2.0.1 +toolz==0.12.0 +torch==2.1.2 +torchvision==0.16.2 +tornado==6.3.3 +tox==4.12.1 +tqdm==4.66.1 +tqdm-multiprocess==0.0.11 +traitlets==5.14.1 +transformers==4.36.2 +triton==2.1.0 +trl==0.7.10 +twine==4.0.2 +typepy==1.3.2 +typing_extensions==4.9.0 +tyro==0.6.6 +tzdata==2023.3 +uc-micro-py==1.0.2 +unicodecsv==0.14.1 +universal-pathlib==0.1.4 +urllib3==1.26.18 +values==2020.12.3 +virtualenv==20.25.0 +wcwidth==0.2.6 +webencodings==0.5.1 +websocket-client==1.6.1 +Werkzeug==2.2.3 +wrapt==1.16.0 +WTForms==3.1.1 +xxhash==3.4.1 +yarl==1.9.4 +zc.lockfile==3.0.post1 +zict==3.0.0 +zipp==3.17.0 +zstandard==0.21.0 diff --git a/requirements.txt b/requirements.txt index ef38b0f..69e3aa2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,147 +1,35 @@ -absl-py==1.4.0 -accelerate==0.22.0 
-aiohttp==3.8.5 -aiosignal==1.3.1 -annotated-types==0.5.0 -ansicolors==1.1.8 -argparse-color-formatter==1.2.2.post2 -async-timeout==4.0.3 -attrs==23.1.0 -bleach==6.0.0 -boto3==1.28.25 -botocore==1.31.25 -build==0.10.0 -cachetools==5.3.1 -certifi==2023.7.22 -cffi==1.15.1 -charset-normalizer==3.2.0 -click==8.1.7 -cmake==3.27.2 -colorama==0.4.6 -colorlog==6.7.0 -coverage==7.3.0 -cryptography==41.0.3 -datasets==2.14.4 -dill==0.3.7 -direnv==2020.12.3 -docutils==0.20.1 -emoji==2.7.0 -env-file==2020.12.3 -et-xmlfile==1.1.0 -evaluate==0.4.0 -exceptiongroup==1.1.2 -filelock==3.12.2 -flake8==6.1.0 -frozenlist==1.4.0 -fsspec==2023.6.0 -geniusrise==0.0.1 -google-auth==2.17.3 -huggingface-hub==0.16.4 -idna==3.4 -importlib-metadata==6.8.0 -iniconfig==2.0.0 -jaraco.classes==3.3.0 -jeepney==0.8.0 -Jinja2==3.1.2 -jmespath==0.10.0 -joblib==1.3.2 -jsonpickle==3.0.1 -kafka-python==2.0.2 -keyring==24.2.0 -kubernetes==27.2.0 -lit==16.0.6 -lxml==4.9.3 -markdown-it-py==3.0.0 -MarkupSafe==2.1.3 -mccabe==0.7.0 -mdurl==0.1.2 -more-itertools==10.1.0 -mpmath==1.3.0 -multidict==6.0.4 -multiprocess==0.70.15 -mypy==1.5.0 -mypy-extensions==1.0.0 -networkx==3.1 -nltk==3.8.1 -numpy==1.25.2 -nvidia-cublas-cu11==11.10.3.66 -nvidia-cuda-cupti-cu11==11.7.101 -nvidia-cuda-nvrtc-cu11==11.7.99 -nvidia-cuda-runtime-cu11==11.7.99 -nvidia-cudnn-cu11==8.5.0.96 -nvidia-cufft-cu11==10.9.0.58 -nvidia-curand-cu11==10.2.10.91 -nvidia-cusolver-cu11==11.4.0.1 -nvidia-cusparse-cu11==11.7.4.91 -nvidia-nccl-cu11==2.14.3 -nvidia-nvtx-cu11==11.7.91 -oauthlib==3.2.2 -openpyxl==3.1.2 -packaging==23.1 -pandas==2.0.3 -pkginfo==1.9.6 -pluggy==1.2.0 -portalocker==2.7.0 -prettytable==3.8.0 -psutil==5.9.5 -psycopg2==2.9.7 -pyarrow==13.0.0 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pycodestyle==2.11.0 -pycparser==2.21 -pydantic==2.1.1 -pydantic_core==2.4.0 -pyflakes==3.1.0 -Pygments==2.16.1 -pyproject_hooks==1.0.0 -pytest==7.4.0 -pytest-cov==4.1.0 -python-dateutil==2.8.2 -pytz==2023.3 -PyYAML==6.0.1 -readme-renderer==40.0 
-redis==4.6.0 -regex==2023.8.8 -requests==2.31.0 -requests-oauthlib==1.3.1 -requests-toolbelt==1.0.0 -responses==0.18.0 -retrying==1.3.4 -rfc3986==2.0.0 -rich==13.5.2 -rich-argparse==1.3.0 +geniusrise==0.1.7 +torch==2.1.2 +accelerate==0.27.2 +transformers==4.38.1 +evaluate==0.4.1 +datasets==2.16.1 +autoawq==0.2.2 +auto-gptq==0.7.0 +bitsandbytes==0.42.0 +CherryPy==18.9.0 +flash-attn==2.4.2 +peft==0.7.1 +gunicorn==21.2.0 +trl==0.7.10 +sentence_transformers==2.2.2 +nbformat==5.9.2 +sacremoses==0.1.1 +sacrebleu==2.4.0 rouge-score==0.1.2 -rsa==4.9 -s3transfer==0.6.1 -sacrebleu==2.3.1 -sacremoses==0.0.53 -safetensors==0.3.3 -scikit-learn==1.3.0 -scipy==1.11.2 -SecretStorage==3.3.3 +rouge==1.0.1 +openpyxl==3.1.2 sentencepiece==0.1.99 -shortuuid==1.0.11 -six==1.16.0 -sympy==1.12 -tabulate==0.9.0 -termcolor==2.3.0 -threadpoolctl==3.2.0 -tokenizers==0.13.3 -tomli==2.0.1 -torch==2.0.1 -tqdm==4.66.1 -transformers==4.32.0 -triton==2.0.0 -twine==4.0.2 -types-PyYAML==6.0.12.11 -typing_extensions==4.7.1 -tzdata==2023.3 -urllib3==1.26.16 -values==2020.12.3 -wcwidth==0.2.6 -webencodings==0.5.1 -websocket-client==1.6.1 -xxhash==3.3.0 -yarl==1.9.2 -zipp==3.16.2 +safetensors==0.4.1 +einops==0.7.0 +packaging==23.2 +optimum==1.16.2 +textwrap3==0.9.2 +nltk==3.8.1 +pyarrow>=9.0.0 +absl-py==2.1.0 +pandas==1.3.5 +multiprocess==0.70.15 +optimum==1.16.2 +vllm==0.3.2 +llama_cpp_python==0.2.50 diff --git a/setup.cfg b/setup.cfg index c092ec3..54e5d5b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [flake8] exclude = */__init__.py,migrations/* -ignore = E111, E114, E121, E131, W503, F405, F403, E126, E501, F841, E124, E251, E203, C419, B008, W505, N805, A003, B006, A002, B009, N817, A001 +ignore = E111, E114, E121, E131, W503, F405, F403, E126, E501, F841, E124, E251, E203, C419, B008, W505, N805, A003, B006, A002, B009, N817, A001, N802, N806, E741, C416, C400 max-line-length = 120 max-doc-length = 120 show-source = true @@ -21,3 +21,7 @@ disallow_untyped_calls = False [tool:pytest] 
addopts = -p no:warnings ignore = tests + +[tool.ruff] +indent-width = 4 +line-length = 120 diff --git a/setup.py b/setup.py index 3509e5a..3b8e77d 100644 --- a/setup.py +++ b/setup.py @@ -3,21 +3,42 @@ with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() +with open("requirements.txt", "r", encoding="utf-8") as f: + requirements = f.read().splitlines() + setup( - name="geniusrise-huggingface", - version="0.1.0", + name="geniusrise-text", + version="0.1.12", packages=find_packages(exclude=["tests", "tests.*"]), - install_requires=[], + install_requires=requirements, python_requires=">=3.10", author="ixaxaar", author_email="ixaxaar@geniusrise.ai", description="Huggingface bolts for geniusrise", long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/ixaxaar/huggingface-bolts", + url="https://github.com/geniusrise/geniusrise-text", classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Topic :: Software Development :: Build Tools", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Operating System :: OS Independent", ], + keywords="mlops, llm, geniusrise, machine learning, data processing", + project_urls={ + "Bug Reports": "https://github.com/geniusrise/geniusrise-text/issues", + "Source": "https://github.com/geniusrise/geniusrise-text", + "Documentation": "https://docs.geniusrise.ai/", + }, + package_data={ + "geniusrise": [], + }, + extras_require={ + "dev": ["check-manifest"], + "test": ["coverage"], + }, ) diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index 7e7658f..0000000 --- a/tests/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program 
is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . diff --git a/tests/test_base.py b/tests/test_base.py deleted file mode 100644 index c76dc7d..0000000 --- a/tests/test_base.py +++ /dev/null @@ -1,127 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -import os -import tempfile - -import numpy as np -import pytest -from datasets import load_dataset -from geniusrise.core import BatchInput, BatchOutput, InMemoryState -from transformers import EvalPrediction - -from huggingface.base import HuggingFaceFineTuner - - -class TestHuggingFaceFineTuner(HuggingFaceFineTuner): - def load_dataset(self, dataset_path, **kwargs): - dataset = load_dataset("glue", "mrpc", split="train[:100]") - dataset = dataset.map( - lambda examples: self.tokenizer( - examples["sentence1"], - examples["sentence2"], - truncation=True, - padding="max_length", - max_length=512, - ), - batched=True, - ).map(lambda examples: {"labels": examples["label"]}, batched=True) - print(dataset) - return dataset - - -@pytest.fixture -def bolt(): - input_dir = tempfile.mkdtemp() - output_dir = tempfile.mkdtemp() - - input = BatchInput(input_dir, "geniusrise-test-bucket", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test-bucket", "test-🤗-output") - state = InMemoryState() - - return TestHuggingFaceFineTuner( - input=input, - output=output, - state=state, - eval=False, - ) - - -def test_bolt_init(bolt): - assert bolt.input is not None - assert bolt.output is not None - assert bolt.state is not None - - -def test_load_dataset(bolt): - bolt.model_name = "bert-base-uncased" - bolt.tokenizer_name = "bert-base-uncased" - bolt.model_class = "BertForSequenceClassification" - bolt.tokenizer_class = "BertTokenizer" - bolt.load_models() - dataset = bolt.load_dataset("fake_path") - assert dataset is not None - assert len(dataset) == 100 - - -def test_fine_tune(bolt): - bolt.fine_tune( - model_name="bert-base-uncased", - tokenizer_name="bert-base-uncased", - num_train_epochs=1, - per_device_train_batch_size=1, - model_class="BertForSequenceClassification", - tokenizer_class="BertTokenizer", - eval=False, - ) - - # Check that model files are created in the output directory - assert os.path.isfile(os.path.join(bolt.output.output_folder, "model", 
"pytorch_model.bin")) - assert os.path.isfile(os.path.join(bolt.output.output_folder, "model", "config.json")) - assert os.path.isfile(os.path.join(bolt.output.output_folder, "model", "training_args.bin")) - - -def test_compute_metrics(bolt): - # Mocking an EvalPrediction object - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([0, 1]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - - metrics = bolt.compute_metrics(eval_pred) - - assert "accuracy" in metrics - assert "precision" in metrics - assert "recall" in metrics - assert "f1" in metrics - - -def test_upload_to_hf_hub(bolt): - bolt.fine_tune( - model_name="bert-base-uncased", - tokenizer_name="bert-base-uncased", - num_train_epochs=1, - per_device_train_batch_size=1, - model_class="BertForSequenceClassification", - tokenizer_class="BertTokenizer", - eval=False, - hf_repo_id="ixaxaar/geniusrise-hf-base-test-repo", - hf_commit_message="testing base fine tuner", - hf_token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"), - hf_private=False, - hf_create_pr=True, - ) - - assert True diff --git a/tests/test_commonsense_reasoning.py b/tests/test_commonsense_reasoning.py deleted file mode 100644 index 545a0d2..0000000 --- a/tests/test_commonsense_reasoning.py +++ /dev/null @@ -1,172 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -import json -import os -import sqlite3 -import tempfile -import xml.etree.ElementTree as ET - -import numpy as np -import pandas as pd -import pytest -import yaml # type: ignore -from datasets import Dataset -from geniusrise.core import BatchInput, BatchOutput, InMemoryState -from pyarrow import feather -from pyarrow import parquet as pq -from transformers import EvalPrediction - -from huggingface import HuggingFaceCommonsenseReasoningFineTuner - - -# Helper function to create synthetic data in different formats -def create_dataset_in_format(directory, ext): - os.makedirs(directory, exist_ok=True) - data = [{"premise": f"premise_{i}", "hypothesis": f"hypothesis_{i}", "label": i % 2} for i in range(10)] - df = pd.DataFrame(data) - - if ext == "huggingface": - dataset = Dataset.from_pandas(df) - dataset.save_to_disk(directory) - elif ext == "csv": - df.to_csv(os.path.join(directory, "data.csv"), index=False) - elif ext == "jsonl": - with open(os.path.join(directory, "data.jsonl"), "w") as f: - for item in data: - f.write(json.dumps(item) + "\n") - elif ext == "parquet": - pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) - elif ext == "json": - with open(os.path.join(directory, "data.json"), "w") as f: - json.dump(data, f) - elif ext == "xml": - root = ET.Element("root") - for item in data: - record = ET.SubElement(root, "record") - ET.SubElement(record, "premise").text = item["premise"] - ET.SubElement(record, "hypothesis").text = item["hypothesis"] - ET.SubElement(record, "label").text = str(item["label"]) - tree = ET.ElementTree(root) - tree.write(os.path.join(directory, "data.xml")) - elif ext == "yaml": - with open(os.path.join(directory, "data.yaml"), "w") as f: - yaml.dump(data, f) - elif ext == "tsv": - df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") - elif ext == "xlsx": - df.to_excel(os.path.join(directory, "data.xlsx"), index=False) - elif ext == "db": - conn = 
sqlite3.connect(os.path.join(directory, "data.db")) - df.to_sql("dataset_table", conn, if_exists="replace", index=False) - conn.close() - elif ext == "feather": - feather.write_feather(df, os.path.join(directory, "data.feather")) - - -# Fixtures for each file type -@pytest.fixture( - params=[ - "huggingface", - "csv", - "jsonl", - "parquet", - "json", - "xml", - "yaml", - "tsv", - "xlsx", - "db", - "feather", - ] -) -def dataset_file(request, tmpdir): - ext = request.param - create_dataset_in_format(tmpdir + "/train", ext) - create_dataset_in_format(tmpdir + "/eval", ext) - return tmpdir, ext - - -@pytest.fixture -def commonsense_bolt(): - input_dir = tempfile.mkdtemp() - output_dir = tempfile.mkdtemp() - input = BatchInput(input_dir, "geniusrise-test-bucket", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test-bucket", "test-🤗-output") - state = InMemoryState() - klass = HuggingFaceCommonsenseReasoningFineTuner( - input=input, - output=output, - state=state, - ) - klass.model_class = "BertForSequenceClassification" - klass.model_name = "bert-base-uncased" - klass.tokenizer_class = "BertTokenizer" - klass.tokenizer_name = "bert-base-uncased" - return klass - - -def test_commonsense_bolt_init(commonsense_bolt): - commonsense_bolt.load_models() - - assert commonsense_bolt.model is not None - assert commonsense_bolt.tokenizer is not None - assert commonsense_bolt.input is not None - assert commonsense_bolt.output is not None - assert commonsense_bolt.state is not None - - -def test_load_dataset_all_formats(commonsense_bolt, dataset_file): - tmpdir, ext = dataset_file - dataset_path = os.path.join(tmpdir, "train") - - commonsense_bolt.load_models() - dataset = commonsense_bolt.load_dataset(dataset_path) - assert dataset is not None - assert len(dataset) == 10 - - -# Test for fine-tuning -def test_commonsense_bolt_fine_tune(commonsense_bolt, dataset_file): - tmpdir, ext = dataset_file - commonsense_bolt.input.input_folder = tmpdir - - 
commonsense_bolt.fine_tune( - model_name="bert-base-uncased", - tokenizer_name="bert-base-uncased", - num_train_epochs=1, - per_device_train_batch_size=1, - model_class="BertForSequenceClassification", - tokenizer_class="BertTokenizer", - eval=True, - ) - - output_dir = commonsense_bolt.output.output_folder - assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin")) - assert os.path.isfile(os.path.join(output_dir + "/model", "config.json")) - assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin")) - - -# Test for computing metrics -def test_commonsense_bolt_compute_metrics(commonsense_bolt): - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([0, 1]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - metrics = commonsense_bolt.compute_metrics(eval_pred) - assert "accuracy" in metrics - assert "precision" in metrics - assert "recall" in metrics - assert "f1" in metrics diff --git a/tests/test_instruction_tuning.py b/tests/test_instruction_tuning.py deleted file mode 100644 index eadf941..0000000 --- a/tests/test_instruction_tuning.py +++ /dev/null @@ -1,169 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -import json -import os -import sqlite3 -import tempfile -import xml.etree.ElementTree as ET - -import numpy as np -import pandas as pd -import pytest -import yaml # type: ignore -from datasets import Dataset -from geniusrise.core import BatchInput, BatchOutput, InMemoryState -from pyarrow import feather -from pyarrow import parquet as pq -from transformers import EvalPrediction - -from huggingface import HuggingFaceInstructionTuningFineTuner - - -# Helper function to create synthetic data in different formats -def create_dataset_in_format(directory, ext): - os.makedirs(directory, exist_ok=True) - data = [{"instruction": f"instruction_{i}", "output": f"output_{i}"} for i in range(10)] - df = pd.DataFrame(data) - - if ext == "huggingface": - dataset = Dataset.from_pandas(df) - dataset.save_to_disk(directory) - elif ext == "csv": - df.to_csv(os.path.join(directory, "data.csv"), index=False) - elif ext == "jsonl": - with open(os.path.join(directory, "data.jsonl"), "w") as f: - for item in data: - f.write(json.dumps(item) + "\n") - elif ext == "parquet": - pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) - elif ext == "json": - with open(os.path.join(directory, "data.json"), "w") as f: - json.dump(data, f) - elif ext == "xml": - root = ET.Element("root") - for item in data: - record = ET.SubElement(root, "record") - ET.SubElement(record, "instruction").text = item["instruction"] - ET.SubElement(record, "output").text = item["output"] - tree = ET.ElementTree(root) - tree.write(os.path.join(directory, "data.xml")) - elif ext == "yaml": - with open(os.path.join(directory, "data.yaml"), "w") as f: - yaml.dump(data, f) - elif ext == "tsv": - df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") - elif ext == "xlsx": - df.to_excel(os.path.join(directory, "data.xlsx"), index=False) - elif ext == "db": - conn = sqlite3.connect(os.path.join(directory, "data.db")) - df.to_sql("dataset_table", conn, if_exists="replace", 
index=False) - conn.close() - elif ext == "feather": - feather.write_feather(df, os.path.join(directory, "data.feather")) - - -# Fixtures for each file type -@pytest.fixture( - params=[ - "huggingface", - "csv", - "jsonl", - "parquet", - "json", - "xml", - "yaml", - "tsv", - "xlsx", - "db", - "feather", - ] -) -def dataset_file(request, tmpdir): - ext = request.param - create_dataset_in_format(tmpdir + "/train", ext) - create_dataset_in_format(tmpdir + "/eval", ext) - return tmpdir, ext - - -@pytest.fixture -def instruction_tuning_bolt(): - input_dir = tempfile.mkdtemp() - output_dir = tempfile.mkdtemp() - input = BatchInput(input_dir, "geniusrise-test-bucket", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test-bucket", "test-🤗-output") - state = InMemoryState() - klass = HuggingFaceInstructionTuningFineTuner( - input=input, - output=output, - state=state, - ) - klass.model_class = "BartForConditionalGeneration" - klass.model_name = "facebook/bart-base" - klass.tokenizer_class = "BartTokenizer" - klass.tokenizer_name = "facebook/bart-base" - return klass - - -def test_instruction_tuning_bolt_init(instruction_tuning_bolt): - instruction_tuning_bolt.load_models() - - assert instruction_tuning_bolt.model is not None - assert instruction_tuning_bolt.tokenizer is not None - assert instruction_tuning_bolt.input is not None - assert instruction_tuning_bolt.output is not None - assert instruction_tuning_bolt.state is not None - - -def test_load_dataset_all_formats(instruction_tuning_bolt, dataset_file): - tmpdir, ext = dataset_file - dataset_path = os.path.join(tmpdir, "train") - - instruction_tuning_bolt.load_models() - dataset = instruction_tuning_bolt.load_dataset(dataset_path) - assert dataset is not None - assert len(dataset) == 10 - - -# Test for fine-tuning -def test_instruction_tuning_bolt_fine_tune(instruction_tuning_bolt, dataset_file): - tmpdir, ext = dataset_file - instruction_tuning_bolt.input.input_folder = tmpdir - - 
instruction_tuning_bolt.fine_tune( - model_name="facebook/bart-base", - tokenizer_name="facebook/bart-base", - num_train_epochs=1, - per_device_train_batch_size=1, - model_class="BartForConditionalGeneration", - tokenizer_class="BartTokenizer", - eval=True, - ) - - output_dir = instruction_tuning_bolt.output.output_folder - assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin")) - assert os.path.isfile(os.path.join(output_dir + "/model", "config.json")) - assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin")) - - -# Test for computing metrics -def test_instruction_tuning_bolt_compute_metrics(instruction_tuning_bolt): - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([0, 1]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - instruction_tuning_bolt.load_models() - metrics = instruction_tuning_bolt.compute_metrics(eval_pred) - assert "bleu" in metrics diff --git a/tests/test_language_model.py b/tests/test_language_model.py deleted file mode 100644 index d0e30c6..0000000 --- a/tests/test_language_model.py +++ /dev/null @@ -1,165 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -import json -import os -import sqlite3 -import tempfile -import xml.etree.ElementTree as ET - -import numpy as np -import pandas as pd -import pytest -import yaml # type: ignore -from datasets import Dataset -from geniusrise.core import BatchInput, BatchOutput, InMemoryState -from pyarrow import feather -from pyarrow import parquet as pq -from transformers import EvalPrediction - -from huggingface.language_model import HuggingFaceLanguageModelingFineTuner - - -# Helper function to create synthetic data in different formats -def create_dataset_in_format(directory, ext): - os.makedirs(directory, exist_ok=True) - data = [{"text": f"text_{i}"} for i in range(10)] - df = pd.DataFrame(data) - - if ext == "huggingface": - dataset = Dataset.from_pandas(df) - dataset.save_to_disk(directory) - elif ext == "csv": - df.to_csv(os.path.join(directory, "data.csv"), index=False) - elif ext == "jsonl": - with open(os.path.join(directory, "data.jsonl"), "w") as f: - for item in data: - f.write(json.dumps(item) + "\n") - elif ext == "parquet": - pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) - elif ext == "json": - with open(os.path.join(directory, "data.json"), "w") as f: - json.dump(data, f) - elif ext == "xml": - root = ET.Element("root") - for item in data: - record = ET.SubElement(root, "record") - ET.SubElement(record, "text").text = item["text"] - tree = ET.ElementTree(root) - tree.write(os.path.join(directory, "data.xml")) - elif ext == "yaml": - with open(os.path.join(directory, "data.yaml"), "w") as f: - yaml.dump(data, f) - elif ext == "tsv": - df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") - elif ext == "xlsx": - df.to_excel(os.path.join(directory, "data.xlsx"), index=False) - elif ext == "db": - conn = sqlite3.connect(os.path.join(directory, "data.db")) - df.to_sql("dataset_table", conn, if_exists="replace", index=False) - conn.close() - elif ext == "feather": - feather.write_feather(df, 
os.path.join(directory, "data.feather")) - - -# Fixtures for each file type -@pytest.fixture( - params=[ - "huggingface", - "csv", - "json", - "jsonl", - "parquet", - "xml", - "yaml", - "tsv", - "xlsx", - "db", - "feather", - ] -) -def dataset_file(request, tmpdir): - ext = request.param - create_dataset_in_format(tmpdir + "/train", ext) - create_dataset_in_format(tmpdir + "/test", ext) - return tmpdir, ext - - -@pytest.fixture -def language_modeling_bolt(): - input_dir = tempfile.mkdtemp() - output_dir = tempfile.mkdtemp() - input = BatchInput(input_dir, "geniusrise-test-bucket", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test-bucket", "test-🤗-output") - state = InMemoryState() - klass = HuggingFaceLanguageModelingFineTuner( - input=input, - output=output, - state=state, - ) - klass.model_class = "BertForMaskedLM" - klass.tokenizer_class = "BertTokenizer" - klass.model_name = "bert-base-uncased" - klass.tokenizer_name = "bert-base-uncased" - return klass - - -def test_language_modeling_bolt_init(language_modeling_bolt): - language_modeling_bolt.load_models() - assert language_modeling_bolt.model is not None - assert language_modeling_bolt.tokenizer is not None - assert language_modeling_bolt.input is not None - assert language_modeling_bolt.output is not None - assert language_modeling_bolt.state is not None - - -def test_load_dataset_all_formats(language_modeling_bolt, dataset_file): - tmpdir, ext = dataset_file - dataset_path = os.path.join(tmpdir, "train") - language_modeling_bolt.load_models() - dataset = language_modeling_bolt.load_dataset(dataset_path) - assert dataset is not None - assert len(dataset) == 10 - - -def test_language_modeling_bolt_fine_tune(language_modeling_bolt, dataset_file): - tmpdir, ext = dataset_file - language_modeling_bolt.input.input_folder = tmpdir - - language_modeling_bolt.fine_tune( - model_name="bert-base-uncased", - tokenizer_name="bert-base-uncased", - model_class="BertForMaskedLM", - 
tokenizer_class="AutoTokenizer", - num_train_epochs=1, - per_device_train_batch_size=1, - data_masked=True, - ) - output_dir = language_modeling_bolt.output.output_folder - assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin")) - assert os.path.isfile(os.path.join(output_dir + "/model", "config.json")) - assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin")) - - -def test_language_modeling_bolt_compute_metrics(language_modeling_bolt): - language_modeling_bolt.load_models() - - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([[0, 1], [1, 0]]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - metrics = language_modeling_bolt.compute_metrics(eval_pred) - assert "bleu" in metrics - assert "sacrebleu" in metrics diff --git a/tests/test_ner.py b/tests/test_ner.py deleted file mode 100644 index 7fe34ff..0000000 --- a/tests/test_ner.py +++ /dev/null @@ -1,173 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -import json -import os -import sqlite3 -import tempfile -import xml.etree.ElementTree as ET - -import numpy as np -import pandas as pd -import pytest -import yaml # type: ignore -from datasets import Dataset -from geniusrise.core import BatchInput, BatchOutput, InMemoryState -from pyarrow import feather -from pyarrow import parquet as pq -from transformers import EvalPrediction - -from huggingface.ner import HuggingFaceNamedEntityRecognitionFineTuner - - -# Helper function to create synthetic data in different formats -def create_dataset_in_format(directory, ext): - os.makedirs(directory, exist_ok=True) - data = [{"tokens": ["This", "is", "a", "test"], "ner_tags": [0, 1, 0, 1]} for _ in range(10)] - df = pd.DataFrame(data) - - if ext == "huggingface": - dataset = Dataset.from_pandas(df) - dataset.save_to_disk(directory) - elif ext == "csv": - df.to_csv(os.path.join(directory, "data.csv"), index=False) - elif ext == "jsonl": - with open(os.path.join(directory, "data.jsonl"), "w") as f: - for item in data: - f.write(json.dumps(item) + "\n") - elif ext == "parquet": - pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) - elif ext == "json": - with open(os.path.join(directory, "data.json"), "w") as f: - json.dump(data, f) - elif ext == "xml": - root = ET.Element("root") - for item in data: - record = ET.SubElement(root, "record") - ET.SubElement(record, "tokens").text = " ".join(item["tokens"]) - ET.SubElement(record, "ner_tags").text = " ".join(map(str, item["ner_tags"])) - tree = ET.ElementTree(root) - tree.write(os.path.join(directory, "data.xml")) - elif ext == "yaml": - with open(os.path.join(directory, "data.yaml"), "w") as f: - yaml.dump(data, f) - elif ext == "tsv": - df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") - elif ext == "xlsx": - df.to_excel(os.path.join(directory, "data.xlsx"), index=False) - elif ext == "db": - conn = sqlite3.connect(os.path.join(directory, "data.db")) - 
df.to_sql("dataset_table", conn, if_exists="replace", index=False) - conn.close() - elif ext == "feather": - feather.write_feather(df, os.path.join(directory, "data.feather")) - - -# Fixtures for each file type -@pytest.fixture( - params=[ - "huggingface", - "csv", - "jsonl", - "parquet", - "json", - "xml", - "yaml", - "tsv", - "xlsx", - "feather", - ] -) -def dataset_file(request, tmpdir): - ext = request.param - create_dataset_in_format(tmpdir + "/train", ext) - create_dataset_in_format(tmpdir + "/eval", ext) - return tmpdir, ext - - -@pytest.fixture -def ner_bolt(): - input_dir = tempfile.mkdtemp() - output_dir = tempfile.mkdtemp() - - input = BatchInput(input_dir, "geniusrise-test-bucket", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test-bucket", "test-🤗-output") - state = InMemoryState() - - klass = HuggingFaceNamedEntityRecognitionFineTuner( - input=input, - output=output, - state=state, - ) - klass.model_class = "BertForTokenClassification" - klass.model_name = "bert-base-uncased" - klass.tokenizer_class = "BertTokenizerFast" - klass.tokenizer_name = "bert-base-uncased" - - return klass - - -def test_ner_bolt_init(ner_bolt): - ner_bolt.load_models() - - assert ner_bolt.model is not None - assert ner_bolt.tokenizer is not None - assert ner_bolt.input is not None - assert ner_bolt.output is not None - assert ner_bolt.state is not None - - -def test_load_dataset_all_formats(ner_bolt, dataset_file): - tmpdir, ext = dataset_file - dataset_path = os.path.join(tmpdir, "train") - - ner_bolt.load_models() - dataset = ner_bolt.load_dataset(dataset_path, label_list=[0, 1]) - assert dataset is not None - assert len(dataset) == 10 - - -# Test for fine-tuning -def test_ner_bolt_fine_tune(ner_bolt, dataset_file): - tmpdir, ext = dataset_file - ner_bolt.input.input_folder = tmpdir - - ner_bolt.fine_tune( - num_train_epochs=1, - per_device_train_batch_size=1, - model_class="BertForTokenClassification", - model_name="bert-base-uncased", - 
tokenizer_class="BertTokenizerFast", - tokenizer_name="bert-base-uncased", - data_label_list=[0, 1], - ) - - output_dir = ner_bolt.output.output_folder - assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin")) - assert os.path.isfile(os.path.join(output_dir + "/model", "config.json")) - assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin")) - - -# Test for computing metrics -def test_ner_bolt_compute_metrics(ner_bolt): - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([0, 1]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - metrics = ner_bolt.compute_metrics(eval_pred) - assert "accuracy" in metrics - assert "precision" in metrics - assert "recall" in metrics - assert "f1" in metrics diff --git a/tests/test_question_answering.py b/tests/test_question_answering.py deleted file mode 100644 index cee7615..0000000 --- a/tests/test_question_answering.py +++ /dev/null @@ -1,162 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -import os -import tempfile -import pytest -import numpy as np -import pandas as pd -import json -import sqlite3 -import xml.etree.ElementTree as ET -import yaml # type: ignore -from datasets import Dataset -from pyarrow import feather, parquet as pq -from huggingface import HuggingFaceQuestionAnsweringFineTuner -from geniusrise.core import BatchInput, BatchOutput, InMemoryState -from transformers import EvalPrediction - - -# Helper function to create synthetic data in different formats -def create_dataset_in_format(directory, ext): - os.makedirs(directory, exist_ok=True) - data = [ - { - "context": f"context_{i}", - "question": f"question_{i}", - "answers": [{"answer_start": [0], "text": [f"answer_{i}"]}], - } - for i in range(10) - ] - df = pd.DataFrame(data) - - if ext == "huggingface": - dataset = Dataset.from_pandas(df) - dataset.save_to_disk(directory) - elif ext == "csv": - df.to_csv(os.path.join(directory, "data.csv"), index=False) - elif ext == "jsonl": - with open(os.path.join(directory, "data.jsonl"), "w") as f: - for item in data: - f.write(json.dumps(item) + "\n") - elif ext == "parquet": - pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) - elif ext == "json": - with open(os.path.join(directory, "data.json"), "w") as f: - json.dump(data, f) - elif ext == "xml": - root = ET.Element("root") - for item in data: - record = ET.SubElement(root, "record") - ET.SubElement(record, "context").text = item["context"] - ET.SubElement(record, "question").text = item["question"] - ET.SubElement(record, "answers").text = str(item["answers"]) - tree = ET.ElementTree(root) - tree.write(os.path.join(directory, "data.xml")) - elif ext == "yaml": - with open(os.path.join(directory, "data.yaml"), "w") as f: - yaml.dump(data, f) - elif ext == "tsv": - df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") - elif ext == "xlsx": - df.to_excel(os.path.join(directory, "data.xlsx"), index=False) - elif ext == "db": - conn = 
sqlite3.connect(os.path.join(directory, "data.db")) - df["answers"] = df["answers"].apply(str) - df.to_sql("dataset_table", conn, if_exists="replace", index=False) - conn.close() - elif ext == "feather": - feather.write_feather(df, os.path.join(directory, "data.feather")) - - -# Fixtures for each file type -@pytest.fixture( - params=["db", "xml", "csv", "huggingface", "jsonl", "parquet", "json", "yaml", "tsv", "xlsx", "feather"] -) -def dataset_file(request, tmpdir): - ext = request.param - create_dataset_in_format(tmpdir + "/train", ext) - create_dataset_in_format(tmpdir + "/eval", ext) - return tmpdir, ext - - -@pytest.fixture -def question_answering_bolt(): - input_dir = tempfile.mkdtemp() - output_dir = tempfile.mkdtemp() - input = BatchInput(input_dir, "geniusrise-test-bucket", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test-bucket", "test-🤗-output") - state = InMemoryState() - klass = HuggingFaceQuestionAnsweringFineTuner( - input=input, - output=output, - state=state, - ) - klass.model_class = "BertForQuestionAnswering" - klass.model_name = "bert-base-uncased" - klass.tokenizer_class = "BertTokenizerFast" - klass.tokenizer_name = "bert-base-uncased" - return klass - - -def test_question_answering_bolt_init(question_answering_bolt): - question_answering_bolt.load_models() - - assert question_answering_bolt.model is not None - assert question_answering_bolt.tokenizer is not None - assert question_answering_bolt.input is not None - assert question_answering_bolt.output is not None - assert question_answering_bolt.state is not None - - -def test_load_dataset_all_formats(question_answering_bolt, dataset_file): - tmpdir, ext = dataset_file - dataset_path = os.path.join(tmpdir, "train") - - question_answering_bolt.load_models() - dataset = question_answering_bolt.load_dataset(dataset_path) - assert dataset is not None - assert len(dataset) == 10 - - -# Test for fine-tuning -def test_question_answering_bolt_fine_tune(question_answering_bolt, 
dataset_file): - tmpdir, ext = dataset_file - question_answering_bolt.input.input_folder = tmpdir - - question_answering_bolt.fine_tune( - model_name="bert-base-uncased", - tokenizer_name="bert-base-uncased", - num_train_epochs=1, - per_device_train_batch_size=1, - model_class="BertForQuestionAnswering", - tokenizer_class="BertTokenizerFast", - eval=True, - ) - - output_dir = question_answering_bolt.output.output_folder - assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin")) - assert os.path.isfile(os.path.join(output_dir + "/model", "config.json")) - assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin")) - - -# Test for computing metrics -def test_question_answering_bolt_compute_metrics(question_answering_bolt): - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([0, 1]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - metrics = question_answering_bolt.compute_metrics(eval_pred) - assert "accuracy" in metrics diff --git a/tests/test_sentiment_analysis.py b/tests/test_sentiment_analysis.py deleted file mode 100644 index 08e2940..0000000 --- a/tests/test_sentiment_analysis.py +++ /dev/null @@ -1,169 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -import json -import os -import sqlite3 -import tempfile -import xml.etree.ElementTree as ET - -import numpy as np -import pandas as pd -import pytest -import yaml # type: ignore -from datasets import Dataset -from geniusrise.core import BatchInput, BatchOutput, InMemoryState -from pyarrow import feather -from pyarrow import parquet as pq -from transformers import EvalPrediction - -from huggingface.sentiment_analysis import HuggingFaceSentimentAnalysisFineTuner - - -# Helper function to create synthetic data in different formats -def create_dataset_in_format(directory, ext): - os.makedirs(directory, exist_ok=True) - data = [{"text": f"text_{i}", "label": i % 2} for i in range(10)] - df = pd.DataFrame(data) - - if ext == "huggingface": - dataset = Dataset.from_pandas(df) - dataset.save_to_disk(directory) - elif ext == "csv": - df.to_csv(os.path.join(directory, "data.csv"), index=False) - elif ext == "jsonl": - with open(os.path.join(directory, "data.jsonl"), "w") as f: - for item in data: - f.write(json.dumps(item) + "\n") - elif ext == "parquet": - pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) - elif ext == "json": - with open(os.path.join(directory, "data.json"), "w") as f: - json.dump(data, f) - elif ext == "xml": - root = ET.Element("root") - for item in data: - record = ET.SubElement(root, "record") - ET.SubElement(record, "text").text = item["text"] - ET.SubElement(record, "label").text = str(item["label"]) - tree = ET.ElementTree(root) - tree.write(os.path.join(directory, "data.xml")) - elif ext == "yaml": - with open(os.path.join(directory, "data.yaml"), "w") as f: - yaml.dump(data, f) - elif ext == "tsv": - df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") - elif ext == "xlsx": - df.to_excel(os.path.join(directory, "data.xlsx"), index=False) - elif ext == "db": - conn = sqlite3.connect(os.path.join(directory, "data.db")) - df.to_sql("dataset_table", conn, if_exists="replace", index=False) - 
conn.close() - elif ext == "feather": - feather.write_feather(df, os.path.join(directory, "data.feather")) - - -# Fixtures for each file type -@pytest.fixture( - params=[ - "huggingface", - "csv", - "json", - "jsonl", - "parquet", - "xml", - "yaml", - "tsv", - "xlsx", - "db", - "feather", - ] -) -def dataset_file(request, tmpdir): - ext = request.param - create_dataset_in_format(tmpdir + "/train", ext) - create_dataset_in_format(tmpdir + "/eval", ext) - return tmpdir, ext - - -@pytest.fixture -def sentiment_bolt(): - input_dir = tempfile.mkdtemp() - output_dir = tempfile.mkdtemp() - input = BatchInput(input_dir, "geniusrise-test-bucket", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test-bucket", "test-🤗-output") - state = InMemoryState() - klass = HuggingFaceSentimentAnalysisFineTuner( - input=input, - output=output, - state=state, - ) - klass.model_name = "bert-base-uncased" - klass.tokenizer_name = "bert-base-uncased" - klass.model_class = "BertForSequenceClassification" - klass.tokenizer_class = "BertTokenizer" - return klass - - -def test_sentiment_bolt_init(sentiment_bolt): - sentiment_bolt.load_models() - assert sentiment_bolt.model is not None - assert sentiment_bolt.tokenizer is not None - assert sentiment_bolt.input is not None - assert sentiment_bolt.output is not None - assert sentiment_bolt.state is not None - - -def test_load_dataset_all_formats(sentiment_bolt, dataset_file): - tmpdir, ext = dataset_file - dataset_path = os.path.join(tmpdir, "train") - sentiment_bolt.load_models() - dataset = sentiment_bolt.load_dataset(dataset_path) - assert dataset is not None - assert len(dataset) == 10 - - -def test_sentiment_bolt_fine_tune(sentiment_bolt, dataset_file): - tmpdir, ext = dataset_file - sentiment_bolt.input.input_folder = tmpdir - - sentiment_bolt.fine_tune( - model_name="bert-base-uncased", - tokenizer_name="bert-base-uncased", - num_train_epochs=1, - per_device_train_batch_size=1, - model_class="BertForSequenceClassification", - 
tokenizer_class="BertTokenizer", - eval=True, - ) - - output_dir = sentiment_bolt.output.output_folder - assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin")) - assert os.path.isfile(os.path.join(output_dir + "/model", "config.json")) - assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin")) - - -def test_sentiment_bolt_compute_metrics(sentiment_bolt): - sentiment_bolt.load_models() - - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([0, 1]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - metrics = sentiment_bolt.compute_metrics(eval_pred) - assert "accuracy" in metrics - assert "precision" in metrics - assert "recall" in metrics - assert "f1" in metrics diff --git a/tests/test_summarization.py b/tests/test_summarization.py deleted file mode 100644 index 78c5a0d..0000000 --- a/tests/test_summarization.py +++ /dev/null @@ -1,151 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -import os -import tempfile -import pytest -import numpy as np -from datasets import Dataset -import pandas as pd -from huggingface.summarization import HuggingFaceSummarizationFineTuner -from geniusrise.core import BatchInput, BatchOutput, InMemoryState -from transformers import EvalPrediction -import json -import sqlite3 -import xml.etree.ElementTree as ET -import yaml # type: ignore -from pyarrow import feather, parquet as pq - - -# Helper function to create synthetic data in different formats -def create_dataset_in_format(directory, ext): - os.makedirs(directory, exist_ok=True) - data = [{"document": f"document_{i}", "summary": f"summary_{i}"} for i in range(10)] - df = pd.DataFrame(data) - - if ext == "huggingface": - dataset = Dataset.from_pandas(df) - dataset.save_to_disk(directory) - elif ext == "csv": - df.to_csv(os.path.join(directory, "data.csv"), index=False) - elif ext == "jsonl": - with open(os.path.join(directory, "data.jsonl"), "w") as f: - for item in data: - f.write(json.dumps(item) + "\n") - elif ext == "parquet": - pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) - elif ext == "json": - with open(os.path.join(directory, "data.json"), "w") as f: - json.dump(data, f) - elif ext == "xml": - root = ET.Element("root") - for item in data: - record = ET.SubElement(root, "record") - ET.SubElement(record, "document").text = item["document"] - ET.SubElement(record, "summary").text = item["summary"] - tree = ET.ElementTree(root) - tree.write(os.path.join(directory, "data.xml")) - elif ext == "yaml": - with open(os.path.join(directory, "data.yaml"), "w") as f: - yaml.dump(data, f) - elif ext == "tsv": - df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") - elif ext == "xlsx": - df.to_excel(os.path.join(directory, "data.xlsx"), index=False) - elif ext == "db": - conn = sqlite3.connect(os.path.join(directory, "data.db")) - df.to_sql("dataset_table", conn, if_exists="replace", index=False) - 
conn.close() - elif ext == "feather": - feather.write_feather(df, os.path.join(directory, "data.feather")) - - -# Fixtures for each file type -@pytest.fixture( - params=["huggingface", "csv", "json", "jsonl", "parquet", "xml", "yaml", "tsv", "xlsx", "db", "feather"] -) -def dataset_file(request, tmpdir): - ext = request.param - create_dataset_in_format(tmpdir + "/train", ext) - create_dataset_in_format(tmpdir + "/eval", ext) - return tmpdir, ext - - -@pytest.fixture -def summarization_bolt(): - input_dir = tempfile.mkdtemp() - output_dir = tempfile.mkdtemp() - input = BatchInput(input_dir, "geniusrise-test-bucket", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test-bucket", "test-🤗-output") - state = InMemoryState() - klass = HuggingFaceSummarizationFineTuner( - input=input, - output=output, - state=state, - ) - klass.model_class = "BartForConditionalGeneration" - klass.tokenizer_class = "BartTokenizerFast" - klass.model_name = "facebook/bart-base" - klass.tokenizer_name = "facebook/bart-base" - return klass - - -def test_summarization_bolt_init(summarization_bolt): - summarization_bolt.load_models() - assert summarization_bolt.model is not None - assert summarization_bolt.tokenizer is not None - assert summarization_bolt.input is not None - assert summarization_bolt.output is not None - assert summarization_bolt.state is not None - - -def test_load_dataset_all_formats(summarization_bolt, dataset_file): - tmpdir, ext = dataset_file - dataset_path = os.path.join(tmpdir, "train") - summarization_bolt.load_models() - dataset = summarization_bolt.load_dataset(dataset_path) - assert dataset is not None - assert len(dataset) == 10 - - -def test_summarization_bolt_fine_tune(summarization_bolt, dataset_file): - tmpdir, ext = dataset_file - summarization_bolt.input.input_folder = tmpdir - - summarization_bolt.fine_tune( - model_name="facebook/bart-base", - tokenizer_name="facebook/bart-base", - model_class="BartForConditionalGeneration", - 
tokenizer_class="BartTokenizerFast", - num_train_epochs=1, - per_device_train_batch_size=1, - ) - output_dir = summarization_bolt.output.output_folder - assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin")) - assert os.path.isfile(os.path.join(output_dir + "/model", "config.json")) - assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin")) - - -def test_summarization_bolt_compute_metrics(summarization_bolt): - summarization_bolt.load_models() - - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([[0, 1], [1, 0]]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - metrics = summarization_bolt.compute_metrics(eval_pred) - assert "rouge1" in metrics - assert "rouge2" in metrics - assert "rougeL" in metrics diff --git a/tests/test_translation.py b/tests/test_translation.py deleted file mode 100644 index a4d9f8a..0000000 --- a/tests/test_translation.py +++ /dev/null @@ -1,182 +0,0 @@ -# 🧠 Geniusrise -# Copyright (C) 2023 geniusrise.ai -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- -import json -import os -import sqlite3 -import tempfile -import xml.etree.ElementTree as ET - -import numpy as np -import pandas as pd -import pytest -import yaml # type: ignore -from datasets import Dataset -from geniusrise.core import BatchInput, BatchOutput, InMemoryState -from pyarrow import feather -from pyarrow import parquet as pq -from transformers import EvalPrediction - -from huggingface import HuggingFaceTranslationFineTuner - - -# Helper function to create synthetic data in different formats -def create_dataset_in_format(directory, ext): - os.makedirs(directory, exist_ok=True) - data = [ - { - "translation": { - "en": f"This is a synthetic text example {i}", - "fr": f"C'est un exemple de texte synthétique {i}", - } - } - for i in range(10) - ] - df = pd.DataFrame(data) - - if ext == "huggingface": - dataset = Dataset.from_pandas(df) - dataset.save_to_disk(directory) - elif ext == "csv": - df.to_csv(os.path.join(directory, "data.csv"), index=False) - elif ext == "jsonl": - with open(os.path.join(directory, "data.jsonl"), "w") as f: - for item in data: - f.write(json.dumps(item) + "\n") - elif ext == "parquet": - pq.write_table(feather.Table.from_pandas(df), os.path.join(directory, "data.parquet")) - elif ext == "json": - with open(os.path.join(directory, "data.json"), "w") as f: - json.dump(data, f) - elif ext == "xml": - root = ET.Element("root") - for item in data: - record = ET.SubElement(root, "record") - ET.SubElement(record, "en").text = str(item["translation"]["en"]) - ET.SubElement(record, "fr").text = str(item["translation"]["fr"]) - tree = ET.ElementTree(root) - tree.write(os.path.join(directory, "data.xml")) - elif ext == "yaml": - with open(os.path.join(directory, "data.yaml"), "w") as f: - yaml.dump(data, f) - elif ext == "tsv": - df.to_csv(os.path.join(directory, "data.tsv"), index=False, sep="\t") - elif ext == "xlsx": - df.to_excel(os.path.join(directory, "data.xlsx"), index=False) - elif ext == "db": - conn = 
sqlite3.connect(os.path.join(directory, "data.db")) - ens = [x["translation"]["en"] for x in data] - frs = [x["translation"]["fr"] for x in data] - pd.DataFrame({"en": ens, "fr": frs}).to_sql("dataset_table", conn, if_exists="replace", index=False) - conn.close() - elif ext == "feather": - feather.write_feather(df, os.path.join(directory, "data.feather")) - else: - raise ValueError(f"Unsupported file extension: {ext}") - - -# Fixtures for each file type -@pytest.fixture( - params=[ - "huggingface", - "csv", - "json", - "jsonl", - "parquet", - "xml", - "yaml", - "tsv", - "xlsx", - "db", - "feather", - ] -) -def dataset_file(request, tmpdir): - ext = request.param - create_dataset_in_format(tmpdir + "/train", ext) - create_dataset_in_format(tmpdir + "/eval", ext) - return tmpdir, ext - - -@pytest.fixture -def translation_bolt(): - input_dir = tempfile.mkdtemp() - output_dir = tempfile.mkdtemp() - - input = BatchInput(input_dir, "geniusrise-test-bucket", "test-🤗-input") - output = BatchOutput(output_dir, "geniusrise-test-bucket", "test-🤗-output") - state = InMemoryState() - - klass = HuggingFaceTranslationFineTuner( - input=input, - output=output, - state=state, - eval=True, - ) - klass.model_class = "MarianMTModel" - klass.model_name = "Helsinki-NLP/opus-mt-en-fr" - klass.tokenizer_class = "MarianTokenizer" - klass.tokenizer_name = "Helsinki-NLP/opus-mt-en-fr" - return klass - - -def test_translation_bolt_init(translation_bolt): - translation_bolt.load_models() - - assert translation_bolt.model is not None - assert translation_bolt.tokenizer is not None - assert translation_bolt.input is not None - assert translation_bolt.output is not None - assert translation_bolt.state is not None - - -def test_load_dataset_all_formats(translation_bolt, dataset_file): - tmpdir, ext = dataset_file - dataset_path = os.path.join(tmpdir, "train") - - translation_bolt.load_models() - dataset = translation_bolt.load_dataset(dataset_path) - assert dataset is not None - assert 
len(dataset) == 10 - - -def test_translation_bolt_fine_tune(translation_bolt, dataset_file): - tmpdir, ext = dataset_file - translation_bolt.input.input_folder = tmpdir - - translation_bolt.fine_tune( - model_name="Helsinki-NLP/opus-mt-en-fr", - tokenizer_name="Helsinki-NLP/opus-mt-en-fr", - model_class="MarianMTModel", - tokenizer_class="MarianTokenizer", - num_train_epochs=1, - per_device_train_batch_size=1, - ) - output_dir = translation_bolt.output.output_folder - assert os.path.isfile(os.path.join(output_dir + "/model", "pytorch_model.bin")) - assert os.path.isfile(os.path.join(output_dir + "/model", "config.json")) - assert os.path.isfile(os.path.join(output_dir + "/model", "training_args.bin")) - - -def test_translation_bolt_compute_metrics(translation_bolt): - logits = np.array([[0.6, 0.4], [0.4, 0.6]]) - labels = np.array([0, 1]) - eval_pred = EvalPrediction(predictions=logits, label_ids=labels) - metrics = translation_bolt.compute_metrics(eval_pred) - assert "accuracy" in metrics - assert "precision" in metrics - assert "recall" in metrics - assert "f1" in metrics