Skip to content

Commit

Permalink
Windows support (partially) AutoRAG (#766)
Browse files Browse the repository at this point in the history
* change tempfile.NamedTemporaryFile to windows compatible

* change tempfile.TemporaryDirectory to windows compatible

* hotfix jina reranker error about event loop - You have to pass the event loop to the aiohttp.ClientSession()

* change file name to resolve the error about duplicate file name

* add description about pyopenssl, nltk, and windows + parsing installs

---------

Co-authored-by: jeffrey <vkefhdl1@gmail.com>
  • Loading branch information
vkehfdl1 and jeffrey authored Sep 30, 2024
1 parent 14ead70 commit db33d81
Show file tree
Hide file tree
Showing 35 changed files with 122 additions and 59 deletions.
2 changes: 1 addition & 1 deletion autorag/data/parse/table_hybrid_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def table_hybrid_parse(
:return: tuple of lists containing the parsed texts, path and pages.
"""
# make save folder directory
with tempfile.TemporaryDirectory() as save_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as save_dir:
text_dir = os.path.join(save_dir, "text")
table_dir = os.path.join(save_dir, "table")

Expand Down
2 changes: 1 addition & 1 deletion autorag/nodes/passagereranker/jina.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def __init__(self, project_dir: str, api_key: str = None, *args, **kwargs):
"API key is not provided."
"You can set it as an argument or as an environment variable 'JINAAI_API_KEY'"
)
self.session = aiohttp.ClientSession()
self.session = aiohttp.ClientSession(loop=get_event_loop())
self.session.headers.update(
{"Authorization": f"Bearer {api_key}", "Accept-Encoding": "identity"}
)
Expand Down
10 changes: 7 additions & 3 deletions autorag/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ def validate(self, yaml_path: str, qa_cnt: int = 5, random_state: int = 42):

# start Evaluate at temp project directory
with (
tempfile.NamedTemporaryFile(suffix=".parquet") as qa_path,
tempfile.NamedTemporaryFile(suffix=".parquet") as corpus_path,
tempfile.TemporaryDirectory() as temp_project_dir,
tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as qa_path,
tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as corpus_path,
tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_project_dir,
):
sample_qa_df.to_parquet(qa_path.name, index=False)
sample_corpus_df.to_parquet(corpus_path.name, index=False)
Expand All @@ -78,5 +78,9 @@ def validate(self, yaml_path: str, qa_cnt: int = 5, random_state: int = 42):
project_dir=temp_project_dir,
)
evaluator.start_trial(yaml_path)
qa_path.close()
corpus_path.close()
os.unlink(qa_path.name)
os.unlink(corpus_path.name)

logger.info("Validation complete.")
32 changes: 32 additions & 0 deletions docs/source/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,43 @@ To install AutoRAG, you can use pip:
pip install AutoRAG
```

In addition, we recommend installing the pyOpenSSL and nltk libraries to use the full feature set.

```bash
pip install --upgrade pyOpenSSL
pip install nltk
python3 -c "import nltk; nltk.download('punkt_tab')"
python3 -c "import nltk; nltk.download('averaged_perceptron_tagger_eng')"
```

```{admonition} Trouble with installation?
Do you have any trouble with installation?
First, you can check out the [troubleshooting](troubleshooting.md) page.
```

### Note for Windows Users
AutoRAG is not fully supported on Windows yet. There are several constraints for Windows users.

1. The TART, UPR, and MonoT5 passage rerankers do not support Windows.
2. Parsing might not work properly in the Windows environment.
3. The FlagEmbedding passage reranker cannot be used with `batch` set to 1. The default batch is 64.

Due to the constraints, we recommend using Docker images for running AutoRAG on Windows.

Also, if you are a Windows user, MAKE SURE TO UPGRADE to v0.3.1.

### Installation for Parsing 🌲

For parsing you need to install some local packages like [libmagic](https://man7.org/linux/man-pages/man3/libmagic.3.html),
[tesseract](https://github.com/tesseract-ocr/tesseract), and [poppler](https://poppler.freedesktop.org/).
The installation method depends upon your OS.

After installing these packages, you can install AutoRAG with parsing support as shown below.

```bash
pip install AutoRAG[parse]
```

### Installation for Korean 🇰🇷

You can install optional dependencies for the Korean language.
Expand Down
2 changes: 1 addition & 1 deletion tests/autorag/data/chunk/test_chunk_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


def test_run_chunker():
with tempfile.TemporaryDirectory() as temp_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
modules = [llama_index_chunk]
module_params = [{"chunk_method": "token"}]
summary_df = run_chunker(modules, module_params, parsed_result, temp_dir)
Expand Down
7 changes: 5 additions & 2 deletions tests/autorag/data/legacy/corpus/test_langchain.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
import os
import tempfile

import pytest
from langchain_core.documents import Document

from autorag.data.legacy.corpus import langchain_documents_to_parquet
from tests.autorag.data.legacy.corpus.test_base import validate_corpus
from tests.autorag.data.legacy.corpus.test_base_corpus_legacy import validate_corpus


@pytest.fixture
def parquet_filepath():
with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as temp_file:
yield temp_file.name
temp_file.close()
os.unlink(temp_file.name)


def test_langchain_documents_to_parquet(parquet_filepath):
Expand Down
7 changes: 5 additions & 2 deletions tests/autorag/data/legacy/corpus/test_llama_index_corpus.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import tempfile

import pytest
Expand All @@ -8,13 +9,15 @@
llama_documents_to_parquet,
llama_text_node_to_parquet,
)
from tests.autorag.data.legacy.corpus.test_base import validate_corpus
from tests.autorag.data.legacy.corpus.test_base_corpus_legacy import validate_corpus


@pytest.fixture
def parquet_filepath():
with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as temp_file:
yield temp_file.name
temp_file.close()
os.unlink(temp_file.name)


def test_llama_documents_to_parquet(parquet_filepath):
Expand Down
6 changes: 4 additions & 2 deletions tests/autorag/data/legacy/qacreation/test_base_qacreation.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,15 @@

@pytest.fixture
def qa_parquet_filepath():
with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as f:
yield f.name
f.close()
os.unlink(f.name)


@pytest.fixture
def chroma_persistent_client():
with tempfile.TemporaryDirectory() as temp_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
client = chromadb.PersistentClient(temp_dir)
yield client

Expand Down
4 changes: 2 additions & 2 deletions tests/autorag/data/legacy/qacreation/test_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@

@pytest.fixture
def load_dir():
with tempfile.TemporaryDirectory() as temp_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
yield temp_dir


@pytest.fixture
def output_filedir():
with tempfile.TemporaryDirectory() as temp_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
yield temp_dir


Expand Down
2 changes: 1 addition & 1 deletion tests/autorag/data/parse/test_parse_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def test_run_parser():
with tempfile.TemporaryDirectory() as temp_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
modules = [langchain_parse, langchain_parse]
module_params = [{"parse_method": "pdfminer"}, {"parse_method": "pdfplumber"}]
data_path_glob = eng_text_glob
Expand Down
2 changes: 1 addition & 1 deletion tests/autorag/data/parse/test_table_hybrid_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def test_table_hybrid_parse_only_table_node():


def test_save_page_by_table():
with tempfile.TemporaryDirectory() as save_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as save_dir:
text_dir = os.path.join(save_dir, "text")
table_dir = os.path.join(save_dir, "table")
os.makedirs(text_dir, exist_ok=True)
Expand Down
11 changes: 9 additions & 2 deletions tests/autorag/data/qa/test_schema.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import tempfile

import pandas as pd
Expand Down Expand Up @@ -172,8 +173,10 @@ def test_update_corpus():
pd.testing.assert_frame_equal(
new_qa.data[["qid", "retrieval_gt"]], expected_dataframe
)
with tempfile.NamedTemporaryFile(suffix=".parquet") as qa_path:
with tempfile.NamedTemporaryFile(suffix=".parquet") as corpus_path:
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as qa_path:
with tempfile.NamedTemporaryFile(
suffix=".parquet", delete=False
) as corpus_path:
new_qa.to_parquet(qa_path.name, corpus_path.name)
loaded_qa = pd.read_parquet(qa_path.name, engine="pyarrow")
assert set(loaded_qa.columns) == {
Expand All @@ -184,3 +187,7 @@ def test_update_corpus():
}
loaded_corpus = pd.read_parquet(corpus_path.name, engine="pyarrow")
assert set(loaded_corpus.columns) == {"doc_id", "contents", "metadata"}
corpus_path.close()
os.unlink(corpus_path.name)
qa_path.close()
os.unlink(qa_path.name)
2 changes: 1 addition & 1 deletion tests/autorag/nodes/generator/test_run_generator_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@

@pytest.fixture
def node_line_dir():
with tempfile.TemporaryDirectory() as temp_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
os.makedirs(os.path.join(temp_dir, "data"))
qa_df.to_parquet(os.path.join(temp_dir, "data", "qa.parquet"), index=False)
trial_dir = os.path.join(temp_dir, "test_trial")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

@pytest.fixture
def node_line_dir():
with tempfile.TemporaryDirectory() as project_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
data_dir = os.path.join(project_dir, "data")
os.makedirs(data_dir)
qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@

@pytest.fixture
def node_line_dir():
with tempfile.TemporaryDirectory() as project_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
data_dir = os.path.join(project_dir, "data")
os.makedirs(data_dir)
qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@

@pytest.fixture
def project_dir_with_corpus():
with tempfile.TemporaryDirectory() as temp_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
data_dir = os.path.join(temp_dir, "data")
os.makedirs(data_dir, exist_ok=True)
qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

@pytest.fixture
def node_line_dir():
with tempfile.TemporaryDirectory() as project_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
data_dir = os.path.join(project_dir, "data")
os.makedirs(data_dir)
qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)
Expand Down
4 changes: 0 additions & 4 deletions tests/autorag/nodes/passagereranker/test_jina_reranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
previous_result,
base_reranker_node_test,
)
from tests.delete_tests import is_github_action


@pytest.mark.asyncio()
Expand Down Expand Up @@ -73,7 +72,6 @@ def jina_reranker_instance():
return JinaReranker(project_dir, "mock_api_key")


@pytest.mark.skipif(is_github_action(), reason="Skipping this test on GitHub Actions")
@patch.object(
autorag.nodes.passagereranker.jina, "jina_reranker_pure", mock_jina_reranker_pure
)
Expand All @@ -85,7 +83,6 @@ def test_jina_reranker(jina_reranker_instance):
base_reranker_test(contents_result, id_result, score_result, top_k)


@pytest.mark.skipif(is_github_action(), reason="Skipping this test on GitHub Actions")
@patch.object(
autorag.nodes.passagereranker.jina, "jina_reranker_pure", mock_jina_reranker_pure
)
Expand All @@ -102,7 +99,6 @@ def test_jina_reranker_batch_one(jina_reranker_instance):
base_reranker_test(contents_result, id_result, score_result, top_k)


@pytest.mark.skipif(is_github_action(), reason="Skipping this test on GitHub Actions")
@patch.object(
autorag.nodes.passagereranker.jina, "jina_reranker_pure", mock_jina_reranker_pure
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def base_reranker_node_test(result_df, top_k, use_ko=False, descending=True):

@pytest.fixture
def project_dir_with_corpus():
with tempfile.TemporaryDirectory() as temp_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
data_dir = os.path.join(temp_dir, "data")
os.makedirs(data_dir, exist_ok=True)
qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@

@pytest.fixture
def node_line_dir():
with tempfile.TemporaryDirectory() as project_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
data_dir = os.path.join(project_dir, "data")
os.makedirs(data_dir)
qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)
Expand Down
2 changes: 1 addition & 1 deletion tests/autorag/nodes/promptmaker/test_prompt_maker_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def test_evaluate_one_prompt_maker_node():

@pytest.fixture
def node_line_dir():
with tempfile.TemporaryDirectory() as project_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
data_dir = os.path.join(project_dir, "data")
os.makedirs(data_dir)
qa_data = pd.DataFrame(
Expand Down
2 changes: 1 addition & 1 deletion tests/autorag/nodes/promptmaker/test_window_replacement.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

@pytest.fixture
def pseudo_project_dir():
with tempfile.TemporaryDirectory() as project_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
data_dir = os.path.join(project_dir, "data")
os.makedirs(data_dir)
corpus_df.to_parquet(os.path.join(data_dir, "corpus.parquet"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

@pytest.fixture
def ingested_vectordb_node():
with tempfile.TemporaryDirectory() as node_chroma_path:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as node_chroma_path:
node_db = chromadb.PersistentClient(path=node_chroma_path)
node_collection = node_db.create_collection(
name="openai", metadata={"hnsw:space": "cosine"}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

@pytest.fixture
def node_line_dir():
with tempfile.TemporaryDirectory() as project_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
sample_project_dir = os.path.join(resources_dir, "sample_project")
# copy & paste all folders and files in sample_project folder
shutil.copytree(sample_project_dir, project_dir, dirs_exist_ok=True)
Expand Down
6 changes: 4 additions & 2 deletions tests/autorag/nodes/retrieval/test_bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,16 @@

@pytest.fixture
def ingested_bm25_path():
with tempfile.NamedTemporaryFile(suffix=".pkl", mode="w+b") as path:
with tempfile.NamedTemporaryFile(suffix=".pkl", mode="w+b", delete=False) as path:
bm25_ingest(path.name, corpus_df)
yield path.name
path.close()
os.unlink(path.name)


@pytest.fixture
def bm25_instance(ingested_bm25_path):
with tempfile.TemporaryDirectory() as temp_project_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_project_dir:
os.makedirs(os.path.join(temp_project_dir, "resources"))
os.makedirs(os.path.join(temp_project_dir, "data"))
bm25_path = os.path.join(
Expand Down
2 changes: 1 addition & 1 deletion tests/autorag/nodes/retrieval/test_hybrid_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@

@pytest.fixture
def pseudo_project_dir():
with tempfile.TemporaryDirectory() as project_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
corpus_df = pd.DataFrame(
{
"doc_id": [
Expand Down
4 changes: 2 additions & 2 deletions tests/autorag/nodes/retrieval/test_run_retrieval_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

@pytest.fixture
def node_line_dir():
with tempfile.TemporaryDirectory() as test_project_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as test_project_dir:
sample_project_dir = os.path.join(resources_dir, "sample_project")
# copy & paste all folders and files in sample_project folder
shutil.copytree(sample_project_dir, test_project_dir, dirs_exist_ok=True)
Expand Down Expand Up @@ -209,7 +209,7 @@ def pseudo_node_dir():
}
)

with tempfile.TemporaryDirectory() as node_dir:
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as node_dir:
summary_df.to_csv(os.path.join(node_dir, "summary.csv"))
bm25_df.to_parquet(os.path.join(node_dir, "0.parquet"))
vector_openai_df.to_parquet(os.path.join(node_dir, "1.parquet"))
Expand Down
Loading

0 comments on commit db33d81

Please sign in to comment.