Windows support (partially) AutoRAG (#766)

* change tempfile.NamedTemporaryFile to windows compatible * change tempfile.TemporaryDirectory to windows compatible * hotfix jina reranker error about event loop - You have to pass the event loop to the aiohttp.ClientSession() * change file name to resolve the error about duplicate file name * add description about pyopenssl, nltk, and windows + parsing installs --------- Co-authored-by: jeffrey <vkefhdl1@gmail.com>
Marker-Inc-Korea · Sep 30, 2024 · db33d81 · db33d81
1 parent 14ead70
commit db33d81
Show file tree

Hide file tree

Showing 35 changed files with 122 additions and 59 deletions.
diff --git a/autorag/data/parse/table_hybrid_parse.py b/autorag/data/parse/table_hybrid_parse.py
@@ -31,7 +31,7 @@ def table_hybrid_parse(
 	:return: tuple of lists containing the parsed texts, path and pages.
 	"""
 	# make save folder directory
-	with tempfile.TemporaryDirectory() as save_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as save_dir:
 		text_dir = os.path.join(save_dir, "text")
 		table_dir = os.path.join(save_dir, "table")
 

diff --git a/autorag/nodes/passagereranker/jina.py b/autorag/nodes/passagereranker/jina.py
@@ -30,7 +30,7 @@ def __init__(self, project_dir: str, api_key: str = None, *args, **kwargs):
 					"API key is not provided."
 					"You can set it as an argument or as an environment variable 'JINAAI_API_KEY'"
 				)
-		self.session = aiohttp.ClientSession()
+		self.session = aiohttp.ClientSession(loop=get_event_loop())
 		self.session.headers.update(
 			{"Authorization": f"Bearer {api_key}", "Accept-Encoding": "identity"}
 		)

diff --git a/autorag/validator.py b/autorag/validator.py
@@ -65,9 +65,9 @@ def validate(self, yaml_path: str, qa_cnt: int = 5, random_state: int = 42):
 
 		# start Evaluate at temp project directory
 		with (
-			tempfile.NamedTemporaryFile(suffix=".parquet") as qa_path,
-			tempfile.NamedTemporaryFile(suffix=".parquet") as corpus_path,
-			tempfile.TemporaryDirectory() as temp_project_dir,
+			tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as qa_path,
+			tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as corpus_path,
+			tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_project_dir,
 		):
 			sample_qa_df.to_parquet(qa_path.name, index=False)
 			sample_corpus_df.to_parquet(corpus_path.name, index=False)
@@ -78,5 +78,9 @@ def validate(self, yaml_path: str, qa_cnt: int = 5, random_state: int = 42):
 				project_dir=temp_project_dir,
 			)
 			evaluator.start_trial(yaml_path)
+			qa_path.close()
+			corpus_path.close()
+			os.unlink(qa_path.name)
+			os.unlink(corpus_path.name)
 
 		logger.info("Validation complete.")
diff --git a/docs/source/install.md b/docs/source/install.md
@@ -13,11 +13,43 @@ To install AutoRAG, you can use pip:
 pip install AutoRAG
 ```
 
+In addition, we recommend installing the PyOpenSSL and nltk libraries to enable all features.
+
+```bash
+pip install --upgrade pyOpenSSL
+pip install nltk
+python3 -c "import nltk; nltk.download('punkt_tab')"
+python3 -c "import nltk; nltk.download('averaged_perceptron_tagger_eng')"
+```
+
 ```{admonition} Trouble with installation?
 Do you have any trouble with installation?
 First, you can check out the [troubleshooting](troubleshooting.md) page.
 ```
 
+### Note for Windows Users
+AutoRAG is not fully supported on Windows yet. There are several constraints for Windows users.
+
+1. The TART, UPR, and MonoT5 passage rerankers do not support Windows.
+2. Parsing might not work properly in the Windows environment.
+3. The FlagEmbedding passage reranker cannot be used with the `batch` setting set to 1. The default batch is 64.
+
+Due to the constraints, we recommend using Docker images for running AutoRAG on Windows.
+
+Also, Windows users must make sure to upgrade AutoRAG to v0.3.1.
+
+### Installation for Parsing 🌲
+
+For parsing you need to install some local packages like [libmagic](https://man7.org/linux/man-pages/man3/libmagic.3.html),
+[tesseract](https://github.com/tesseract-ocr/tesseract), and [poppler](https://poppler.freedesktop.org/).
+The installation method depends upon your OS.
+
+After installing these packages, you can install AutoRAG with parsing support as shown below.
+
+```bash
+pip install AutoRAG[parse]
+```
+
 ### Installation for Korean 🇰🇷
 
 You can install optional dependencies for the Korean language.

diff --git a/tests/autorag/data/chunk/test_chunk_run.py b/tests/autorag/data/chunk/test_chunk_run.py
@@ -7,7 +7,7 @@
 
 
 def test_run_chunker():
-	with tempfile.TemporaryDirectory() as temp_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
 		modules = [llama_index_chunk]
 		module_params = [{"chunk_method": "token"}]
 		summary_df = run_chunker(modules, module_params, parsed_result, temp_dir)

diff --git a/...s/autorag/data/legacy/corpus/test_base.py → .../legacy/corpus/test_base_corpus_legacy.py b/...s/autorag/data/legacy/corpus/test_base.py → .../legacy/corpus/test_base_corpus_legacy.py
diff --git a/tests/autorag/data/legacy/corpus/test_langchain.py b/tests/autorag/data/legacy/corpus/test_langchain.py
@@ -1,16 +1,19 @@
+import os
 import tempfile
 
 import pytest
 from langchain_core.documents import Document
 
 from autorag.data.legacy.corpus import langchain_documents_to_parquet
-from tests.autorag.data.legacy.corpus.test_base import validate_corpus
+from tests.autorag.data.legacy.corpus.test_base_corpus_legacy import validate_corpus
 
 
 @pytest.fixture
 def parquet_filepath():
-	with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
+	with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as temp_file:
 		yield temp_file.name
+		temp_file.close()
+		os.unlink(temp_file.name)
 
 
 def test_langchain_documents_to_parquet(parquet_filepath):

diff --git a/tests/autorag/data/legacy/corpus/test_llama_index_corpus.py b/tests/autorag/data/legacy/corpus/test_llama_index_corpus.py
@@ -1,3 +1,4 @@
+import os
 import tempfile
 
 import pytest
@@ -8,13 +9,15 @@
 	llama_documents_to_parquet,
 	llama_text_node_to_parquet,
 )
-from tests.autorag.data.legacy.corpus.test_base import validate_corpus
+from tests.autorag.data.legacy.corpus.test_base_corpus_legacy import validate_corpus
 
 
 @pytest.fixture
 def parquet_filepath():
-	with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
+	with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as temp_file:
 		yield temp_file.name
+		temp_file.close()
+		os.unlink(temp_file.name)
 
 
 def test_llama_documents_to_parquet(parquet_filepath):

diff --git a/tests/autorag/data/legacy/qacreation/test_base_qacreation.py b/tests/autorag/data/legacy/qacreation/test_base_qacreation.py
@@ -28,13 +28,15 @@
 
 @pytest.fixture
 def qa_parquet_filepath():
-	with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
+	with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as f:
 		yield f.name
+		f.close()
+		os.unlink(f.name)
 
 
 @pytest.fixture
 def chroma_persistent_client():
-	with tempfile.TemporaryDirectory() as temp_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
 		client = chromadb.PersistentClient(temp_dir)
 		yield client
 

diff --git a/tests/autorag/data/legacy/qacreation/test_simple.py b/tests/autorag/data/legacy/qacreation/test_simple.py
@@ -27,13 +27,13 @@
 
 @pytest.fixture
 def load_dir():
-	with tempfile.TemporaryDirectory() as temp_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
 		yield temp_dir
 
 
 @pytest.fixture
 def output_filedir():
-	with tempfile.TemporaryDirectory() as temp_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
 		yield temp_dir
 
 

diff --git a/tests/autorag/data/parse/test_parse_run.py b/tests/autorag/data/parse/test_parse_run.py
@@ -8,7 +8,7 @@
 
 
 def test_run_parser():
-	with tempfile.TemporaryDirectory() as temp_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
 		modules = [langchain_parse, langchain_parse]
 		module_params = [{"parse_method": "pdfminer"}, {"parse_method": "pdfplumber"}]
 		data_path_glob = eng_text_glob

diff --git a/tests/autorag/data/parse/test_table_hybrid_parse.py b/tests/autorag/data/parse/test_table_hybrid_parse.py
@@ -80,7 +80,7 @@ def test_table_hybrid_parse_only_table_node():
 
 
 def test_save_page_by_table():
-	with tempfile.TemporaryDirectory() as save_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as save_dir:
 		text_dir = os.path.join(save_dir, "text")
 		table_dir = os.path.join(save_dir, "table")
 		os.makedirs(text_dir, exist_ok=True)

diff --git a/tests/autorag/data/qa/test_schema.py b/tests/autorag/data/qa/test_schema.py
@@ -1,3 +1,4 @@
+import os
 import tempfile
 
 import pandas as pd
@@ -172,8 +173,10 @@ def test_update_corpus():
 	pd.testing.assert_frame_equal(
 		new_qa.data[["qid", "retrieval_gt"]], expected_dataframe
 	)
-	with tempfile.NamedTemporaryFile(suffix=".parquet") as qa_path:
-		with tempfile.NamedTemporaryFile(suffix=".parquet") as corpus_path:
+	with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as qa_path:
+		with tempfile.NamedTemporaryFile(
+			suffix=".parquet", delete=False
+		) as corpus_path:
 			new_qa.to_parquet(qa_path.name, corpus_path.name)
 			loaded_qa = pd.read_parquet(qa_path.name, engine="pyarrow")
 			assert set(loaded_qa.columns) == {
@@ -184,3 +187,7 @@ def test_update_corpus():
 			}
 			loaded_corpus = pd.read_parquet(corpus_path.name, engine="pyarrow")
 			assert set(loaded_corpus.columns) == {"doc_id", "contents", "metadata"}
+			corpus_path.close()
+			os.unlink(corpus_path.name)
+		qa_path.close()
+		os.unlink(qa_path.name)
diff --git a/tests/autorag/nodes/generator/test_run_generator_node.py b/tests/autorag/nodes/generator/test_run_generator_node.py
@@ -41,7 +41,7 @@
 
 @pytest.fixture
 def node_line_dir():
-	with tempfile.TemporaryDirectory() as temp_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
 		os.makedirs(os.path.join(temp_dir, "data"))
 		qa_df.to_parquet(os.path.join(temp_dir, "data", "qa.parquet"), index=False)
 		trial_dir = os.path.join(temp_dir, "test_trial")

diff --git a/tests/autorag/nodes/passageaugmenter/test_run_passage_augmenter.py b/tests/autorag/nodes/passageaugmenter/test_run_passage_augmenter.py
@@ -19,7 +19,7 @@
 
 @pytest.fixture
 def node_line_dir():
-	with tempfile.TemporaryDirectory() as project_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
 		data_dir = os.path.join(project_dir, "data")
 		os.makedirs(data_dir)
 		qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)

diff --git a/tests/autorag/nodes/passagecompressor/test_run_passage_compressor_node.py b/tests/autorag/nodes/passagecompressor/test_run_passage_compressor_node.py
@@ -80,7 +80,7 @@
 
 @pytest.fixture
 def node_line_dir():
-	with tempfile.TemporaryDirectory() as project_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
 		data_dir = os.path.join(project_dir, "data")
 		os.makedirs(data_dir)
 		qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)

diff --git a/tests/autorag/nodes/passagefilter/test_passage_filter_base.py b/tests/autorag/nodes/passagefilter/test_passage_filter_base.py
@@ -65,7 +65,7 @@
 
 @pytest.fixture
 def project_dir_with_corpus():
-	with tempfile.TemporaryDirectory() as temp_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
 		data_dir = os.path.join(temp_dir, "data")
 		os.makedirs(data_dir, exist_ok=True)
 		qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)

diff --git a/tests/autorag/nodes/passagefilter/test_passage_filter_run.py b/tests/autorag/nodes/passagefilter/test_passage_filter_run.py
@@ -19,7 +19,7 @@
 
 @pytest.fixture
 def node_line_dir():
-	with tempfile.TemporaryDirectory() as project_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
 		data_dir = os.path.join(project_dir, "data")
 		os.makedirs(data_dir)
 		qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)

diff --git a/tests/autorag/nodes/passagereranker/test_jina_reranker.py b/tests/autorag/nodes/passagereranker/test_jina_reranker.py
@@ -16,7 +16,6 @@
 	previous_result,
 	base_reranker_node_test,
 )
-from tests.delete_tests import is_github_action
 
 
 @pytest.mark.asyncio()
@@ -73,7 +72,6 @@ def jina_reranker_instance():
 	return JinaReranker(project_dir, "mock_api_key")
 
 
-@pytest.mark.skipif(is_github_action(), reason="Skipping this test on GitHub Actions")
 @patch.object(
 	autorag.nodes.passagereranker.jina, "jina_reranker_pure", mock_jina_reranker_pure
 )
@@ -85,7 +83,6 @@ def test_jina_reranker(jina_reranker_instance):
 	base_reranker_test(contents_result, id_result, score_result, top_k)
 
 
-@pytest.mark.skipif(is_github_action(), reason="Skipping this test on GitHub Actions")
 @patch.object(
 	autorag.nodes.passagereranker.jina, "jina_reranker_pure", mock_jina_reranker_pure
 )
@@ -102,7 +99,6 @@ def test_jina_reranker_batch_one(jina_reranker_instance):
 	base_reranker_test(contents_result, id_result, score_result, top_k)
 
 
-@pytest.mark.skipif(is_github_action(), reason="Skipping this test on GitHub Actions")
 @patch.object(
 	autorag.nodes.passagereranker.jina, "jina_reranker_pure", mock_jina_reranker_pure
 )

diff --git a/tests/autorag/nodes/passagereranker/test_passage_reranker_base.py b/tests/autorag/nodes/passagereranker/test_passage_reranker_base.py
@@ -98,7 +98,7 @@ def base_reranker_node_test(result_df, top_k, use_ko=False, descending=True):
 
 @pytest.fixture
 def project_dir_with_corpus():
-	with tempfile.TemporaryDirectory() as temp_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
 		data_dir = os.path.join(temp_dir, "data")
 		os.makedirs(data_dir, exist_ok=True)
 		qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)

diff --git a/tests/autorag/nodes/passagereranker/test_passage_reranker_run.py b/tests/autorag/nodes/passagereranker/test_passage_reranker_run.py
@@ -81,7 +81,7 @@
 
 @pytest.fixture
 def node_line_dir():
-	with tempfile.TemporaryDirectory() as project_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
 		data_dir = os.path.join(project_dir, "data")
 		os.makedirs(data_dir)
 		qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)

diff --git a/tests/autorag/nodes/promptmaker/test_prompt_maker_run.py b/tests/autorag/nodes/promptmaker/test_prompt_maker_run.py
@@ -88,7 +88,7 @@ def test_evaluate_one_prompt_maker_node():
 
 @pytest.fixture
 def node_line_dir():
-	with tempfile.TemporaryDirectory() as project_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
 		data_dir = os.path.join(project_dir, "data")
 		os.makedirs(data_dir)
 		qa_data = pd.DataFrame(

diff --git a/tests/autorag/nodes/promptmaker/test_window_replacement.py b/tests/autorag/nodes/promptmaker/test_window_replacement.py
@@ -16,7 +16,7 @@
 
 @pytest.fixture
 def pseudo_project_dir():
-	with tempfile.TemporaryDirectory() as project_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
 		data_dir = os.path.join(project_dir, "data")
 		os.makedirs(data_dir)
 		corpus_df.to_parquet(os.path.join(data_dir, "corpus.parquet"))

diff --git a/tests/autorag/nodes/queryexpansion/test_query_expansion_base.py b/tests/autorag/nodes/queryexpansion/test_query_expansion_base.py
@@ -27,7 +27,7 @@
 
 @pytest.fixture
 def ingested_vectordb_node():
-	with tempfile.TemporaryDirectory() as node_chroma_path:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as node_chroma_path:
 		node_db = chromadb.PersistentClient(path=node_chroma_path)
 		node_collection = node_db.create_collection(
 			name="openai", metadata={"hnsw:space": "cosine"}

diff --git a/tests/autorag/nodes/queryexpansion/test_query_expansion_run.py b/tests/autorag/nodes/queryexpansion/test_query_expansion_run.py
@@ -38,7 +38,7 @@
 
 @pytest.fixture
 def node_line_dir():
-	with tempfile.TemporaryDirectory() as project_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
 		sample_project_dir = os.path.join(resources_dir, "sample_project")
 		# copy & paste all folders and files in sample_project folder
 		shutil.copytree(sample_project_dir, project_dir, dirs_exist_ok=True)

diff --git a/tests/autorag/nodes/retrieval/test_bm25.py b/tests/autorag/nodes/retrieval/test_bm25.py
@@ -36,14 +36,16 @@
 
 @pytest.fixture
 def ingested_bm25_path():
-	with tempfile.NamedTemporaryFile(suffix=".pkl", mode="w+b") as path:
+	with tempfile.NamedTemporaryFile(suffix=".pkl", mode="w+b", delete=False) as path:
 		bm25_ingest(path.name, corpus_df)
 		yield path.name
+		path.close()
+		os.unlink(path.name)
 
 
 @pytest.fixture
 def bm25_instance(ingested_bm25_path):
-	with tempfile.TemporaryDirectory() as temp_project_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_project_dir:
 		os.makedirs(os.path.join(temp_project_dir, "resources"))
 		os.makedirs(os.path.join(temp_project_dir, "data"))
 		bm25_path = os.path.join(

diff --git a/tests/autorag/nodes/retrieval/test_hybrid_base.py b/tests/autorag/nodes/retrieval/test_hybrid_base.py
@@ -83,7 +83,7 @@
 
 @pytest.fixture
 def pseudo_project_dir():
-	with tempfile.TemporaryDirectory() as project_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as project_dir:
 		corpus_df = pd.DataFrame(
 			{
 				"doc_id": [

diff --git a/tests/autorag/nodes/retrieval/test_run_retrieval_node.py b/tests/autorag/nodes/retrieval/test_run_retrieval_node.py
@@ -24,7 +24,7 @@
 
 @pytest.fixture
 def node_line_dir():
-	with tempfile.TemporaryDirectory() as test_project_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as test_project_dir:
 		sample_project_dir = os.path.join(resources_dir, "sample_project")
 		# copy & paste all folders and files in sample_project folder
 		shutil.copytree(sample_project_dir, test_project_dir, dirs_exist_ok=True)
@@ -209,7 +209,7 @@ def pseudo_node_dir():
 		}
 	)
 
-	with tempfile.TemporaryDirectory() as node_dir:
+	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as node_dir:
 		summary_df.to_csv(os.path.join(node_dir, "summary.csv"))
 		bm25_df.to_parquet(os.path.join(node_dir, "0.parquet"))
 		vector_openai_df.to_parquet(os.path.join(node_dir, "1.parquet"))