diff --git a/autorag/utils/preprocess.py b/autorag/utils/preprocess.py index d4fa3b838..1dd7f2f26 100644 --- a/autorag/utils/preprocess.py +++ b/autorag/utils/preprocess.py @@ -3,6 +3,8 @@ import numpy as np import pandas as pd +from autorag.utils.util import normalize_unicode + def validate_qa_dataset(df: pd.DataFrame): columns = ['qid', 'query', 'retrieval_gt', 'generation_gt'] @@ -49,6 +51,8 @@ def cast_generation_gt(gt): "query must be string type." df['retrieval_gt'] = df['retrieval_gt'].apply(cast_retrieval_gt) df['generation_gt'] = df['generation_gt'].apply(cast_generation_gt) + df['query'] = df['query'].apply(normalize_unicode) + df['generation_gt'] = df['generation_gt'].apply(lambda x: list(map(normalize_unicode, x))) return df @@ -80,6 +84,19 @@ def make_prev_next_id_metadata(x, id_type: str): df['metadata'] = df['metadata'].apply(lambda x: make_prev_next_id_metadata(x, 'prev_id')) df['metadata'] = df['metadata'].apply(lambda x: make_prev_next_id_metadata(x, 'next_id')) + df['contents'] = df['contents'].apply(normalize_unicode) + + def normalize_unicode_metadata(metadata: dict): + result = {} + for key, value in metadata.items(): + if isinstance(value, str): + result[key] = normalize_unicode(value) + else: + result[key] = value + return result + + df['metadata'] = df['metadata'].apply(normalize_unicode_metadata) + # check every metadata have a prev_id, next_id key assert all('prev_id' in metadata for metadata in df['metadata']), "Every metadata must have a prev_id key." assert all('next_id' in metadata for metadata in df['metadata']), "Every metadata must have a next_id key." diff --git a/autorag/utils/util.py b/autorag/utils/util.py index 6f067d804..0a4077fbf 100644 --- a/autorag/utils/util.py +++ b/autorag/utils/util.py @@ -7,6 +7,7 @@ import os import re import string +import unicodedata from copy import deepcopy from typing import List, Callable, Dict, Optional, Any, Collection @@ -360,3 +361,7 @@ def filter_dict_keys(dict_, keys: List[str]): def split_dataframe(df, chunk_size): num_chunks = len(df) // chunk_size + 1 if len(df) % chunk_size != 0 else len(df) // chunk_size return list(map(lambda x: df[x * chunk_size:(x + 1) * chunk_size], range(num_chunks))) + + +def normalize_unicode(text: str) -> str: + return unicodedata.normalize('NFC', text) diff --git a/tests/autorag/utils/test_util.py b/tests/autorag/utils/test_util.py index 7429a5429..f977145e9 100644 --- a/tests/autorag/utils/test_util.py +++ b/tests/autorag/utils/test_util.py @@ -13,7 +13,7 @@ from autorag.utils import fetch_contents from autorag.utils.util import load_summary_file, result_to_dataframe, \ make_combinations, explode, replace_value_in_dict, normalize_string, convert_string_to_tuple_in_dict, process_batch, \ - convert_env_in_dict, openai_truncate_by_token, convert_datetime_string, split_dataframe + convert_env_in_dict, openai_truncate_by_token, convert_datetime_string, split_dataframe, normalize_unicode from tests.mock import MockLLM root_dir = pathlib.PurePath(os.path.dirname(os.path.realpath(__file__))).parent.parent @@ -331,3 +331,18 @@ def test_split_dataframe(): assert len(df_list_2[0]) == 3 assert len(df_list_2[-1]) == 1 assert pd.DataFrame({'a': list(range(3)), 'b': list(range(10, 13))}).equals(df_list_2[0]) + + +def test_normalize_unicode(): + str1 = "전국보행자전용도로표준데이터" + str2 = "전국보행자전용도로표준데이터" + assert len(str1) == 14 + assert len(str2) == 34 + assert str1 != str2 + + new_str1 = normalize_unicode(str1) + new_str2 = normalize_unicode(str2) + + assert len(new_str1) == 14 + assert len(new_str2) == 14 + assert new_str1 == new_str2