Resolve Korean encoding unicode error (Marker-Inc-Korea#444)
* add normalize_unicode function to fix this problem

* apply normalize_unicode when casting the corpus and qa datasets

---------

Co-authored-by: jeffrey <vkefhdl1@gmail.com>
vkehfdl1 and jeffrey authored May 16, 2024
1 parent 7711b19 commit 31ac05d
Showing 3 changed files with 38 additions and 1 deletion.
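For context (not part of the diff below): Korean Hangul can be encoded either as precomposed syllables (NFC) or as decomposed jamo (NFD, as produced by macOS file systems, for example). The two forms render identically but compare unequal and differ in length, which is the error this commit resolves by normalizing text to NFC. A minimal sketch of the mismatch:

```python
import unicodedata

nfc = "한국어"                           # 3 precomposed syllables
nfd = unicodedata.normalize("NFD", nfc)  # same text, decomposed into 8 jamo

print(nfc == nfd, len(nfc), len(nfd))    # False 3 8

# Normalizing both sides to NFC makes the strings compare equal again.
assert unicodedata.normalize("NFC", nfd) == nfc
```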
17 changes: 17 additions & 0 deletions autorag/utils/preprocess.py
@@ -3,6 +3,8 @@
import numpy as np
import pandas as pd

from autorag.utils.util import normalize_unicode


def validate_qa_dataset(df: pd.DataFrame):
columns = ['qid', 'query', 'retrieval_gt', 'generation_gt']
@@ -49,6 +51,8 @@ def cast_generation_gt(gt):
"query must be string type."
df['retrieval_gt'] = df['retrieval_gt'].apply(cast_retrieval_gt)
df['generation_gt'] = df['generation_gt'].apply(cast_generation_gt)
df['query'] = df['query'].apply(normalize_unicode)
df['generation_gt'] = df['generation_gt'].apply(lambda x: list(map(normalize_unicode, x)))
return df


@@ -80,6 +84,19 @@ def make_prev_next_id_metadata(x, id_type: str):
df['metadata'] = df['metadata'].apply(lambda x: make_prev_next_id_metadata(x, 'prev_id'))
df['metadata'] = df['metadata'].apply(lambda x: make_prev_next_id_metadata(x, 'next_id'))

df['contents'] = df['contents'].apply(normalize_unicode)

def normalize_unicode_metadata(metadata: dict):
result = {}
for key, value in metadata.items():
if isinstance(value, str):
result[key] = normalize_unicode(value)
else:
result[key] = value
return result

df['metadata'] = df['metadata'].apply(normalize_unicode_metadata)

# check every metadata have a prev_id, next_id key
assert all('prev_id' in metadata for metadata in df['metadata']), "Every metadata must have a prev_id key."
assert all('next_id' in metadata for metadata in df['metadata']), "Every metadata must have a next_id key."
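A hedged illustration of the metadata pass added above (the dict and its keys are invented for the example); it mirrors the normalize_unicode_metadata closure: only str values are normalized, everything else passes through untouched:

```python
import unicodedata

metadata = {
    "file_name": unicodedata.normalize("NFD", "데이터.csv"),  # 3 syllables decomposed into 6 jamo
    "prev_id": None,
    "page": 3,
}

normalized = {
    key: unicodedata.normalize("NFC", value) if isinstance(value, str) else value
    for key, value in metadata.items()
}

print(len(metadata["file_name"]), len(normalized["file_name"]))  # 10 7
assert normalized["prev_id"] is None and normalized["page"] == 3  # non-strings untouched
```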
5 changes: 5 additions & 0 deletions autorag/utils/util.py
@@ -7,6 +7,7 @@
import os
import re
import string
import unicodedata
from copy import deepcopy
from typing import List, Callable, Dict, Optional, Any, Collection

@@ -360,3 +361,7 @@ def filter_dict_keys(dict_, keys: List[str]):
def split_dataframe(df, chunk_size):
num_chunks = len(df) // chunk_size + 1 if len(df) % chunk_size != 0 else len(df) // chunk_size
return list(map(lambda x: df[x * chunk_size:(x + 1) * chunk_size], range(num_chunks)))


def normalize_unicode(text: str) -> str:
return unicodedata.normalize('NFC', text)
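A quick usage sketch of the new helper (import path as in the diff above). NFC normalization is idempotent, so applying it to text that is already composed is a harmless no-op, which is why preprocess.py can apply it blanket to every query, generation_gt, and contents value:

```python
import unicodedata

from autorag.utils.util import normalize_unicode

nfc = "데이터"
nfd = unicodedata.normalize("NFD", nfc)

assert normalize_unicode(nfd) == nfc  # decomposed text is recomposed
assert normalize_unicode(nfc) == nfc  # already-NFC text is returned unchanged
```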
17 changes: 16 additions & 1 deletion tests/autorag/utils/test_util.py
@@ -13,7 +13,7 @@
from autorag.utils import fetch_contents
from autorag.utils.util import load_summary_file, result_to_dataframe, \
make_combinations, explode, replace_value_in_dict, normalize_string, convert_string_to_tuple_in_dict, process_batch, \
convert_env_in_dict, openai_truncate_by_token, convert_datetime_string, split_dataframe
convert_env_in_dict, openai_truncate_by_token, convert_datetime_string, split_dataframe, normalize_unicode
from tests.mock import MockLLM

root_dir = pathlib.PurePath(os.path.dirname(os.path.realpath(__file__))).parent.parent
@@ -331,3 +331,18 @@ def test_split_dataframe():
assert len(df_list_2[0]) == 3
assert len(df_list_2[-1]) == 1
assert pd.DataFrame({'a': list(range(3)), 'b': list(range(10, 13))}).equals(df_list_2[0])


def test_normalize_unicode():
str1 = "전국보행자전용도로표준데이터"
str2 = "전국보행자전용도로표준데이터"
assert len(str1) == 14
assert len(str2) == 34
assert str1 != str2

new_str1 = normalize_unicode(str1)
new_str2 = normalize_unicode(str2)

assert len(new_str1) == 14
assert len(new_str2) == 14
assert new_str1 == new_str2
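Note that str1 and str2 above render identically; in the test file str2 is the decomposed (NFD) form of the same text, 34 conjoining jamo versus 14 precomposed syllables, which is what the length assertions check. An equivalent way to build str2:

```python
import unicodedata

str1 = "전국보행자전용도로표준데이터"       # NFC: 14 precomposed syllables
str2 = unicodedata.normalize("NFD", str1)  # NFD: 34 conjoining jamo

assert len(str1) == 14 and len(str2) == 34
```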
