한국어 언어모델 제작을 위한 파이썬 기반 한국어 텍스트처리 패키지입니다.
- 파일 관리
- 손쉬운 텍스트파일 읽기와 쓰기, 파일 통합
- 인코딩 변환
- 텍스트 처리
- 잉여적인 공백 정리
- TEI 헤더 제거
- 문장별 어절 목록 정리
- 텍스트 정규화
- 길게 이어진 코퍼스를 문장 단위로 자르기
- 한글이 아닌 문자 삭제
- 한국어로 된 줄글 외의 문자들 전사
- 한글 자모, 한자, 숫자, 알파벳, 영단어 읽기
- 형태소 분석 (KoNLPy 및 Mecab 연동)
- 형태소 분석 결과로부터 2가지 유형의 의사형태소(pseudo-morpheme) 생성
- 최소형태소 (모든 형태 경계를 분리해 가장 작게 잘린 단위; micro)
- 중간형태소 (체언과 조사만을 분리해 중간 크기로 잘린 단위; medium)
NB. 의사형태소 생성을 위해서는 형태소 분석이 완료된 텍스트가 필요합니다.
본 코드는 울산대 UTagger 형태소분석 아웃풋을 전제하여 의사형태소를 생성하기에,
입력되는 텍스트 파일이 UTagger 아웃풋과 다를 경우 추가적인 코드 수정 작업이 필요합니다.
- 문자열로부터 발음열 생성(Grapheme-to-Phone; G2P)
- 언어모델 제작을 위한 파일 생성
- 정제된 코퍼스 원문(textraw) 생성
- 발음사전(lexicon.txt) 생성
- Python 2.7 or 3
- Required Python packages:
- KoNLPy, JPype1, korean, hanja, Mecab
- [Note] The above packages are automatically installed as you install KoLM via pip
-
The latest version is available in PyPI:
$ pip install kolm
-
말뭉치 정제 작업 가이드:
- (1) 모든 텍스트를 UTF-8로 인코딩 변환
- utils.convertEncoding
- (2) 모든 텍스트를 하나로 이어붙여 저장하기
- utils.stackFiles
- (3) TEI 헤더 (또는 분석대상이 아닌 태그류) 제거
- utils.removeHeader
- (4) 텍스트 정규화
- normalize.Knormalize
- (5) 형태소분석
- tag.morphTag
- (6) 원문-형태소 대조를 통한 의사형태소 추출
- tag.pseudomorph
- (7) 정제 텍스트(textraw)와 발음사전(lexicon.txt) 생성
- lm.writeTextraw
- lm.getUniqueWords
- lm.writeLexicon
- (1) 모든 텍스트를 UTF-8로 인코딩 변환
-
구체적인 사용 예제 코드를 보려면 runKoLM.py 를 참조하세요.
-
Start by importing every method in kolm.utils
>> from kolm.utils import *
- File management
-
readfileUTF8 (fname)
# UTF-8 인코딩된 특정 파일(song15.txt)을 읽어들이기 >> readfileUTF8('song15.txt') -
writefile (body, fname)
>> writefile(body, fname) -
stackFiles (path, stackFname, flist=[])
# mydir 내 모든 텍스트를 한 파일로 모아 mystack.txt 로 저장하기 >> stackFiles('mydir', 'mystack.txt') # mydir 내 특정 파일들(song1.txt, song2.txt, song15.txt)을 한 파일로 모아 mystack.txt 로 저장하기 >> stackFiles('mydir', 'mystack.txt', ['song1.txt', 'song2.txt', 'song15.txt'])
-
- Encoding
-
convertEncoding (path, encodingSource, encodingDest, flist=[])
# mydir 내 모든 텍스트의 인코딩을 UTF-16에서 UTF-8로 변환 >> convertEncoding('mydir', 'utf-16', 'utf-8') # mydir 내 특정 파일들(song1.txt, song2.txt, song15.txt)의 인코딩을 UTF-16에서 UTF-8로 변환 >> convertEncoding('mydir', 'utf-16', 'utf-8', ['song1.txt', 'song2.txt', 'song15.txt'])
-
- Text management
-
tightenString (corpus)
# 텍스트 리스트 내 잉여적인 공백 정리 및 삭제 >> tightenString(corpus) -
getEojeolList (sentlist)
# λ¬Έμ₯ 리μ€νΈμμ μ΄μ 리μ€νΈ μΆμΆ >> getEojeolList(['μ§§μ λ¬Έμ₯μ λ£μλ€', 'μν΄ λ³΅', 'μ§μ κ°λλ λ°₯μ΄ μλ€']) -
removeHeader (headeredfname)
>> removeHeader(headeredfname)
-
-
Start by importing every method in kolm.normalize
>> from kolm.normalize import *
- Normalization
-
Knormalize (in_fname, out_fname)
# Normalize a textfile >> Knormalize(in_fname, out_fname) -
normalize (corpus)
# Normalize a text list variable in workspace >> normalize(corpus) -
bySentence (corpus)
>> bySentence(corpus) -
removeNonHangul (line)
>> removeNonHangul(line)
-
- Character reading in Korean
-
Alphabets
-
readABC (line)
>> readABC(line) -
readAlphabet (line)
>> readAlphabet(line)
-
-
Hanja (Chinese characters)
-
readHanja (line)
>> readHanja(line)
-
-
Hangul jamos (i.e. single letters which do not make a syllable)
-
readHangulLetter (line)
>> readHangulLetter('γ μ γ μΌλ‘ μ μλ€') μΉμμ μ§μμΌλ‘ μ μλ€
-
-
- Number reading in Korean
-
readNumber (line)
>> readNumber(line)
-
-
Start by importing every method in kolm.tag
>> from kolm.tag import *
- Morphemes
-
morphTag (in_fname, out_fname)
# Mecab 형태소분석 >> morphTag(in_fname, out_fname)
-
- Pseudo-morphemes
-
morph2pseudo (raw_sentlist, morph_sentlist, type)
# 문장 리스트로부터 의사형태소(최소 크기) 문장 리스트 생성 >> morph2pseudo(raw_sentlist, morph_sentlist, 'micro') # 문장 리스트로부터 의사형태소(중간 크기) 문장 리스트 생성 >> morph2pseudo(raw_sentlist, morph_sentlist, 'medium') -
pseudomorph (rawText, morphText, pseudoType)
# 문장 하나로부터 의사형태소(최소 크기) 문장 생성 >> pseudomorph(rawText, morphText, 'micro') # 문장 하나로부터 의사형태소(중간 크기) 문장 생성 >> pseudomorph(rawText, morphText, 'medium')
-
-
Start by importing every method in kolm.lm
>> from kolm.lm import * -
writeTextraw (corpus)
# 정제를 마친 단일 말뭉치 파일(textraw) 생성 >> writeTextraw(corpus) -
getUniqueWords (text_fname)
# 고유 어절(또는 형태소; 말뭉치 상에 띄어쓰기된 단위를 의미)목록(wordlist.txt) 추출 >> getUniqueWords(text_fname) -
writeLexicon (text_fname)
# 고유 어절 목록에 G2P를 적용한 발음사전(lexicon.txt) 생성 >> writeLexicon(text_fname)
-
Start by importing every method in kolm.g2p
>> from kolm.g2p import *
- Main
-
runKoG2P (hangeul_sequence, rulebook_path)
# Run Korean G2P on a sequence >> runKoG2P(hangeul_sequence, rulebook_path) -
runTest (rulebook, testset)
# Run a test on a testset with a specific rulebook >> runTest(rulebook, testset) -
readRules (pver, rulebook)
>> readRules(pver, rulebook)
-
- Auxiliaries
-
phone2prono (phones, rule_in, rule_out)
>> phone2prono(phones, rule_in, rule_out) -
graph2prono (graph, rule_in, rule_out)
>> graph2prono(graph, rule_in, rule_out) -
graph2phone (graphs)
>> graph2phone(graphs) -
isHangul (charint)
>> isHangul(charint)
-