@inproceedings{hoenen-etal-2020-two,
title = "Two {LRL} {\&} Distractor Corpora from Web Information Retrieval and a Small Case Study in Language Identification without Training Corpora",
author = "Hoenen, Armin and
Koc, Cemre and
Rahn, Marc",
editor = "Beermann, Dorothee and
Besacier, Laurent and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources association",
url = "https://aclanthology.org/2020.sltu-1.4/",
pages = "28--35",
language = "eng",
ISBN = "979-10-95546-35-1",
abstract = "In recent years, low resource languages (LRLs) have seen a surge in interest after certain tasks have been solved for larger ones and as they present various challenges (data sparsity, sparsity of experts and expertise, unusual structural properties etc.). For a larger number of them in the wake of this interest resources and technologies have been created. However, there are very small languages for which this has not yet led to a significant change. We focus here one such language (Nogai) and one larger small language (Maori). Since especially smaller languages often face the situation of having very similar siblings or a larger small sister language which is more accessible, the rate of noise in data gathered on them so far is often high. Therefore, we present small corpora for our 2 case study languages which we obtained through web information retrieval and likewise for their noise inducing distractor languages and conduct a small language identification experiment where we identify documents in a boolean way as either belonging or not to the target language. We release our test corpora for two such scenarios in the format of the An Crubadan project (Scannell, 2007) and a tool for unsupervised language identification using alphabet and toponym information."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hoenen-etal-2020-two">
<titleInfo>
<title>Two LRL & Distractor Corpora from Web Information Retrieval and a Small Case Study in Language Identification without Training Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Armin</namePart>
<namePart type="family">Hoenen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cemre</namePart>
<namePart type="family">Koc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Rahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dorothee</namePart>
<namePart type="family">Beermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Besacier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-35-1</identifier>
</relatedItem>
<abstract>In recent years, low resource languages (LRLs) have seen a surge in interest after certain tasks have been solved for larger ones and as they present various challenges (data sparsity, sparsity of experts and expertise, unusual structural properties etc.). For a larger number of them in the wake of this interest resources and technologies have been created. However, there are very small languages for which this has not yet led to a significant change. We focus here one such language (Nogai) and one larger small language (Maori). Since especially smaller languages often face the situation of having very similar siblings or a larger small sister language which is more accessible, the rate of noise in data gathered on them so far is often high. Therefore, we present small corpora for our 2 case study languages which we obtained through web information retrieval and likewise for their noise inducing distractor languages and conduct a small language identification experiment where we identify documents in a boolean way as either belonging or not to the target language. We release our test corpora for two such scenarios in the format of the An Crubadan project (Scannell, 2007) and a tool for unsupervised language identification using alphabet and toponym information.</abstract>
<identifier type="citekey">hoenen-etal-2020-two</identifier>
<location>
<url>https://aclanthology.org/2020.sltu-1.4/</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>28</start>
<end>35</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Two LRL & Distractor Corpora from Web Information Retrieval and a Small Case Study in Language Identification without Training Corpora
%A Hoenen, Armin
%A Koc, Cemre
%A Rahn, Marc
%Y Beermann, Dorothee
%Y Besacier, Laurent
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)
%D 2020
%8 May
%I European Language Resources association
%C Marseille, France
%@ 979-10-95546-35-1
%G eng
%F hoenen-etal-2020-two
%X In recent years, low resource languages (LRLs) have seen a surge in interest after certain tasks have been solved for larger ones and as they present various challenges (data sparsity, sparsity of experts and expertise, unusual structural properties etc.). For a larger number of them in the wake of this interest resources and technologies have been created. However, there are very small languages for which this has not yet led to a significant change. We focus here one such language (Nogai) and one larger small language (Maori). Since especially smaller languages often face the situation of having very similar siblings or a larger small sister language which is more accessible, the rate of noise in data gathered on them so far is often high. Therefore, we present small corpora for our 2 case study languages which we obtained through web information retrieval and likewise for their noise inducing distractor languages and conduct a small language identification experiment where we identify documents in a boolean way as either belonging or not to the target language. We release our test corpora for two such scenarios in the format of the An Crubadan project (Scannell, 2007) and a tool for unsupervised language identification using alphabet and toponym information.
%U https://aclanthology.org/2020.sltu-1.4/
%P 28-35
Markdown (Informal)
[Two LRL & Distractor Corpora from Web Information Retrieval and a Small Case Study in Language Identification without Training Corpora](https://aclanthology.org/2020.sltu-1.4/) (Hoenen et al., SLTU 2020)
ACL