OSCAR, or Open Super-large Crawled ALMAnaCH coRpus, is a huge multilingual corpus obtained by language classification and filtering of the web-based “Common Crawl” corpus.
OSCAR is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.
OSCAR is currently shuffled at line level, and no metadata is provided. It is therefore mainly intended for training unsupervised language models for natural language processing.
The data is distributed by language in both original and deduplicated form. There are currently 166 different languages available.
See also goclassy and Ungoliant.
Logo by Alix Chagué.
% Workshop paper (CMLC-7, 2019); HAL record hal-02148693.
@inproceedings{ortizsuarez:hal-02148693,
  author    = {Ortiz Su{\'a}rez, Pedro Javier and Sagot, Beno{\^\i}t and Romary, Laurent},
  title     = {Asynchronous Pipeline for Processing Huge Corpora on Medium to Low Resource Infrastructures},
  booktitle = {7th Workshop on the Challenges in the Management of Large Corpora ({CMLC-7})},
  editor    = {Ba{\'n}ski, Piotr and Barbaresi, Adrien and Biber, Hanno and Breiteneder, Evelyn and Clematide, Simon and Kupietz, Marc and L{\"u}ngen, Harald and Iliadi, Caroline},
  publisher = {Leibniz-Institut f{\"u}r Deutsche Sprache},
  address   = {Cardiff, United Kingdom},
  year      = {2019},
  doi       = {10.14618/IDS-PUB-9021},
  url       = {https://inria.hal.science/hal-02148693},
  hal_pdf   = {https://inria.hal.science/hal-02148693v1/file/Asynchronous_Pipeline_for_Processing_Huge_Corpora_on_Medium_to_Low_Resource_Infrastructures.pdf},
}
% ACL 2020 paper (ACL Anthology 2020.acl-main.156), with its HAL mirror links.
@inproceedings{ortiz-suarez-etal-2020-monolingual,
  author    = {Ortiz Su{\'a}rez, Pedro Javier and Romary, Laurent and Sagot, Beno{\^\i}t},
  title     = {A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  year      = {2020},
  pages     = {1703--1714},
  doi       = {10.18653/v1/2020.acl-main.156},
  url       = {https://aclanthology.org/2020.acl-main.156},
  hal_url   = {https://hal.inria.fr/hal-02863875},
  hal_pdf   = {https://hal.inria.fr/hal-02863875v2/file/ELMos.pdf},
}
% LREC 2020 paper (ACL Anthology 2020.lrec-1.569), with its HAL mirror links.
% No DOI is recorded for this entry.
@inproceedings{ortiz-suarez-etal-2020-establishing,
  author    = {Ortiz Su{\'a}rez, Pedro Javier and Dupont, Yoann and Muller, Benjamin and Romary, Laurent and Sagot, Beno{\^\i}t},
  title     = {Establishing a New State-of-the-Art for {French} Named Entity Recognition},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  publisher = {European Language Resources Association},
  address   = {Marseille, France},
  year      = {2020},
  pages     = {4631--4638},
  url       = {https://aclanthology.org/2020.lrec-1.569},
  hal_url   = {https://hal.inria.fr/hal-02617950},
  hal_pdf   = {https://hal.inria.fr/hal-02617950v2/file/lrec19ner.pdf},
}
Contact: pedro.ortiz-suarez[at]inria.fr, Benoit.Sagot[at]inria.fr, and julien.abadji[at]inria.fr