CamemBERT is a state-of-the-art language model for French, based on the RoBERTa architecture and pretrained on the French subcorpus of the multilingual OSCAR corpus.
CamemBERT was initially evaluated on four different downstream tasks for French: part-of-speech (POS) tagging, dependency parsing, named entity recognition (NER) and natural language inference (NLI), improving the state of the art for most tasks over previous monolingual and multilingual approaches, which confirms the effectiveness of large pretrained language models for French.
% ACL 2020 conference paper presenting CamemBERT.
% hal_url and hal_pdf are non-standard fields (standard styles ignore them);
% they point to the HAL record and PDF of the same paper.
@inproceedings{martin-etal-2020-camembert,
  author    = {Martin, Louis and Muller, Benjamin and Ortiz Su{\'a}rez, Pedro Javier and Dupont, Yoann and Romary, Laurent and Villemonte de la Clergerie, {\'E}ric and Seddah, Djam{\'e} and Sagot, Beno{\^\i}t},
  title     = {{CamemBERT}: a Tasty {French} Language Model},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {7203--7219},
  doi       = {10.18653/v1/2020.acl-main.645},
  url       = {https://aclanthology.org/2020.acl-main.645},
  hal_url   = {https://hal.inria.fr/hal-02889805},
  hal_pdf   = {https://hal.inria.fr/hal-02889805/file/ACL_2020___CamemBERT__a_Tasty_French_Language_Model-6.pdf},
}
% French-language paper from the JEP/TALN/RECITAL 2020 joint conference
% proceedings (volume 2, TALN). The title carries its English translation in
% parentheses. hal_url and hal_pdf are non-standard fields (standard styles
% ignore them); they point to the HAL record and PDF of the same paper.
@inproceedings{martin-etal-2020-les,
  author    = {Martin, Louis and Muller, Benjamin and Ortiz Su{\'a}rez, Pedro Javier and Dupont, Yoann and Romary, Laurent and Villemonte de la Clergerie, {\'E}ric and Sagot, Beno{\^\i}t and Seddah, Djam{\'e}},
  title     = {Les mod{\`e}les de langue contextuels {Camembert} pour le fran{\c{c}}ais : impact de la taille et de l{'}h{\'e}t{\'e}rog{\'e}n{\'e}it{\'e} des donn{\'e}es d{'}entrainement ({CAMEMBERT} Contextual Language Models for {French}: Impact of Training Data Size and Heterogeneity)},
  booktitle = {Actes de la 6e conf{\'e}rence conjointe Journ{\'e}es d'{\'E}tudes sur la Parole (JEP, 33e {\'e}dition), Traitement Automatique des Langues Naturelles (TALN, 27e {\'e}dition), Rencontre des {\'E}tudiants Chercheurs en Informatique pour le Traitement Automatique des Langues (R{\'E}CITAL, 22e {\'e}dition). Volume 2 : Traitement Automatique des Langues Naturelles},
  year      = {2020},
  address   = {Nancy, France},
  publisher = {ATALA et AFCP},
  pages     = {54--65},
  url       = {https://aclanthology.org/2020.jeptalnrecital-taln.5},
  hal_url   = {https://hal.archives-ouvertes.fr/hal-02784755},
  hal_pdf   = {https://hal.archives-ouvertes.fr/hal-02784755v3/file/151.pdf},
  language  = {French},
}