Datasets:
tokens (sequence) | ner_tags (sequence) | langs (sequence) | spans (sequence) |
---|---|---|---|
[
"Rumoh",
"Habib",
"Bugak",
"nyan",
"keunong",
"ngon",
"proyek",
"peuluwah",
"Masjidil",
"Haram",
"ngon",
"geugantoë",
"keuh",
"yum",
"tanoh",
"nyan",
"lé",
"peumeurèntah",
"Arab",
"Saudi",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
3,
4,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Masjidil Haram"
] |
[
"ALIH",
"Kōtō",
",",
"Tokyo"
] | [
0,
1,
2,
2
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Kōtō , Tokyo"
] |
[
"Bahsa",
"Arab"
] | [
3,
4
] | [
"ace",
"ace"
] | [
"ORG: Bahsa Arab"
] |
[
"Palèstina"
] | [
3
] | [
"ace"
] | [
"ORG: Palèstina"
] |
[
"Nabi",
"Musa",
"a.s",
"."
] | [
1,
2,
0,
0
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Nabi Musa"
] |
[
"Keuneulheuehjih",
",",
"la'ôt",
"Okhotsk",
"meuceuë",
"deungon",
"wilayah",
"duwa",
"boh",
"nanggroë",
"sagai",
",",
"Rusia",
"di",
"barat",
",",
"timu",
",",
"ngon",
"barôh",
",",
"ngon",
"Jeupun",
"di",
"tunong",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Rusia",
"LOC: Jeupun"
] |
[
"Jih",
"jiteuka",
"u",
"Batavia",
"bak",
"5",
"uroë",
"buleuën",
"1",
"thôn",
"1808",
",",
"jih",
"nyan",
"jijak",
"u",
"Hindia",
"Beulanda",
"nakeuh",
"keu",
"jijak",
"gantoë",
"Gubernur",
"Jeundran",
"Albertus",
"Wiese",
"."
] | [
0,
0,
0,
3,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Batavia"
] |
[
"ALIH",
"Geurija"
] | [
0,
3
] | [
"ace",
"ace"
] | [
"ORG: Geurija"
] |
[
"Bahsa",
"Arab"
] | [
3,
4
] | [
"ace",
"ace"
] | [
"ORG: Bahsa Arab"
] |
[
"ALIH",
"Paus",
"Fransiskus"
] | [
0,
1,
2
] | [
"ace",
"ace",
"ace"
] | [
"PER: Paus Fransiskus"
] |
[
"Amirika",
"Teungoh"
] | [
5,
6
] | [
"ace",
"ace"
] | [
"LOC: Amirika Teungoh"
] |
[
"Lisbon"
] | [
5
] | [
"ace"
] | [
"LOC: Lisbon"
] |
[
"Rasul-rasul",
"nyan",
"hana",
"geulakèe",
"upah",
"bak",
"geuda'wah",
"."
] | [
3,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Rasul-rasul"
] |
[
"Pulo",
"Saint",
"Helena",
"meusyeuhu",
"seubab",
"Napolèon",
"Bonaparté",
"tom",
"jiboh",
"keunoë",
"."
] | [
0,
0,
0,
0,
0,
1,
2,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Napolèon Bonaparté"
] |
[
"ALIH",
"Meuseujid",
"al-Aqsha"
] | [
0,
3,
4
] | [
"ace",
"ace",
"ace"
] | [
"ORG: Meuseujid al-Aqsha"
] |
[
"Centro",
"Federal",
"de",
"Educação",
"Tecnológica",
"Celso",
"Suckow",
"da",
"Fonseca"
] | [
3,
4,
4,
4,
4,
4,
4,
4,
4
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Centro Federal de Educação Tecnológica Celso Suckow da Fonseca"
] |
[
"Ulan",
"Bator",
"nakeuh",
"nang",
"nanggroë",
"nyang",
"jeuët",
"keu",
"banda",
"paléng",
"rayek",
"ngon",
"38",
"%",
"ureuëng",
"Mongolia",
"geuduëk",
"sinan",
"."
] | [
3,
4,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Ulan Bator"
] |
[
"Nabi",
"Nuh",
"a.s",
"."
] | [
1,
2,
0,
0
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Nabi Nuh"
] |
[
"Muskat"
] | [
5
] | [
"ace"
] | [
"LOC: Muskat"
] |
[
"Snouck",
"Hurgronje"
] | [
1,
2
] | [
"ace",
"ace"
] | [
"PER: Snouck Hurgronje"
] |
[
"Fathimah",
"neu",
"meunikah",
"ngon",
"Ali",
"bin",
"Abi",
"Thalib",
"di",
"Madinah",
",",
"Sayyidina",
"Ali",
"neujak",
"lakèe",
"Fathimah",
"langsông",
"bak",
"Nabi",
"."
] | [
0,
0,
0,
0,
1,
2,
2,
2,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Ali bin Abi Thalib"
] |
[
"Inggréh"
] | [
5
] | [
"ace"
] | [
"LOC: Inggréh"
] |
[
"'",
"''",
"Baramulla",
"''",
"'",
"nakeuh",
"saboh",
"banda",
"rayek",
"di",
"India",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: India"
] |
[
"ALIH",
"Taman",
"Nasional",
"Gunong",
"Leuser"
] | [
0,
3,
4,
4,
4
] | [
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Taman Nasional Gunong Leuser"
] |
[
"Riwayat",
"Nabi",
"Nuh",
"a.s",
"."
] | [
0,
1,
2,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Nabi Nuh"
] |
[
"Fathimah",
"az-Zahara"
] | [
1,
2
] | [
"ace",
"ace"
] | [
"PER: Fathimah az-Zahara"
] |
[
"Myanmar"
] | [
3
] | [
"ace"
] | [
"ORG: Myanmar"
] |
[
"'",
"''",
"Palèstina",
"''",
"'",
"(",
"''Filasṭīn",
"''",
"/",
"''Falasṭīn",
"''",
"/",
"''Filisṭīn",
"''",
")",
"nakeuh",
"saboh",
"neugara",
"nyang",
"na",
"di",
"wilayah",
"Asia",
"Barat",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
6,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Asia Barat"
] |
[
"ALIH",
"Nabi",
"Ya'qub"
] | [
0,
1,
2
] | [
"ace",
"ace",
"ace"
] | [
"PER: Nabi Ya'qub"
] |
[
"Masa",
"nyan",
"Acèh",
"mantong",
"lam",
"gabuëk",
"prang",
"antara",
"GAM",
"ngon",
"RI",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
3,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: GAM"
] |
[
"**",
"Nabi",
"Nuh",
"a.s",
"."
] | [
0,
1,
2,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Nabi Nuh"
] |
[
"'",
"''",
"Medgidia",
"''",
"'",
"nakeuh",
"saboh",
"banda",
"rayek",
"di",
"Rumania",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Rumania"
] |
[
"ALIH",
"Kabupatèn",
"Acèh",
"Barôh"
] | [
0,
5,
6,
6
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Kabupatèn Acèh Barôh"
] |
[
"Simpang",
"Tiga",
"Redelong"
] | [
3,
4,
4
] | [
"ace",
"ace",
"ace"
] | [
"ORG: Simpang Tiga Redelong"
] |
[
"Bà",
"Rịa-Vũng",
"Tàu"
] | [
3,
4,
4
] | [
"ace",
"ace",
"ace"
] | [
"ORG: Bà Rịa-Vũng Tàu"
] |
[
"Ali",
"bin",
"Abi",
"Thalib"
] | [
1,
2,
2,
2
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Ali bin Abi Thalib"
] |
[
"alih",
"Juventus",
"F.C",
"."
] | [
0,
3,
4,
4
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Juventus F.C ."
] |
[
"Mana",
"Ashida"
] | [
1,
2
] | [
"ace",
"ace"
] | [
"PER: Mana Ashida"
] |
[
"Palèstina"
] | [
3
] | [
"ace"
] | [
"ORG: Palèstina"
] |
[
"Nang",
"nanggroe",
"jih",
"nakeuh",
"Denpasar",
"."
] | [
0,
0,
0,
0,
5,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Denpasar"
] |
[
"Nang",
"nanggroë",
"kabupaten",
"Bener",
"Meriah",
"nakeuh",
"Simpang",
"Tiga",
"Redelong",
"."
] | [
0,
0,
0,
0,
0,
0,
3,
4,
4,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Simpang Tiga Redelong"
] |
[
"La'én",
"nibak",
"nyan",
",",
"Ureuëng",
"nyang",
"meu'èn",
"lam",
"drama",
"nyoe",
"nakeuh",
"Mana",
"Ashida",
"ngön",
"Fuku",
"Suzuki",
"seubagoe",
"duwa",
"droe",
"aneuek",
"miet",
"keumbeue",
"nyang",
"ayahjih",
"ka",
"abéh",
"umu",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
2,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Mana Ashida"
] |
[
"Jalal-Abad"
] | [
5
] | [
"ace"
] | [
"LOC: Jalal-Abad"
] |
[
"Luth",
"As"
] | [
3,
0
] | [
"ace",
"ace"
] | [
"ORG: Luth"
] |
[
"Neugara-neugara",
"nyang",
"rôh",
"lam",
"kawasan",
"'",
"''",
"Asia",
"Seulatan",
"''",
"'",
"nakeuh",
":"
] | [
0,
0,
0,
0,
0,
0,
0,
5,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Asia"
] |
[
"Oudomxai"
] | [
5
] | [
"ace"
] | [
"LOC: Oudomxai"
] |
[
"ALIH",
"Nabi",
"Sulaiman"
] | [
0,
1,
2
] | [
"ace",
"ace",
"ace"
] | [
"PER: Nabi Sulaiman"
] |
[
"Seubagoë",
"saboh",
"banda",
"nyang",
"leubèh",
"muda",
"meubandéng",
"ngon",
"nang",
"nanggroë",
"la'én",
"di",
"Asia",
"Teunggara",
"lagèe",
"Bangkok",
",",
"Jakarta",
",",
"ngon",
"Manila",
",",
"le",
"that",
"peuneudong",
"masa",
"kolonial",
"di",
"Kuala",
"Lumpur",
"nyang",
"geupeudong",
"bak",
"akhé",
"abad",
"keu-19",
"ngon",
"away",
"abad",
"keu-20",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
6,
0,
5,
0,
5,
0,
0,
5,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Asia Teunggara",
"LOC: Bangkok",
"LOC: Jakarta",
"LOC: Manila"
] |
[
"'",
"''",
"Propinsi",
"Kampong",
"Cham",
"''",
"'",
"nakeuh",
"saboh",
"propinsi",
"di",
"Kamboja",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Kamboja"
] |
[
"Mana",
"Ashida",
"seubagoe"
] | [
1,
2,
0
] | [
"ace",
"ace",
"ace"
] | [
"PER: Mana Ashida"
] |
[
"Nabi",
"Musa",
"a.s.",
"ngön",
"Fir'aun",
"."
] | [
1,
2,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Nabi Musa"
] |
[
"Ali",
"bin",
"Abi",
"Thalib"
] | [
1,
2,
2,
2
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Ali bin Abi Thalib"
] |
[
"Noemi"
] | [
1
] | [
"ace"
] | [
"PER: Noemi"
] |
[
"ALIH",
"Nabi",
"Ibrahim"
] | [
0,
1,
2
] | [
"ace",
"ace",
"ace"
] | [
"PER: Nabi Ibrahim"
] |
[
"'",
"''",
"Deva",
"''",
"'",
"nakeuh",
"saboh",
"banda",
"rayek",
"di",
"Rumania",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Rumania"
] |
[
"Mana",
"Ashida"
] | [
1,
2
] | [
"ace",
"ace"
] | [
"PER: Mana Ashida"
] |
[
"Bahsa",
"Arab"
] | [
3,
4
] | [
"ace",
"ace"
] | [
"ORG: Bahsa Arab"
] |
[
"Gimchaek"
] | [
5
] | [
"ace"
] | [
"LOC: Gimchaek"
] |
[
"ALIH",
"Keurajeuën",
"Samudra",
"Pasè"
] | [
0,
1,
2,
2
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Keurajeuën Samudra Pasè"
] |
[
"'",
"''",
"Sichuan",
"''",
"'",
"nakeuh",
"saboh",
"propinsi",
"di",
"Rèpublik",
"Rakyat",
"Cina",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
6,
6,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Rèpublik Rakyat Cina"
] |
[
"Bahsa",
"Arab"
] | [
3,
4
] | [
"ace",
"ace"
] | [
"ORG: Bahsa Arab"
] |
[
"'",
"''",
"Arunachal",
"Pradesh",
"''",
"'",
"nakeuh",
"saboh",
"nanggroe",
"dalam",
"di",
"India",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: India"
] |
[
"Palèstina"
] | [
3
] | [
"ace"
] | [
"ORG: Palèstina"
] |
[
"Nabi",
"Nuh",
"a.s.",
"ngön",
"kawômgeuh",
"."
] | [
1,
2,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Nabi Nuh"
] |
[
"PUPINAH",
"Sôleutan",
"Éseukanda",
"Muda"
] | [
0,
1,
2,
2
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Sôleutan Éseukanda Muda"
] |
[
"nakeuh",
"sidroe",
"aktris",
"Jeupang",
"nyang",
"jithèe",
"bak",
"drama",
"TV",
"Ashita",
",",
"Mama",
"ga",
"Inai",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
3,
4,
4,
4,
4,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Ashita , Mama ga Inai"
] |
[
"Klub",
"nyoe",
"geupeudong",
"bak",
"thon",
"1909",
"dan",
"na",
"stadion",
"droe",
"nyang",
"geubri",
"nan",
"'",
"''",
"Couto",
"Pereira",
"''",
"'",
"nyang",
"lot",
"ureueng",
"eu",
"37.182",
"droe",
"di",
"Curitiba",
",",
"Paraná",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Curitiba"
] |
[
"Tonga",
"(",
"dong",
"keudroë",
")"
] | [
5,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Tonga"
] |
[
"Bak",
"masa",
"jih",
"jeuet",
"keu",
"gubernur",
"jeundran",
"nyan",
"keuh",
"jipuga",
"keulayi",
"Meuseujid",
"Raya",
"Baiturrahman",
"nyang",
"ka",
"jitet",
"lé",
"teuntra",
"Beulanda",
"yôh",
"masa",
"Jeundran",
"Kohlër",
"thôn",
"1873",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
3,
4,
4,
0,
0,
0,
0,
0,
0,
0,
0,
1,
2,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Meuseujid Raya Baiturrahman",
"PER: Jeundran Kohlër"
] |
[
"Ishaq",
"As"
] | [
1,
0
] | [
"ace",
"ace"
] | [
"PER: Ishaq"
] |
[
"Nabi",
"Muhammad",
"saw",
"."
] | [
0,
1,
0,
0
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Muhammad"
] |
[
"ALIH",
"Brujuëk"
] | [
0,
5
] | [
"ace",
"ace"
] | [
"LOC: Brujuëk"
] |
[
"Bahsa",
"Arab"
] | [
3,
4
] | [
"ace",
"ace"
] | [
"ORG: Bahsa Arab"
] |
[
"Jinoe",
",",
"umu",
"gobnyan",
"nakeuh",
"66",
"thôn",
",",
"ayah",
"gobnyan",
"nakeuh",
"Soekarno",
"prèsidèn",
"phôn",
"Indônèsia",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Soekarno"
] |
[
"ALIH",
"Nabi",
"Idris"
] | [
0,
1,
2
] | [
"ace",
"ace",
"ace"
] | [
"PER: Nabi Idris"
] |
[
"Di",
"Acèh",
",",
"na",
"saboh",
"jangeun",
"meusyeuhu",
"nyang",
"meunan",
"``",
"Bungong",
"Seulanga",
"''",
"."
] | [
0,
5,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Acèh"
] |
[
"Kanada",
"''",
"(",
"Ratu",
"Elizabeth",
"II",
")",
"''"
] | [
5,
0,
0,
0,
1,
2,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Kanada",
"PER: Elizabeth II"
] |
[
"'",
"''",
"Hugo",
"Rafael",
"Chávez",
"Frías",
"''",
"'",
"(",
"1954-2013",
")",
"nakeuh",
"presiden",
"Vènèzuèla",
"jinoe",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Vènèzuèla"
] |
[
"Pakistan"
] | [
5
] | [
"ace"
] | [
"LOC: Pakistan"
] |
[
"Palèstina"
] | [
3
] | [
"ace"
] | [
"ORG: Palèstina"
] |
[
"Di",
"Kuala",
"Lumpur",
"teudong",
"keuh",
"Parlemen",
"Malaysia",
"."
] | [
0,
0,
0,
0,
0,
3,
4,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Parlemen Malaysia"
] |
[
"Da'irah",
"di",
"Acèh",
"nyang",
"paléng",
"manyang",
"nakeuh",
"Gunong",
"Leuser",
"nyang",
"manyang",
"jih",
"3.404",
"mètè",
"di",
"ateuh",
"la'ôt",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
5,
6,
0,
0,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Gunong Leuser"
] |
[
"Bahsa",
"Arab"
] | [
3,
4
] | [
"ace",
"ace"
] | [
"ORG: Bahsa Arab"
] |
[
"Jim",
"Morrison"
] | [
1,
2
] | [
"ace",
"ace"
] | [
"PER: Jim Morrison"
] |
[
"Panama"
] | [
5
] | [
"ace"
] | [
"LOC: Panama"
] |
[
"Departemen",
"Agama",
"RI",
"."
] | [
0,
3,
0,
0
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Agama"
] |
[
"ALIH",
"Salahuddin",
"al-Ayyubi"
] | [
0,
1,
2
] | [
"ace",
"ace",
"ace"
] | [
"PER: Salahuddin al-Ayyubi"
] |
[
"Nabi",
"Musa",
"a.s",
"."
] | [
1,
2,
0,
0
] | [
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Nabi Musa"
] |
[
"Universitas",
"Niccolò",
"Cusano"
] | [
3,
4,
4
] | [
"ace",
"ace",
"ace"
] | [
"ORG: Universitas Niccolò Cusano"
] |
[
"Lam",
"taman",
"nyoë",
"na",
"meupadum",
"boh",
"taman",
"khusuih",
"lagèe",
"Taman",
"Bambang",
",",
"Taman",
"Rusa",
",",
"Taman",
"Anggrek",
",",
"Taman",
"Bungong",
"Raya",
",",
"ngon",
"Taman",
"Cicém",
"Kuala",
"Lumpur",
"(",
"taman",
"cicém",
"nyang",
"paléng",
"rayeuek",
"di",
"Asia",
"Teunggara",
")",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
3,
4,
4,
4,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Taman Cicém Kuala Lumpur"
] |
[
"Bahsa",
"Arab"
] | [
3,
4
] | [
"ace",
"ace"
] | [
"ORG: Bahsa Arab"
] |
[
"'",
"''",
"Budapèst",
"''",
"'",
"nakeuh",
"nang",
"nanggroe",
"Hongaria",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
5,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Hongaria"
] |
[
"'",
"''",
"Qinghai",
"''",
"'",
"nakeuh",
"saboh",
"propinsi",
"di",
"Rèpublik",
"Rakyat",
"Cina",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
6,
6,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Rèpublik Rakyat Cina"
] |
[
"Kheun",
"lé",
"gata",
"(",
"hai",
"Muhammad",
")",
",",
"``",
"Gobnyankeuh",
"Tuhan",
"nyang",
"saboh",
"(",
"sidroe",
")",
"."
] | [
0,
0,
0,
0,
0,
1,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Muhammad"
] |
[
"Surat",
"Al-Lahab"
] | [
3,
4
] | [
"ace",
"ace"
] | [
"ORG: Surat Al-Lahab"
] |
[
"Antigua",
"ngon",
"Barbuda",
"''",
"(",
"Ratu",
"Elizabeth",
"II",
")",
"''"
] | [
5,
6,
6,
0,
0,
0,
1,
2,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Antigua ngon Barbuda",
"PER: Elizabeth II"
] |
[
"Mana",
"Ashida",
"seubagoe",
"Miu",
"Kinoshita"
] | [
1,
2,
0,
0,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"PER: Mana Ashida"
] |
[
"Laos"
] | [
5
] | [
"ace"
] | [
"LOC: Laos"
] |
[
"'",
"''",
"Pyeongyang",
"''",
"'",
"nakeuh",
"nang",
"nanggroë",
"di",
"Korèa",
"Utara",
"."
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
5,
6,
0
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"LOC: Korèa Utara"
] |
[
"'",
"''",
"Naypyidaw",
"''",
"'",
"nakeuh",
"nang",
"nanggroe",
"di",
"Myanmar"
] | [
0,
0,
0,
0,
0,
0,
0,
0,
0,
3
] | [
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace",
"ace"
] | [
"ORG: Myanmar"
] |
Dataset Card for WikiANN
Dataset Summary
WikiANN (sometimes called PAN-X) is a multilingual named entity recognition dataset consisting of Wikipedia articles annotated with LOC (location), PER (person), and ORG (organisation) tags in the IOB2 format. This version corresponds to the balanced train, dev, and test splits of Rahimi et al. (2019), which supports 176 of the 282 languages from the original WikiANN corpus.
Supported Tasks and Leaderboards
named-entity-recognition: The dataset can be used to train a model for named entity recognition in many languages, or evaluate the zero-shot cross-lingual capabilities of multilingual models.
Languages
The dataset contains 176 languages, one in each of the configuration subsets. The corresponding BCP 47 language tags are:
Configuration | Language tag |
---|---|
ace | ace |
af | af |
als | als |
am | am |
an | an |
ang | ang |
ar | ar |
arc | arc |
arz | arz |
as | as |
ast | ast |
ay | ay |
az | az |
ba | ba |
bar | bar |
be | be |
bg | bg |
bh | bh |
bn | bn |
bo | bo |
br | br |
bs | bs |
ca | ca |
cdo | cdo |
ce | ce |
ceb | ceb |
ckb | ckb |
co | co |
crh | crh |
cs | cs |
csb | csb |
cv | cv |
cy | cy |
da | da |
de | de |
diq | diq |
dv | dv |
el | el |
en | en |
eo | eo |
es | es |
et | et |
eu | eu |
ext | ext |
fa | fa |
fi | fi |
fo | fo |
fr | fr |
frr | frr |
fur | fur |
fy | fy |
ga | ga |
gan | gan |
gd | gd |
gl | gl |
gn | gn |
gu | gu |
hak | hak |
he | he |
hi | hi |
hr | hr |
hsb | hsb |
hu | hu |
hy | hy |
ia | ia |
id | id |
ig | ig |
ilo | ilo |
io | io |
is | is |
it | it |
ja | ja |
jbo | jbo |
jv | jv |
ka | ka |
kk | kk |
km | km |
kn | kn |
ko | ko |
ksh | ksh |
ku | ku |
ky | ky |
la | la |
lb | lb |
li | li |
lij | lij |
lmo | lmo |
ln | ln |
lt | lt |
lv | lv |
mg | mg |
mhr | mhr |
mi | mi |
min | min |
mk | mk |
ml | ml |
mn | mn |
mr | mr |
ms | ms |
mt | mt |
mwl | mwl |
my | my |
mzn | mzn |
nap | nap |
nds | nds |
ne | ne |
nl | nl |
nn | nn |
no | no |
nov | nov |
oc | oc |
or | or |
os | os |
other-bat-smg | sgs |
other-be-x-old | be-tarask |
other-cbk-zam | cbk |
other-eml | eml |
other-fiu-vro | vro |
other-map-bms | jv-x-bms |
other-simple | en-basiceng |
other-zh-classical | lzh |
other-zh-min-nan | nan |
other-zh-yue | yue |
pa | pa |
pdc | pdc |
pl | pl |
pms | pms |
pnb | pnb |
ps | ps |
pt | pt |
qu | qu |
rm | rm |
ro | ro |
ru | ru |
rw | rw |
sa | sa |
sah | sah |
scn | scn |
sco | sco |
sd | sd |
sh | sh |
si | si |
sk | sk |
sl | sl |
so | so |
sq | sq |
sr | sr |
su | su |
sv | sv |
sw | sw |
szl | szl |
ta | ta |
te | te |
tg | tg |
th | th |
tk | tk |
tl | tl |
tr | tr |
tt | tt |
ug | ug |
uk | uk |
ur | ur |
uz | uz |
vec | vec |
vep | vep |
vi | vi |
vls | vls |
vo | vo |
wa | wa |
war | war |
wuu | wuu |
xmf | xmf |
yi | yi |
yo | yo |
zea | zea |
zh | zh |
Dataset Structure
Data Instances
This is an example in the "train" split of the "af" (Afrikaans language) configuration subset:
{
'tokens': ['Sy', 'ander', 'seun', ',', 'Swjatopolk', ',', 'was', 'die', 'resultaat', 'van', '’n', 'buite-egtelike', 'verhouding', '.'],
'ner_tags': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'langs': ['af', 'af', 'af', 'af', 'af', 'af', 'af', 'af', 'af', 'af', 'af', 'af', 'af', 'af'],
'spans': ['PER: Swjatopolk']
}
Data Fields
- `tokens`: a `list` of `string` features.
- `langs`: a `list` of `string` features that correspond to the language of each token.
- `ner_tags`: a `list` of classification labels, with possible values including `O` (0), `B-PER` (1), `I-PER` (2), `B-ORG` (3), `I-ORG` (4), `B-LOC` (5), `I-LOC` (6).
- `spans`: a `list` of `string` features, that is the list of named entities in the input text formatted as `<TAG>: <mention>`
Data Splits
For each configuration subset, the data is split into "train", "validation" and "test" sets, each containing the following number of examples:
Configuration | Train | Validation | Test |
---|---|---|---|
ace | 100 | 100 | 100 |
af | 5000 | 1000 | 1000 |
als | 100 | 100 | 100 |
am | 100 | 100 | 100 |
an | 1000 | 1000 | 1000 |
ang | 100 | 100 | 100 |
ar | 20000 | 10000 | 10000 |
arc | 100 | 100 | 100 |
arz | 100 | 100 | 100 |
as | 100 | 100 | 100 |
ast | 1000 | 1000 | 1000 |
ay | 100 | 100 | 100 |
az | 10000 | 1000 | 1000 |
ba | 100 | 100 | 100 |
bar | 100 | 100 | 100 |
bat-smg | 100 | 100 | 100 |
be | 15000 | 1000 | 1000 |
be-x-old | 5000 | 1000 | 1000 |
bg | 20000 | 10000 | 10000 |
bh | 100 | 100 | 100 |
bn | 10000 | 1000 | 1000 |
bo | 100 | 100 | 100 |
br | 1000 | 1000 | 1000 |
bs | 15000 | 1000 | 1000 |
ca | 20000 | 10000 | 10000 |
cbk-zam | 100 | 100 | 100 |
cdo | 100 | 100 | 100 |
ce | 100 | 100 | 100 |
ceb | 100 | 100 | 100 |
ckb | 1000 | 1000 | 1000 |
co | 100 | 100 | 100 |
crh | 100 | 100 | 100 |
cs | 20000 | 10000 | 10000 |
csb | 100 | 100 | 100 |
cv | 100 | 100 | 100 |
cy | 10000 | 1000 | 1000 |
da | 20000 | 10000 | 10000 |
de | 20000 | 10000 | 10000 |
diq | 100 | 100 | 100 |
dv | 100 | 100 | 100 |
el | 20000 | 10000 | 10000 |
eml | 100 | 100 | 100 |
en | 20000 | 10000 | 10000 |
eo | 15000 | 10000 | 10000 |
es | 20000 | 10000 | 10000 |
et | 15000 | 10000 | 10000 |
eu | 10000 | 10000 | 10000 |
ext | 100 | 100 | 100 |
fa | 20000 | 10000 | 10000 |
fi | 20000 | 10000 | 10000 |
fiu-vro | 100 | 100 | 100 |
fo | 100 | 100 | 100 |
fr | 20000 | 10000 | 10000 |
frr | 100 | 100 | 100 |
fur | 100 | 100 | 100 |
fy | 1000 | 1000 | 1000 |
ga | 1000 | 1000 | 1000 |
gan | 100 | 100 | 100 |
gd | 100 | 100 | 100 |
gl | 15000 | 10000 | 10000 |
gn | 100 | 100 | 100 |
gu | 100 | 100 | 100 |
hak | 100 | 100 | 100 |
he | 20000 | 10000 | 10000 |
hi | 5000 | 1000 | 1000 |
hr | 20000 | 10000 | 10000 |
hsb | 100 | 100 | 100 |
hu | 20000 | 10000 | 10000 |
hy | 15000 | 1000 | 1000 |
ia | 100 | 100 | 100 |
id | 20000 | 10000 | 10000 |
ig | 100 | 100 | 100 |
ilo | 100 | 100 | 100 |
io | 100 | 100 | 100 |
is | 1000 | 1000 | 1000 |
it | 20000 | 10000 | 10000 |
ja | 20000 | 10000 | 10000 |
jbo | 100 | 100 | 100 |
jv | 100 | 100 | 100 |
ka | 10000 | 10000 | 10000 |
kk | 1000 | 1000 | 1000 |
km | 100 | 100 | 100 |
kn | 100 | 100 | 100 |
ko | 20000 | 10000 | 10000 |
ksh | 100 | 100 | 100 |
ku | 100 | 100 | 100 |
ky | 100 | 100 | 100 |
la | 5000 | 1000 | 1000 |
lb | 5000 | 1000 | 1000 |
li | 100 | 100 | 100 |
lij | 100 | 100 | 100 |
lmo | 100 | 100 | 100 |
ln | 100 | 100 | 100 |
lt | 10000 | 10000 | 10000 |
lv | 10000 | 10000 | 10000 |
map-bms | 100 | 100 | 100 |
mg | 100 | 100 | 100 |
mhr | 100 | 100 | 100 |
mi | 100 | 100 | 100 |
min | 100 | 100 | 100 |
mk | 10000 | 1000 | 1000 |
ml | 10000 | 1000 | 1000 |
mn | 100 | 100 | 100 |
mr | 5000 | 1000 | 1000 |
ms | 20000 | 1000 | 1000 |
mt | 100 | 100 | 100 |
mwl | 100 | 100 | 100 |
my | 100 | 100 | 100 |
mzn | 100 | 100 | 100 |
nap | 100 | 100 | 100 |
nds | 100 | 100 | 100 |
ne | 100 | 100 | 100 |
nl | 20000 | 10000 | 10000 |
nn | 20000 | 1000 | 1000 |
no | 20000 | 10000 | 10000 |
nov | 100 | 100 | 100 |
oc | 100 | 100 | 100 |
or | 100 | 100 | 100 |
os | 100 | 100 | 100 |
pa | 100 | 100 | 100 |
pdc | 100 | 100 | 100 |
pl | 20000 | 10000 | 10000 |
pms | 100 | 100 | 100 |
pnb | 100 | 100 | 100 |
ps | 100 | 100 | 100 |
pt | 20000 | 10000 | 10000 |
qu | 100 | 100 | 100 |
rm | 100 | 100 | 100 |
ro | 20000 | 10000 | 10000 |
ru | 20000 | 10000 | 10000 |
rw | 100 | 100 | 100 |
sa | 100 | 100 | 100 |
sah | 100 | 100 | 100 |
scn | 100 | 100 | 100 |
sco | 100 | 100 | 100 |
sd | 100 | 100 | 100 |
sh | 20000 | 10000 | 10000 |
si | 100 | 100 | 100 |
simple | 20000 | 1000 | 1000 |
sk | 20000 | 10000 | 10000 |
sl | 15000 | 10000 | 10000 |
so | 100 | 100 | 100 |
sq | 5000 | 1000 | 1000 |
sr | 20000 | 10000 | 10000 |
su | 100 | 100 | 100 |
sv | 20000 | 10000 | 10000 |
sw | 1000 | 1000 | 1000 |
szl | 100 | 100 | 100 |
ta | 15000 | 1000 | 1000 |
te | 1000 | 1000 | 1000 |
tg | 100 | 100 | 100 |
th | 20000 | 10000 | 10000 |
tk | 100 | 100 | 100 |
tl | 10000 | 1000 | 1000 |
tr | 20000 | 10000 | 10000 |
tt | 1000 | 1000 | 1000 |
ug | 100 | 100 | 100 |
uk | 20000 | 10000 | 10000 |
ur | 20000 | 1000 | 1000 |
uz | 1000 | 1000 | 1000 |
vec | 100 | 100 | 100 |
vep | 100 | 100 | 100 |
vi | 20000 | 10000 | 10000 |
vls | 100 | 100 | 100 |
vo | 100 | 100 | 100 |
wa | 100 | 100 | 100 |
war | 100 | 100 | 100 |
wuu | 100 | 100 | 100 |
xmf | 100 | 100 | 100 |
yi | 100 | 100 | 100 |
yo | 100 | 100 | 100 |
zea | 100 | 100 | 100 |
zh | 20000 | 10000 | 10000 |
zh-classical | 100 | 100 | 100 |
zh-min-nan | 100 | 100 | 100 |
zh-yue | 20000 | 10000 | 10000 |
Dataset Creation
Curation Rationale
[More Information Needed]
Source Data
Initial Data Collection and Normalization
[More Information Needed]
Who are the source language producers?
[More Information Needed]
Annotations
Annotation process
[More Information Needed]
Who are the annotators?
[More Information Needed]
Personal and Sensitive Information
[More Information Needed]
Considerations for Using the Data
Social Impact of Dataset
[More Information Needed]
Discussion of Biases
[More Information Needed]
Other Known Limitations
[More Information Needed]
Additional Information
Dataset Curators
[More Information Needed]
Licensing Information
[More Information Needed]
Citation Information
The original 282 datasets are associated with this article:
@inproceedings{pan-etal-2017-cross,
title = "Cross-lingual Name Tagging and Linking for 282 Languages",
author = "Pan, Xiaoman and
Zhang, Boliang and
May, Jonathan and
Nothman, Joel and
Knight, Kevin and
Ji, Heng",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1178",
doi = "10.18653/v1/P17-1178",
pages = "1946--1958",
abstract = "The ambitious goal of this work is to develop a cross-lingual name tagging and linking framework for 282 languages that exist in Wikipedia. Given a document in any of these languages, our framework is able to identify name mentions, assign a coarse-grained or fine-grained type to each mention, and link it to an English Knowledge Base (KB) if it is linkable. We achieve this goal by performing a series of new KB mining methods: generating {``}silver-standard{''} annotations by transferring annotations from English to other languages through cross-lingual links and KB properties, refining annotations through self-training and topic selection, deriving language-specific morphology features from anchor links, and mining word translation pairs from cross-lingual links. Both name tagging and linking results for 282 languages are promising on Wikipedia data and non-Wikipedia data.",
}
while the 176 languages supported in this version are associated with the following article:
@inproceedings{rahimi-etal-2019-massively,
title = "Massively Multilingual Transfer for {NER}",
author = "Rahimi, Afshin and
Li, Yuan and
Cohn, Trevor",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1015",
pages = "151--164",
}
Contributions
- Downloads last month
- 254,485