diff --git a/tokenizer.json b/tokenizer.json
index 5998042..08c3696 100644
--- a/tokenizer.json
+++ b/tokenizer.json
@@ -21,15 +21,6 @@
"rstrip": false,
"normalized": false
},
- {
- "id": 2,
- "special": true,
- "content": "[SPACE]",
- "single_word": false,
- "lstrip": false,
- "rstrip": false,
- "normalized": true
- },
{
"id": 255,
"special": true,
@@ -1064,69 +1055,199 @@
"rstrip": false,
"normalized": false,
"special": true
+ },
+ {
+ "id": 6561,
+ "content": "[START_SPEECH]",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 6562,
+ "content": "[STOP_SPEECH]",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 6563,
+ "content": "[EXAGGERATION]",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
}
],
"normalizer": {
- "type": "Sequence",
- "normalizers": [
- {
- "type": "Replace",
- "pattern": { "String": " " },
- "content": "[SPACE]"
- }
- ]
- },
- "pre_tokenizer": {
- "type": "Whitespace"
+ "type": "Replace",
+ "pattern": {
+ "Regex": "\\s+"
+ },
+ "content": " "
},
+ "pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 0 } },
- { "SpecialToken": { "id": "BOS", "type_id": 0 } },
- { "Sequence": { "id": "A", "type_id": 0 } },
- { "SpecialToken": { "id": "EOS", "type_id": 0 } },
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } }
+ {
+ "SpecialToken": {
+ "id": "[EXAGGERATION]",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[START]",
+ "type_id": 0
+ }
+ },
+ {
+ "Sequence": {
+ "id": "A",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[STOP]",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[START_SPEECH]",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[START_SPEECH]",
+ "type_id": 0
+ }
+ }
],
"pair": [
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 0 } },
- { "SpecialToken": { "id": "BOS", "type_id": 0 } },
- { "Sequence": { "id": "A", "type_id": 0 } },
- { "SpecialToken": { "id": "EOS", "type_id": 0 } },
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 1 } },
- { "SpecialToken": { "id": "BOS", "type_id": 1 } },
- { "Sequence": { "id": "B", "type_id": 1 } },
- { "SpecialToken": { "id": "EOS", "type_id": 1 } },
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 1 } },
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 1 } }
+ {
+ "SpecialToken": {
+ "id": "[EXAGGERATION]",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[START]",
+ "type_id": 0
+ }
+ },
+ {
+ "Sequence": {
+ "id": "A",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[STOP]",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[START_SPEECH]",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[START_SPEECH]",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[EXAGGERATION]",
+ "type_id": 1
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[START]",
+ "type_id": 1
+ }
+ },
+ {
+ "Sequence": {
+ "id": "B",
+ "type_id": 1
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[STOP]",
+ "type_id": 1
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[START_SPEECH]",
+ "type_id": 1
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[START_SPEECH]",
+ "type_id": 1
+ }
+ }
],
"special_tokens": {
- "BOS": {
- "id": "BOS",
- "ids": [255],
- "tokens": [""]
+ "[START]": {
+ "id": "[START]",
+ "ids": [
+ 255
+ ],
+ "tokens": [
+ "[START]"
+ ]
},
- "EOS": {
- "id": "EOS",
- "ids": [0],
- "tokens": [""]
+ "[STOP]": {
+ "id": "[STOP]",
+ "ids": [
+ 0
+ ],
+ "tokens": [
+ "[STOP]"
+ ]
},
- "EXAGGERATION": {
- "id": "EXAGGERATION",
- "ids": [6563],
- "tokens": [""]
+ "[EXAGGERATION]": {
+ "id": "[EXAGGERATION]",
+ "ids": [
+ 6563
+ ],
+ "tokens": [
+ "[EXAGGERATION]"
+ ]
},
- "START_SPEECH": {
- "id": "START_SPEECH",
- "ids": [6561],
- "tokens": [""]
+ "[START_SPEECH]": {
+ "id": "[START_SPEECH]",
+ "ids": [
+ 6561
+ ],
+ "tokens": [
+ "[START_SPEECH]"
+ ]
}
}
},
- "decoder": null,
+ "decoder": {
+ "type": "Fuse"
+ },
"model": {
"type": "BPE",
"dropout": null,
@@ -3486,8 +3607,110 @@
"tch": 2348,
"sch": 2349,
"🙊": 2350,
- "🤭": 2351
- },
+ "🤭": 2351,
+ "€": 2352,
+ "أ": 2353,
+ "إ": 2354,
+ "ئ": 2355,
+ "آ": 2356,
+ "ؤ": 2357,
+ "ﻻ": 2358,
+ "ﺃ": 2359,
+ "ę": 2360,
+ "ą": 2361,
+ "ż": 2362,
+ "ś": 2363,
+ "ć": 2364,
+ "ń": 2365,
+ "ź": 2366,
+ "Ś": 2367,
+ "Ź": 2368,
+ "Ż": 2369,
+ "Ć": 2370,
+ "Š": 2371,
+ "Ő": 2372,
+ "й": 2373,
+ "ё": 2374,
+ "Й": 2375,
+ "Ё": 2376,
+ "が": 2377,
+ "で": 2378,
+ "じ": 2379,
+ "だ": 2380,
+ "ど": 2381,
+ "ば": 2382,
+ "げ": 2383,
+ "ご": 2384,
+ "ぶ": 2385,
+ "ぎ": 2386,
+ ",": 2387,
+ "(": 2388,
+ ":": 2389,
+ ";": 2390,
+ "?": 2391,
+ "!": 2392,
+ "#": 2393,
+ " )": 2394,
+ "ά": 2395,
+ "ό": 2396,
+ "ί": 2397,
+ "έ": 2398,
+ "ή": 2399,
+ "ύ": 2400,
+ "ώ": 2401,
+ "Έ": 2402,
+ "Ό": 2403,
+ "Ή": 2404,
+ "ž": 2405,
+ "š": 2406,
+ "ū": 2407,
+ "ş": 2408,
+ "Ō": 2409,
+ "ī": 2410,
+ "č": 2411,
+ "ř": 2412,
+ "ă": 2413,
+ "이": 2414,
+ "기": 2415,
+ "요": 2416,
+ "에": 2417,
+ "다": 2418,
+ "을": 2419,
+ "은": 2420,
+ "서": 2421,
+ "니": 2422,
+ "어": 2423,
+ "ě": 2424,
+ "ů": 2425,
+ "Č": 2426,
+ "ň": 2427,
+ "ď": 2428,
+ "ť": 2429,
+ "♭": 2430,
+ "ľ": 2431,
+ "ĺ": 2432,
+ "ğ": 2433,
+ "İ": 2434,
+ "Ş": 2435,
+ "ड़": 2436,
+ "ढ़": 2437,
+ "ज़": 2438,
+ "फ़": 2439,
+ "ख़": 2440,
+ "क़": 2441,
+ "ग़": 2442,
+ "Ά": 2443,
+ "ϊ": 2444,
+ "Ί": 2445,
+ "Ύ": 2446,
+ "Ώ": 2447,
+ "ΐ": 2448,
+ "ϋ": 2449,
+ "ũ": 2450,
+ "ụ": 2451,
+ "ọ": 2452,
+ "ạ": 2453
+ },
"merges": [
"t h",
"i n",