From 97323c8c351fcaf88ad27330e765d25024b47eb7 Mon Sep 17 00:00:00 2001 From: Cherrytest Date: Tue, 30 Sep 2025 03:55:57 +0000 Subject: [PATCH] Upload tokenizer.json --- tokenizer.json | 337 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 280 insertions(+), 57 deletions(-) diff --git a/tokenizer.json b/tokenizer.json index 5998042..08c3696 100644 --- a/tokenizer.json +++ b/tokenizer.json @@ -21,15 +21,6 @@ "rstrip": false, "normalized": false }, - { - "id": 2, - "special": true, - "content": "[SPACE]", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": true - }, { "id": 255, "special": true, @@ -1064,69 +1055,199 @@ "rstrip": false, "normalized": false, "special": true + }, + { + "id": 6561, + "content": "[START_SPEECH]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6562, + "content": "[STOP_SPEECH]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6563, + "content": "[EXAGGERATION]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true } ], "normalizer": { - "type": "Sequence", - "normalizers": [ - { - "type": "Replace", - "pattern": { "String": " " }, - "content": "[SPACE]" - } - ] - }, - "pre_tokenizer": { - "type": "Whitespace" + "type": "Replace", + "pattern": { + "Regex": "\\s+" + }, + "content": " " }, + "pre_tokenizer": null, "post_processor": { "type": "TemplateProcessing", "single": [ - { "SpecialToken": { "id": "EXAGGERATION", "type_id": 0 } }, - { "SpecialToken": { "id": "BOS", "type_id": 0 } }, - { "Sequence": { "id": "A", "type_id": 0 } }, - { "SpecialToken": { "id": "EOS", "type_id": 0 } }, - { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } }, - { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } } + { + "SpecialToken": { + "id": "[EXAGGERATION]", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[START]", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[STOP]", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[START_SPEECH]", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[START_SPEECH]", + "type_id": 0 + } + } ], "pair": [ - { "SpecialToken": { "id": "EXAGGERATION", "type_id": 0 } }, - { "SpecialToken": { "id": "BOS", "type_id": 0 } }, - { "Sequence": { "id": "A", "type_id": 0 } }, - { "SpecialToken": { "id": "EOS", "type_id": 0 } }, - { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } }, - { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } }, - { "SpecialToken": { "id": "EXAGGERATION", "type_id": 1 } }, - { "SpecialToken": { "id": "BOS", "type_id": 1 } }, - { "Sequence": { "id": "B", "type_id": 1 } }, - { "SpecialToken": { "id": "EOS", "type_id": 1 } }, - { "SpecialToken": { "id": "START_SPEECH", "type_id": 1 } }, - { "SpecialToken": { "id": "START_SPEECH", "type_id": 1 } } + { + "SpecialToken": { + "id": "[EXAGGERATION]", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[START]", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[STOP]", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[START_SPEECH]", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[START_SPEECH]", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[EXAGGERATION]", + "type_id": 1 + } + }, + { + "SpecialToken": { + "id": "[START]", + "type_id": 1 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + }, + { + "SpecialToken": { + "id": "[STOP]", + "type_id": 1 + } + }, + { + "SpecialToken": { + "id": "[START_SPEECH]", + "type_id": 1 + } + }, + { + "SpecialToken": { + "id": "[START_SPEECH]", + "type_id": 1 + } + } ], "special_tokens": { - "BOS": { - "id": "BOS", - "ids": [255], - "tokens": [""] + "[START]": { + "id": "[START]", + "ids": [ + 255 + ], + "tokens": [ + "[START]" + ] }, - "EOS": { - "id": "EOS", - "ids": [0], - "tokens": [""] + "[STOP]": { + "id": "[STOP]", + "ids": [ + 0 + ], + "tokens": [ + "[STOP]" + ] }, - "EXAGGERATION": { - "id": "EXAGGERATION", - "ids": [6563], - "tokens": [""] + "[EXAGGERATION]": { + "id": "[EXAGGERATION]", + "ids": [ + 6563 + ], + "tokens": [ + "[EXAGGERATION]" + ] }, - "START_SPEECH": { - "id": "START_SPEECH", - "ids": [6561], - "tokens": [""] + "[START_SPEECH]": { + "id": "[START_SPEECH]", + "ids": [ + 6561 + ], + "tokens": [ + "[START_SPEECH]" + ] } } }, - "decoder": null, + "decoder": { + "type": "Fuse" + }, "model": { "type": "BPE", "dropout": null, @@ -3486,8 +3607,110 @@ "tch": 2348, "sch": 2349, "🙊": 2350, - "🤭": 2351 - }, + "🤭": 2351, + "€": 2352, + "أ": 2353, + "إ": 2354, + "ئ": 2355, + "آ": 2356, + "ؤ": 2357, + "ﻻ": 2358, + "ﺃ": 2359, + "ę": 2360, + "ą": 2361, + "ż": 2362, + "ś": 2363, + "ć": 2364, + "ń": 2365, + "ź": 2366, + "Ś": 2367, + "Ź": 2368, + "Ż": 2369, + "Ć": 2370, + "Š": 2371, + "Ő": 2372, + "й": 2373, + "ё": 2374, + "Й": 2375, + "Ё": 2376, + "が": 2377, + "で": 2378, + "じ": 2379, + "だ": 2380, + "ど": 2381, + "ば": 2382, + "げ": 2383, + "ご": 2384, + "ぶ": 2385, + "ぎ": 2386, + ",": 2387, + "(": 2388, + ":": 2389, + ";": 2390, + "?": 2391, + "!": 2392, + "#": 2393, + " )": 2394, + "ά": 2395, + "ό": 2396, + "ί": 2397, + "έ": 2398, + "ή": 2399, + "ύ": 2400, + "ώ": 2401, + "Έ": 2402, + "Ό": 2403, + "Ή": 2404, + "ž": 2405, + "š": 2406, + "ū": 2407, + "ş": 2408, + "Ō": 2409, + "ī": 2410, + "č": 2411, + "ř": 2412, + "ă": 2413, + "이": 2414, + "기": 2415, + "요": 2416, + "에": 2417, + "다": 2418, + "을": 2419, + "은": 2420, + "서": 2421, + "니": 2422, + "어": 2423, + "ě": 2424, + "ů": 2425, + "Č": 2426, + "ň": 2427, + "ď": 2428, + "ť": 2429, + "♭": 2430, + "ľ": 2431, + "ĺ": 2432, + "ğ": 2433, + "İ": 2434, + "Ş": 2435, + "ड़": 2436, + "ढ़": 2437, + "ज़": 2438, + "फ़": 2439, + "ख़": 2440, + "क़": 2441, + "ग़": 2442, + "Ά": 2443, + "ϊ": 2444, + "Ί": 2445, + "Ύ": 2446, + "Ώ": 2447, + "ΐ": 2448, + "ϋ": 2449, + "ũ": 2450, + "ụ": 2451, + "ọ": 2452, + "ạ": 2453 + }, "merges": [ "t h", "i n",