From 807d9e22a84f48f65f5e1060faa235008f54216e Mon Sep 17 00:00:00 2001 From: ai-modelscope Date: Sat, 7 Jun 2025 00:12:29 +0800 Subject: [PATCH] Automatically add EOS via Tokenizer, integrate Sentence Transformers (#1) - Automatically add EOS via Tokenizer, integrate Sentence Transformers (fd17b9cd89d6cc5b416d4b66ea25da0bea7f2bb0) - Remove eod_id line from README (7bd6fbe3c54b9ec2b4b1cc3a052720a76fcf0d90) --- .gitattributes | 3 +- 1_Pooling/config.json | 10 +++++ README.md | 65 +++++++++++++++++++++++++------ config_sentence_transformers.json | 8 ++++ modules.json | 20 ++++++++++ tokenizer.json | 4 +- 6 files changed, 95 insertions(+), 15 deletions(-) create mode 100644 1_Pooling/config.json create mode 100644 config_sentence_transformers.json create mode 100644 modules.json diff --git a/.gitattributes b/.gitattributes index 21b3632..f202261 100644 --- a/.gitattributes +++ b/.gitattributes @@ -45,5 +45,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.wasm filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text - -tokenizer.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/1_Pooling/config.json b/1_Pooling/config.json new file mode 100644 index 0000000..c5dd8df --- /dev/null +++ b/1_Pooling/config.json @@ -0,0 +1,10 @@ +{ + "word_embedding_dimension": 4096, + "pooling_mode_cls_token": false, + "pooling_mode_mean_tokens": false, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false, + "pooling_mode_weightedmean_tokens": false, + "pooling_mode_lasttoken": true, + "include_prompt": true +} \ No newline at end of file diff --git a/README.md b/README.md index ab46a16..977e42e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,11 @@ license: apache-2.0 base_model: - Qwen/Qwen3-8B-Base -library_name: transformers +tags: +- transformers +- sentence-transformers +- sentence-similarity +- feature-extraction --- # Qwen3-Embedding-8B @@ -53,6 +57,47 @@ With Transformers versions earlier than 4.51.0, you may encounter the following KeyError: 'qwen3' ``` +### Sentence Transformers Usage + +```python +# Requires transformers>=4.51.0 + +from sentence_transformers import SentenceTransformer + +# Load the model +model = SentenceTransformer("Qwen/Qwen3-Embedding-8B") + +# We recommend enabling flash_attention_2 for better acceleration and memory saving, +# together with setting `padding_side` to "left": +# model = SentenceTransformer( +# "Qwen/Qwen3-Embedding-8B", +# model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"}, +# tokenizer_kwargs={"padding_side": "left"}, +# ) + +# The queries and documents to embed +queries = [ + "What is the capital of China?", + "Explain gravity", +] +documents = [ + "The capital of China is Beijing.", + "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.", +] + +# Encode the queries and documents. Note that queries benefit from using a prompt +# Here we use the prompt called "query" stored under `model.prompts`, but you can +# also pass your own prompt via the `prompt` argument +query_embeddings = model.encode(queries, prompt_name="query") +document_embeddings = model.encode(documents) + +# Compute the (cosine) similarity between the query and document embeddings +similarity = model.similarity(query_embeddings, document_embeddings) +print(similarity) +# tensor([[0.7493, 0.0751], +# [0.0880, 0.6318]]) +``` + ### Transformers Usage ```python @@ -79,14 +124,6 @@ def last_token_pool(last_hidden_states: Tensor, def get_detailed_instruct(task_description: str, query: str) -> str: return f'Instruct: {task_description}\nQuery:{query}' -def tokenize(tokenizer, input_texts, eod_id, max_length): - batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length-2) - for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]): - seq.append(eod_id) - att.append(1) - batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt") - return batch_dict - # Each query must come with a one-sentence instruction that describes the task task = 'Given a web search query, retrieve relevant passages that answer the query' @@ -107,11 +144,16 @@ model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-8B') # We recommend enabling flash_attention_2 for better acceleration and memory saving. # model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-8B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda() -eod_id = tokenizer.convert_tokens_to_ids("<|endoftext|>") max_length = 8192 # Tokenize the input texts -batch_dict = tokenize(tokenizer, input_texts, eod_id, max_length) +batch_dict = tokenizer( + input_texts, + padding=True, + truncation=True, + max_length=max_length, + return_tensors="pt", +) batch_dict.to(model.device) outputs = model(**batch_dict) embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask']) @@ -120,6 +162,7 @@ embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_ma embeddings = F.normalize(embeddings, p=2, dim=1) scores = (embeddings[:2] @ embeddings[2:].T) print(scores.tolist()) +# [[0.7493016123771667, 0.0750647559762001], [0.08795969933271408, 0.6318399906158447]] ``` 📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. Our tests have shown that in most retrieval scenarios, not using an `instruct` on the query side can lead to a drop in retrieval performance by approximately 1% to 5%. diff --git a/config_sentence_transformers.json b/config_sentence_transformers.json new file mode 100644 index 0000000..76aef3a --- /dev/null +++ b/config_sentence_transformers.json @@ -0,0 +1,8 @@ +{ + "prompts": { + "query": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:", + "document": "" + }, + "default_prompt_name": null, + "similarity_fn_name": "cosine" +} \ No newline at end of file diff --git a/modules.json b/modules.json new file mode 100644 index 0000000..952a9b8 --- /dev/null +++ b/modules.json @@ -0,0 +1,20 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + }, + { + "idx": 2, + "name": "2", + "path": "2_Normalize", + "type": "sentence_transformers.models.Normalize" + } +] \ No newline at end of file diff --git a/tokenizer.json b/tokenizer.json index 51ebb3b..982862d 100644 --- a/tokenizer.json +++ b/tokenizer.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa -size 11421896 +oid sha256:83cdf8c3a34f68862319cb1810ee7b1e2c0a44e0864ae930194ddb76bb7feb8d +size 11422947