Automatically add EOS via Tokenizer, integrate Sentence Transformers (#1)

- Automatically add EOS via Tokenizer, integrate Sentence Transformers (fd17b9cd89d6cc5b416d4b66ea25da0bea7f2bb0)
- Remove eod_id line from README (7bd6fbe3c54b9ec2b4b1cc3a052720a76fcf0d90)
This commit is contained in:
ai-modelscope 2025-06-07 00:12:29 +08:00
parent 9b8853c96a
commit 807d9e22a8
6 changed files with 95 additions and 15 deletions

3
.gitattributes vendored
View File

@@ -45,5 +45,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text

10
1_Pooling/config.json Normal file
View File

@@ -0,0 +1,10 @@
{
"word_embedding_dimension": 4096,
"pooling_mode_cls_token": false,
"pooling_mode_mean_tokens": false,
"pooling_mode_max_tokens": false,
"pooling_mode_mean_sqrt_len_tokens": false,
"pooling_mode_weightedmean_tokens": false,
"pooling_mode_lasttoken": true,
"include_prompt": true
}

View File

@@ -2,7 +2,11 @@
license: apache-2.0
base_model:
- Qwen/Qwen3-8B-Base
library_name: transformers tags:
- transformers
- sentence-transformers
- sentence-similarity
- feature-extraction
---
# Qwen3-Embedding-8B
@@ -53,6 +57,47 @@ With Transformers versions earlier than 4.51.0, you may encounter the following
KeyError: 'qwen3'
```
### Sentence Transformers Usage
```python
# Requires transformers>=4.51.0
from sentence_transformers import SentenceTransformer
# Load the model
model = SentenceTransformer("Qwen/Qwen3-Embedding-8B")
# We recommend enabling flash_attention_2 for better acceleration and memory saving,
# together with setting `padding_side` to "left":
# model = SentenceTransformer(
# "Qwen/Qwen3-Embedding-8B",
# model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
# tokenizer_kwargs={"padding_side": "left"},
# )
# The queries and documents to embed
queries = [
"What is the capital of China?",
"Explain gravity",
]
documents = [
"The capital of China is Beijing.",
"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]
# Encode the queries and documents. Note that queries benefit from using a prompt
# Here we use the prompt called "query" stored under `model.prompts`, but you can
# also pass your own prompt via the `prompt` argument
query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)
# Compute the (cosine) similarity between the query and document embeddings
similarity = model.similarity(query_embeddings, document_embeddings)
print(similarity)
# tensor([[0.7493, 0.0751],
# [0.0880, 0.6318]])
```
### Transformers Usage
```python
@@ -79,14 +124,6 @@ def last_token_pool(last_hidden_states: Tensor,
def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery:{query}'
def tokenize(tokenizer, input_texts, eod_id, max_length):
batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length-2)
for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]):
seq.append(eod_id)
att.append(1)
batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt")
return batch_dict
# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'
@@ -107,11 +144,16 @@ model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-8B')
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-8B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
eod_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
max_length = 8192
# Tokenize the input texts
batch_dict = tokenize(tokenizer, input_texts, eod_id, max_length) batch_dict = tokenizer(
input_texts,
padding=True,
truncation=True,
max_length=max_length,
return_tensors="pt",
)
batch_dict.to(model.device)
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
@@ -120,6 +162,7 @@ embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:2] @ embeddings[2:].T)
print(scores.tolist())
# [[0.7493016123771667, 0.0750647559762001], [0.08795969933271408, 0.6318399906158447]]
```
📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. Our tests have shown that in most retrieval scenarios, not using an `instruct` on the query side can lead to a drop in retrieval performance by approximately 1% to 5%.

View File

@@ -0,0 +1,8 @@
{
"prompts": {
"query": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:",
"document": ""
},
"default_prompt_name": null,
"similarity_fn_name": "cosine"
}

20
modules.json Normal file
View File

@@ -0,0 +1,20 @@
[
{
"idx": 0,
"name": "0",
"path": "",
"type": "sentence_transformers.models.Transformer"
},
{
"idx": 1,
"name": "1",
"path": "1_Pooling",
"type": "sentence_transformers.models.Pooling"
},
{
"idx": 2,
"name": "2",
"path": "2_Normalize",
"type": "sentence_transformers.models.Normalize"
}
]

BIN
tokenizer.json (Stored with Git LFS)

Binary file not shown.