Automatically add EOS via Tokenizer, add Sentence Transformers snippet

#2
by tomaarsen HF Staff - opened
Files changed (2) hide show
  1. README.md +52 -7
  2. tokenizer.json +2 -2
README.md CHANGED
@@ -5,9 +5,11 @@ datasets:
5
  - codefuse-ai/F2LLM
6
  language:
7
  - en
 
 
8
  license: apache-2.0
9
  pipeline_tag: feature-extraction
10
- library_name: transformers
11
  ---
12
 
13
  # F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data
@@ -19,7 +21,38 @@ F2LLMs (Foundation to Feature Large Language Models) are foundation models direc
19
 
20
  ## Usage
21
 
22
- To encode a batch of sentences:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  ```python
25
  from transformers import AutoModel, AutoTokenizer
@@ -31,22 +64,34 @@ model_path = "codefuse-ai/F2LLM-1.7B"
31
  tokenizer = AutoTokenizer.from_pretrained(model_path)
32
  model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map={'': 0})
33
 
34
- sentences = [
 
 
35
  'We present F2LLM, a family of fully open embedding LLMs that achieve a strong balance between model size, training data, and embedding performance.',
36
- 'Model checkpoints, training datasets, and training code are released, positioning F2LLM as a strong, reproducible, and budget-friendly baseline for future research in text embedding models.'
 
37
  ]
38
 
39
  def encode(sentences):
40
  batch_size = len(sentences)
41
- sentences = [s+tokenizer.eos_token for s in sentences]
42
- tokenized_inputs = tokenizer(sentences, padding=True, return_tensors='pt', add_special_tokens=False).to(model.device)
43
  last_hidden_state = model(**tokenized_inputs).last_hidden_state
44
  eos_positions = tokenized_inputs.attention_mask.sum(dim=1) - 1
45
  embeddings = last_hidden_state[torch.arange(batch_size, device=model.device), eos_positions]
46
  embeddings = F.normalize(embeddings, p=2, dim=1)
47
  return embeddings
48
 
49
- embeddings = encode(sentences)
 
 
 
 
 
 
 
 
 
 
50
  ```
51
 
52
  ## Evaluation
 
5
  - codefuse-ai/F2LLM
6
  language:
7
  - en
8
+ tags:
9
+ - transformers
10
  license: apache-2.0
11
  pipeline_tag: feature-extraction
12
+ library_name: sentence-transformers
13
  ---
14
 
15
  # F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data
 
21
 
22
  ## Usage
23
 
24
+ ### With Sentence Transformers
25
+
26
+ To encode text using F2LLM with the [Sentence Transformers](https://www.sbert.net/) library:
27
+
28
+ ```python
29
+ from sentence_transformers import SentenceTransformer
30
+
31
+ model = SentenceTransformer("codefuse-ai/F2LLM-1.7B", model_kwargs={"torch_dtype": "bfloat16"})
32
+
33
+ # Some sample query and documents
34
+ query = "What is F2LLM used for?"
35
+ documents = [
36
+ 'We present F2LLM, a family of fully open embedding LLMs that achieve a strong balance between model size, training data, and embedding performance.',
37
+ 'Model checkpoints, training datasets, and training code are released, positioning F2LLM as a strong, reproducible, and budget-friendly baseline for future research in text embedding models.',
38
+ 'F2LLM is a model for computing text embeddings that can be used for various NLP tasks such as information retrieval, semantic search, and text classification.'
39
+ ]
40
+
41
+ # Encode the query and documents separately; the encode_query method uses the query prompt
42
+ query_embedding = model.encode_query(query)
43
+ document_embeddings = model.encode_document(documents)
44
+ print(query_embedding.shape, document_embeddings.shape)
45
+ # (2048,) (3, 2048)
46
+
47
+ # Compute cosine similarity between the query and documents
48
+ similarity = model.similarity(query_embedding, document_embeddings)
49
+ print(similarity)
50
+ # tensor([[0.5373, 0.6257, 0.8218]])
51
+ ```
52
+
53
+ ### With Transformers
54
+
55
+ Alternatively, use the model directly with the [Transformers](https://huggingface.co/docs/transformers/index) library:
56
 
57
  ```python
58
  from transformers import AutoModel, AutoTokenizer
 
64
  tokenizer = AutoTokenizer.from_pretrained(model_path)
65
  model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map={'': 0})
66
 
67
+ query = "What is F2LLM used for?"
68
+ query_prompt = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:"
69
+ documents = [
70
  'We present F2LLM, a family of fully open embedding LLMs that achieve a strong balance between model size, training data, and embedding performance.',
71
+ 'Model checkpoints, training datasets, and training code are released, positioning F2LLM as a strong, reproducible, and budget-friendly baseline for future research in text embedding models.',
72
+ 'F2LLM is a model for computing text embeddings that can be used for various NLP tasks such as information retrieval, semantic search, and text classification.'
73
  ]
74
 
75
  def encode(sentences):
76
  batch_size = len(sentences)
77
+ tokenized_inputs = tokenizer(sentences, padding=True, return_tensors='pt').to(model.device)
 
78
  last_hidden_state = model(**tokenized_inputs).last_hidden_state
79
  eos_positions = tokenized_inputs.attention_mask.sum(dim=1) - 1
80
  embeddings = last_hidden_state[torch.arange(batch_size, device=model.device), eos_positions]
81
  embeddings = F.normalize(embeddings, p=2, dim=1)
82
  return embeddings
83
 
84
+ # Encode the query and documents
85
+ query_embedding = encode([query_prompt + query])
86
+ document_embeddings = encode(documents)
87
+ print(query_embedding.shape, document_embeddings.shape)
88
+ # torch.Size([1, 2048]) torch.Size([3, 2048])
89
+
90
+ # Compute cosine similarity between the query and documents
91
+ similarity = query_embedding @ document_embeddings.T
92
+ print(similarity)
93
+ # tensor([[0.5391, 0.6250, 0.8242]], device='cuda:0', dtype=torch.bfloat16,
94
+ # grad_fn=<MmBackward0>)
95
  ```
96
 
97
  ## Evaluation
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
- size 11422654
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38360d5a512a43641b36d6fba2df87b8a3f5464c6b5c76f03e82d6d795175566
3
+ size 11423195