灵活的培训和检索晚期互动模型

Pylate是一个建立在句子变压器顶部的库,旨在通过最先进的Colbert模型简化和优化微调,推理和检索。它可以轻松地对单个GPU进行微调,从而为各种硬件设置提供了灵活性。 Pylate还简化了文档的检索,并使您可以加载各种模型,从而使您能够从大多数预训练的语言模型中构建Colbert模型。
您可以使用PIP安装甲酸:
pip install pylate对于评估依赖性,请使用:
pip install " pylate[eval] " 完整的文档可在此处获得,其中包括深入指南,示例和API参考。
这是使用Pylate在MS MARCO数据集三重态数据集上训练Colbert模型的一个简单示例。该脚本表明培训具有对比性损失,并在持有的评估集中评估了模型:
import torch
from datasets import load_dataset
from sentence_transformers import (
SentenceTransformerTrainer ,
SentenceTransformerTrainingArguments ,
)
from pylate import evaluation , losses , models , utils
# Define model parameters for contrastive training
model_name = "bert-base-uncased" # Choose the pre-trained model you want to use as base
batch_size = 32 # Larger batch size often improves results, but requires more memory
num_train_epochs = 1 # Adjust based on your requirements
# Set the run name for logging and output directory
run_name = "contrastive-bert-base-uncased"
output_dir = f"output/ { run_name } "
# 1. Here we define our ColBERT model. If not a ColBERT model, will add a linear layer to the base encoder.
model = models . ColBERT ( model_name_or_path = model_name )
# Compiling the model makes the training faster
model = torch . compile ( model )
# Load dataset
dataset = load_dataset ( "sentence-transformers/msmarco-bm25" , "triplet" , split = "train" )
# Split the dataset (this dataset does not have a validation set, so we split the training set)
splits = dataset . train_test_split ( test_size = 0.01 )
train_dataset = splits [ "train" ]
eval_dataset = splits [ "test" ]
# Define the loss function
train_loss = losses . Contrastive ( model = model )
# Initialize the evaluator
dev_evaluator = evaluation . ColBERTTripletEvaluator (
anchors = eval_dataset [ "query" ],
positives = eval_dataset [ "positive" ],
negatives = eval_dataset [ "negative" ],
)
# Configure the training arguments (e.g., batch size, evaluation strategy, logging steps)
args = SentenceTransformerTrainingArguments (
output_dir = output_dir ,
num_train_epochs = num_train_epochs ,
per_device_train_batch_size = batch_size ,
per_device_eval_batch_size = batch_size ,
fp16 = True , # Set to False if you get an error that your GPU can't run on FP16
bf16 = False , # Set to True if you have a GPU that supports BF16
run_name = run_name , # Will be used in W&B if `wandb` is installed
learning_rate = 3e-6 ,
)
# Initialize the trainer for the contrastive training
trainer = SentenceTransformerTrainer (
model = model ,
args = args ,
train_dataset = train_dataset ,
eval_dataset = eval_dataset ,
loss = train_loss ,
evaluator = dev_evaluator ,
data_collator = utils . ColBERTCollator ( model . tokenize ),
)
# Start the training process
trainer . train ()训练后,可以使用输出目录路径加载模型:
from pylate import models
model = models . ColBERT ( model_name_or_path = "contrastive-bert-base-uncased" )为了在训练Colbert模型时获得最佳性能,您应该使用知识蒸馏使用强大的教师模型的分数来训练模型。这是一个简单的示例,说明如何使用MSCO上的佩莱特(Pylate)中的知识蒸馏训练模型:
import torch
from datasets import load_dataset
from sentence_transformers import (
SentenceTransformerTrainer ,
SentenceTransformerTrainingArguments ,
)
from pylate import losses , models , utils
# Load the datasets required for knowledge distillation (train, queries, documents)
train = load_dataset (
path = "lightonai/ms-marco-en-bge" ,
name = "train" ,
)
queries = load_dataset (
path = "lightonai/ms-marco-en-bge" ,
name = "queries" ,
)
documents = load_dataset (
path = "lightonai/ms-marco-en-bge" ,
name = "documents" ,
)
# Set the transformation to load the documents/queries texts using the corresponding ids on the fly
train . set_transform (
utils . KDProcessing ( queries = queries , documents = documents ). transform ,
)
# Define the base model, training parameters, and output directory
model_name = "bert-base-uncased" # Choose the pre-trained model you want to use as base
batch_size = 16
num_train_epochs = 1
# Set the run name for logging and output directory
run_name = "knowledge-distillation-bert-base"
output_dir = f"output/ { run_name } "
# Initialize the ColBERT model from the base model
model = models . ColBERT ( model_name_or_path = model_name )
# Compiling the model to make the training faster
model = torch . compile ( model )
# Configure the training arguments (e.g., epochs, batch size, learning rate)
args = SentenceTransformerTrainingArguments (
output_dir = output_dir ,
num_train_epochs = num_train_epochs ,
per_device_train_batch_size = batch_size ,
fp16 = True , # Set to False if you get an error that your GPU can't run on FP16
bf16 = False , # Set to True if you have a GPU that supports BF16
run_name = run_name ,
learning_rate = 1e-5 ,
)
# Use the Distillation loss function for training
train_loss = losses . Distillation ( model = model )
# Initialize the trainer
trainer = SentenceTransformerTrainer (
model = model ,
args = args ,
train_dataset = train ,
loss = train_loss ,
data_collator = utils . ColBERTCollator ( tokenize_fn = model . tokenize ),
)
# Start the training process
trainer . train ()佩蒂特支持拥抱面部数据集,从而实现基于基于知识蒸馏的无缝三胞胎 /知识。对于对比度培训,您可以使用任何现有的句子变形金刚三重态数据集。以下是创建自定义三重态数据集以进行培训的示例:
from datasets import Dataset
dataset = [
{
"query" : "example query 1" ,
"positive" : "example positive document 1" ,
"negative" : "example negative document 1" ,
},
{
"query" : "example query 2" ,
"positive" : "example positive document 2" ,
"negative" : "example negative document 2" ,
},
{
"query" : "example query 3" ,
"positive" : "example positive document 3" ,
"negative" : "example negative document 3" ,
},
]
dataset = Dataset . from_list ( mapping = dataset )
train_dataset , test_dataset = dataset . train_test_split ( test_size = 0.3 )要创建知识蒸馏数据集,您可以使用以下片段:
from datasets import Dataset
dataset = [
{
"query_id" : 54528 ,
"document_ids" : [
6862419 ,
335116 ,
339186 ,
],
"scores" : [
0.4546215673141326 ,
0.6575686537173476 ,
0.26825184192900203 ,
],
},
{
"query_id" : 749480 ,
"document_ids" : [
6862419 ,
335116 ,
339186 ,
],
"scores" : [
0.2546215673141326 ,
0.7575686537173476 ,
0.96825184192900203 ,
],
},
]
dataset = Dataset . from_list ( mapping = dataset )
documents = [
{ "document_id" : 6862419 , "text" : "example doc 1" },
{ "document_id" : 335116 , "text" : "example doc 2" },
{ "document_id" : 339186 , "text" : "example doc 3" },
]
queries = [
{ "query_id" : 749480 , "text" : "example query" },
]
documents = Dataset . from_list ( mapping = documents )
queries = Dataset . from_list ( mapping = queries )Pylate允许使用训练有素的Colbert模型和Voyager索引轻松检索给定查询集的顶级文档,只需加载模型并启动索引:
from pylate import indexes , models , retrieve
model = models . ColBERT (
model_name_or_path = "lightonai/colbertv2.0" ,
)
index = indexes . Voyager (
index_folder = "pylate-index" ,
index_name = "index" ,
override = True ,
)
retriever = retrieve . ColBERT ( index = index )设置模型和索引后,我们可以使用其嵌入和相应的ID将文档添加到索引中:
documents_ids = [ "1" , "2" , "3" ]
documents = [
"document 1 text" , "document 2 text" , "document 3 text"
]
# Encode the documents
documents_embeddings = model . encode (
documents ,
batch_size = 32 ,
is_query = False , # Encoding documents
show_progress_bar = True ,
)
# Add the documents ids and embeddings to the Voyager index
index . add_documents (
documents_ids = documents_ids ,
documents_embeddings = documents_embeddings ,
)然后,我们可以在给定的一组查询中检索Top-K文档:
queries_embeddings = model . encode (
[ "query for document 3" , "query for document 1" ],
batch_size = 32 ,
is_query = True , # Encoding queries
show_progress_bar = True ,
)
scores = retriever . retrieve (
queries_embeddings = queries_embeddings ,
k = 10 ,
)
print ( scores )样本输出:
[
[
{ "id" : "3" , "score" : 11.266985893249512 },
{ "id" : "1" , "score" : 10.303335189819336 },
{ "id" : "2" , "score" : 9.502392768859863 },
],
[
{ "id" : "1" , "score" : 10.88800048828125 },
{ "id" : "3" , "score" : 9.950843811035156 },
{ "id" : "2" , "score" : 9.602447509765625 },
],
]如果您只想使用Colbert模型在不构建索引的情况下在第一阶段检索管道之上执行重读,则可以简单地使用级别功能并将查询和文档传递给Rerank:
from pylate import rank
queries = [
"query A" ,
"query B" ,
]
documents = [
[ "document A" , "document B" ],
[ "document 1" , "document C" , "document B" ],
]
documents_ids = [
[ 1 , 2 ],
[ 1 , 3 , 2 ],
]
queries_embeddings = model . encode (
queries ,
is_query = True ,
)
documents_embeddings = model . encode (
documents ,
is_query = False ,
)
reranked_documents = rank . rerank (
documents_ids = documents_ids ,
queries_embeddings = queries_embeddings ,
documents_embeddings = documents_embeddings ,
)我们欢迎捐款!开始:
pip install " pylate[dev] "make testmake ruffmake livedoc您可以使用此Bibtex参考图书馆:
@misc { PyLate ,
title = { PyLate: Flexible Training and Retrieval for Late Interaction Models } ,
author = { Chaffin, Antoine and Sourty, Raphaël } ,
url = { https://github.com/lightonai/pylate } ,
year = { 2024 }
}