TextRL is a Python library that aims to improve text generation with reinforcement learning, building on Hugging Face Transformers, PFRL, and OpenAI Gym. TextRL is designed to be easily customizable and can be applied to a wide range of text generation models.

TextRL uses reinforcement learning to fine-tune text generation models. It is built on top of the following libraries: Hugging Face Transformers, PFRL, and OpenAI Gym.

Example: gpt2

import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
model = model.cuda()
class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward
observaton_list = [{"input": "explain how attention work in seq2seq model"}]
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, max_length=20, compare_sample=2)
actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,
                    temperature=1.0,
                    top_k=0,
                    top_p=1.0,
                    repetition_penalty=2)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)
print(actor.predict(observaton_list[0]))
train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom-test',
)
print(actor.predict(observaton_list[0]))

Example: flan-t5
Colab example: google/flan-t5-base
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model.eval()
model.cuda()
sentiment = pipeline('sentiment-analysis',
                     model="cardiffnlp/twitter-roberta-base-sentiment",
                     tokenizer="cardiffnlp/twitter-roberta-base-sentiment",
                     device=0,
                     return_all_scores=True)
class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = 0
        if finish or len(predicted_list[0]) >= self.env_max_length:
            predicted_text = tokenizer.convert_tokens_to_string(predicted_list[0])
            # sentiment classifier
            reward = sentiment(input_item['input'] + predicted_text)[0][0]['score'] * 10
        return reward
observaton_list = [{'input': 'i think dogecoin is'}]
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, compare_sample=1)
actor = TextRLActor(env, model, tokenizer, optimizer='adamw',
                    temperature=0.8,
                    top_k=100,
                    top_p=0.85)
agent = actor.agent_ppo(update_interval=50, minibatch_size=3, epochs=10, lr=3e-4)
print(actor.predict(observaton_list[0]))
pfrl.experiments.train_agent_with_evaluation(
    agent,
    env,
    steps=3000,
    eval_n_steps=None,
    eval_n_episodes=1,
    train_max_episode_len=100,
    eval_interval=10,
    outdir='checkpoint',
)
agent.load("./checkpoint/best")
print(actor.predict(observaton_list[0]))

Example: bigscience/bloomz-7b1-mt

import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

checkpoint = "bigscience/bloomz-7b1-mt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
model = model.cuda()
class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward
observaton_list = [{"input": "explain how attention work in seq2seq model"}]
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, max_length=20, compare_sample=2)
actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,
                    temperature=1.0,
                    top_k=0,
                    top_p=1.0)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)
print(actor.predict(observaton_list[0]))
train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom-test',
)
print(actor.predict(observaton_list[0]))

Example: bigscience/bloom-petals
It is strongly recommended to contribute to the public swarm to increase Petals capacity:
https://github.com/bigscience-workshop/Petals
Install petals first: pip install petals -U
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import BloomTokenizerFast
from petals import DistributedBloomForCausalLM
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

MODEL_NAME = "bigscience/bloom-petals"
tokenizer = BloomTokenizerFast.from_pretrained(MODEL_NAME)
model = DistributedBloomForCausalLM.from_pretrained(MODEL_NAME)
model = model.cuda()
class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward
observaton_list = [{"input": "explain how attention work in seq2seq model"}]
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, max_length=20, compare_sample=2)
actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,
                    temperature=1.0,
                    top_k=0,
                    top_p=1.0)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)
print(actor.predict(observaton_list[0]))
train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom-test',
)
print(actor.predict(observaton_list[0]))

[Controllable generation via RL to let Elon Musk speak ill of DOGE](https://github.com/voidful/textrl/blob/main/example/2022-12-10-TExtrl-elon-Musk.ipynb)
Colab example: bigscience/bloom-560m
Colab example: huggingtweets/elonmusk
Before: i think dogecoin is a great idea.
After: i think dogecoin is a great idea, but I think it is a little overused.
Installation:

pip install pfrl@git+https://github.com/voidful/pfrl.git
pip install textrl

Or build from source: git clone this project and cd into it, then:

pip install -e .

Usage:

import torch
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigscience/bloomz-7b1-mt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
model = model.cuda()

class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward

observation_list should be a list of all possible input strings used to train the model.
Example: observation_list = [{"input":'testing sent 1'},{"input":'testing sent 2'}]
env = MyRLEnv(model, tokenizer, observation_input=observation_list)
actor = TextRLActor(env, model, tokenizer)
agent = actor.agent_ppo(update_interval=10, minibatch_size=2000, epochs=20)

n_episodes = 1000
max_episode_len = 200  # max sentence length

for i in range(1, n_episodes + 1):
    obs = env.reset()
    R = 0
    t = 0
    while True:
        action = agent.act(obs)
        obs, reward, done, pred = env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)
        if done or reset:
            break
    if i % 10 == 0:
        print('episode:', i, 'R:', R)
    if i % 50 == 0:
        print('statistics:', agent.get_statistics())
print('Finished.')

Another way to train:
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

train_agent_with_evaluation(
    agent,
    env,
    steps=1000,
    eval_n_steps=None,
    eval_n_episodes=1500,
    train_max_episode_len=50,
    eval_interval=10000,
    outdir='somewhere',
)

agent.load("somewhere/best")  # load the best model
actor.predict("input text")

This usage section provides a complete guide to initializing the agent and environment, setting up the reward function for the environment, preparing for training, training the model, and making predictions. It also includes an alternative way to train the model using the train_agent_with_evaluation function.
Dump the trained model to a Hugging Face format model:

textrl-dump --model ./model_path_before_rl --rl ./rl_path --dump ./output_dir
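Assuming the dump directory ./output_dir from the command above is a standard Hugging Face checkpoint, it can then be loaded like any other pretrained model; a minimal sketch:

from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: textrl-dump writes a regular Hugging Face model directory.
# Use the model class matching your base model (AutoModelForSeq2SeqLM for T5-style models).
model = AutoModelForCausalLM.from_pretrained("./output_dir")
# The tokenizer is unchanged by RL fine-tuning, so the original one can be reused.
tokenizer = AutoTokenizer.from_pretrained("./model_path_before_rl")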
To fine-tune a language model using RL, you need to modify the reward function:

from textrl import TextRLEnv

class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):
        # input_item is the prompt input for the model; it will be one of your observations.
        # An observation is a list of sentences, e.g. ['inputted sentence', 'xxx', 'yyy'];
        # only the first entry ('inputted sentence') is fed to the model, and
        # the remaining entries can serve as references for reward calculation.
        # predicted_list is the list of sentences generated by the RL model;
        # it is used for ranking-based reward calculation.
        # finish is the end-of-sentence flag; get_reward is called while each token is generated, and
        # when finish is True the sentence is complete, which allows sentence-level reward calculation.
        # reward should be a list whose length equals the length of predicted_list.
        reward = [0] * len(predicted_list)  # placeholder: replace with your own scoring
        return reward
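As a concrete illustration of the rules above, here is a minimal sketch of a custom environment whose reward scores finished generations by their length. The class name LengthRewardEnv and the length heuristic are purely illustrative, not part of TextRL:

class LengthRewardEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):
        # One score per generated sample, matching the length of predicted_list.
        reward = [0.0] * len(predicted_list)
        if finish:
            # Illustrative heuristic: longer finished generations score higher, capped at 1.0.
            # Swap in any task-specific scorer (sentiment, similarity to references
            # carried in the observation, etc.).
            reward = [min(len(tokens) / 20.0, 1.0) for tokens in predicted_list]
        return reward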
Parameters for sampling diverse examples:

actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,  # select the max-probability token at each step or not
                    temperature=1.0,              # temperature for sampling
                    compare_sample=2,             # number of samples to rank
                    top_k=0,                      # top-k sampling
                    top_p=1.0)                    # top-p sampling

When training a reinforcement learning (RL) model, several key parameters need to be tuned to achieve good performance. The important ones are listed below:
update_interval = 10          # how often (in steps) the agent is updated
minibatch_size = 2000         # number of samples per PPO update minibatch
epochs = 20                   # number of optimization epochs per update
gamma = 0.99                  # discount factor for future rewards
lr = 1e-4                     # learning rate of the optimizer
epsilon = 0.2                 # PPO clipping parameter
entropy_coef = 0.01           # entropy bonus coefficient to encourage exploration
steps = 1000                  # total number of training steps
eval_interval = 10000         # number of steps between evaluations
train_max_episode_len = 50    # maximum episode length during training

These parameters should be carefully tuned for the specific problem and environment in order to obtain the best performance. It is generally recommended to start with the default values and then adjust them based on the observed learning behaviour.
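For orientation, here is a sketch of where these values plug into the calls used throughout this document. Only update_interval, minibatch_size, epochs, and lr appear as agent_ppo arguments in the examples above; whether agent_ppo also accepts gamma, epsilon, and entropy_coef as keyword arguments is not shown here, so they are left out of the sketch:

agent = actor.agent_ppo(update_interval=10,   # agent update frequency
                        minibatch_size=2000,  # PPO minibatch size
                        epochs=20,            # optimization epochs per update
                        lr=1e-4)              # learning rate

train_agent_with_evaluation(
    agent,
    env,
    steps=1000,                  # total training steps
    eval_n_steps=None,
    eval_n_episodes=1,
    train_max_episode_len=50,    # max episode length during training
    eval_interval=10000,         # steps between evaluations
    outdir='somewhere',
)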