TextRL is a Python library that aims to improve text generation using reinforcement learning, building on Hugging Face's Transformers, PFRL, and OpenAI Gym. TextRL is designed to be easily customizable and can be applied to a variety of text-generation models.

TextRL uses reinforcement learning to fine-tune text-generation models. It is built upon the following libraries: Hugging Face's Transformers, PFRL, and OpenAI Gym.

Example: gpt2

import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
model = model.cuda()
class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward

observaton_list = [{"input": "explain how attention work in seq2seq model"}]
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, max_length=20, compare_sample=2)
actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,
                    temperature=1.0,
                    top_k=0,
                    top_p=1.0,
                    repetition_penalty=2)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

print(actor.predict(observaton_list[0]))
train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom-test',
)
print(actor.predict(observaton_list[0]))

Example: flan-t5
Colab example: google/flan-t5-base
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model.eval()
model.cuda()

# sentiment classifier used as the reward model
sentiment = pipeline('sentiment-analysis', model="cardiffnlp/twitter-roberta-base-sentiment", tokenizer="cardiffnlp/twitter-roberta-base-sentiment", device=0, return_all_scores=True)
class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = 0
        if finish or len(predicted_list[0]) >= self.env_max_length:
            predicted_text = tokenizer.convert_tokens_to_string(predicted_list[0])
            # score the prompt plus the generated continuation with the sentiment classifier
            reward = sentiment(input_item['input'] + predicted_text)[0][0]['score'] * 10
        return reward

observaton_list = [{'input': 'i think dogecoin is'}]
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, compare_sample=1)
actor = TextRLActor(env, model, tokenizer, optimizer='adamw',
                    temperature=0.8,
                    top_k=100,
                    top_p=0.85)
agent = actor.agent_ppo(update_interval=50, minibatch_size=3, epochs=10, lr=3e-4)

print(actor.predict(observaton_list[0]))
pfrl.experiments.train_agent_with_evaluation(
    agent,
    env,
    steps=3000,
    eval_n_steps=None,
    eval_n_episodes=1,
    train_max_episode_len=100,
    eval_interval=10,
    outdir='checkpoint',
)

agent.load("./checkpoint/best")
print(actor.predict(observaton_list[0]))

Example: bigscience/bloomz-7b1-mt

import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

checkpoint = "bigscience/bloomz-7b1-mt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
model = model.cuda()
class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward

observaton_list = [{"input": "explain how attention work in seq2seq model"}]
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, max_length=20, compare_sample=2)
actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,
                    temperature=1.0,
                    top_k=0,
                    top_p=1.0)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

print(actor.predict(observaton_list[0]))
train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom-test',
)
print(actor.predict(observaton_list[0]))

Example: bigscience/bloom-petals
We strongly recommend contributing to the public swarm to increase Petals' capacity:
https://github.com/bigscience-workshop/petals
Install Petals: pip install petals -U
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import BloomTokenizerFast
from petals import DistributedBloomForCausalLM
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

MODEL_NAME = "bigscience/bloom-petals"
tokenizer = BloomTokenizerFast.from_pretrained(MODEL_NAME)
model = DistributedBloomForCausalLM.from_pretrained(MODEL_NAME)
model = model.cuda()
class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward

observaton_list = [{"input": "explain how attention work in seq2seq model"}]
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, max_length=20, compare_sample=2)
actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,
                    temperature=1.0,
                    top_k=0,
                    top_p=1.0)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

print(actor.predict(observaton_list[0]))
train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom-test',
)
print(actor.predict(observaton_list[0]))

[Controllable generation via RL to let Elon Musk speak ill of DOGE](https://github.com/voidful/textrl/blob/main/example/2022-12-10-textrl-elon-musk.ipynb)
Colab example: bigscience/bloom-560m
Colab example: huggingtweets/elonmusk
Before: i think dogecoin is a great idea.
After: i think dogecoin is a great idea, but I think it is a little overused.
Install via pip:
pip install pfrl@git+https://github.com/voidful/pfrl.git
pip install textrl

Or build from source: git clone and cd into this project, then run:
pip install -e .

Usage:

import torch
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigscience/bloomz-7b1-mt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
model = model.cuda()

class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):
        reward = [0]  # default per-step reward
        if finish:
            reward = [0]  # calculate the reward score based on predicted_list
        return reward

observation_list should be a list of all possible input strings for model training.
Example: observation_list = [{"input":'testing sent 1'},{"input":'testing sent 2'}]
env = MyRLEnv(model, tokenizer, observation_input=observation_list)
actor = TextRLActor(env, model, tokenizer)
agent = actor.agent_ppo(update_interval=10, minibatch_size=2000, epochs=20)

n_episodes = 1000
max_episode_len = 200  # max sentence length
for i in range(1, n_episodes + 1):
    obs = env.reset()
    R = 0  # return (sum of rewards)
    t = 0  # time step
    while True:
        action = agent.act(obs)
        obs, reward, done, pred = env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)
        if done or reset:
            break
    if i % 10 == 0:
        print('episode:', i, 'R:', R)
    if i % 50 == 0:
        print('statistics:', agent.get_statistics())
print('Finished.')

Another way to train:
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

train_agent_with_evaluation(
    agent,
    env,
    steps=1000,
    eval_n_steps=None,
    eval_n_episodes=1500,
    train_max_episode_len=50,
    eval_interval=10000,
    outdir='somewhere',
)

agent.load("somewhere/best")  # loading the best model
actor.predict("input text")

This usage section provides a comprehensive guide to initializing the agent and the environment, setting up the environment's reward function, preparing for training, training the model, and making predictions. It also shows an alternative way to train the model using the train_agent_with_evaluation function.
Dump the trained model to a Hugging Face model:
textrl-dump --model ./model_path_before_rl --rl ./rl_path --dump ./output_dir
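Assuming the dumped directory follows the standard Hugging Face layout (which the --dump output is meant to produce), it should then be loadable like any local checkpoint. This is a minimal sketch; the path ./output_dir simply echoes the command above and is illustrative:

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the RL-finetuned model dumped by textrl-dump (path is illustrative).
tokenizer = AutoTokenizer.from_pretrained("./output_dir")
model = AutoModelForCausalLM.from_pretrained("./output_dir")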
To fine-tune a language model with RL, you need to modify the reward function:

from textrl import TextRLEnv

class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):
        # input_item is the prompt input for the model; it will be one of your observations.
        # An observation is a list of sentences, e.g. ['inputted sentence', 'xxx', 'yyy'];
        # only the first entry ('inputted sentence') is fed to the model, and
        # the remaining entries can serve as references for reward calculation.
        # predicted_list is the list of sentences generated by the RL model;
        # it is used for ranking-based reward calculation.
        # finish is the end-of-sentence flag. get_reward is called while each token is generated;
        # when finish is True, the sentence is complete, which allows sentence-level reward calculation.
        # reward should be a list whose length equals the length of predicted_list.
        return reward
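As a concrete illustration of this contract, here is a minimal sketch (the class name and the length-based heuristic are made up for this example and are not part of TextRL) that returns one score per generated candidate and only assigns a non-zero score once generation is finished:

from textrl import TextRLEnv

class LengthRewardEnv(TextRLEnv):  # hypothetical example class
    def get_reward(self, input_item, predicted_list, finish):
        # One (zero) reward per candidate while the sentence is still being generated.
        reward = [0] * len(predicted_list)
        if finish:
            # Sentence-level reward: a toy heuristic that prefers longer generations.
            reward = [len(tokens) / 10.0 for tokens in predicted_list]
        return reward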
Parameters for sampling diverse examples:

actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,  # if True, select the max-probability token at each step
                    temperature=1,                # temperature for sampling
                    compare_sample=2,             # number of samples to rank
                    top_k=0,                      # top-k sampling
                    top_p=1.0)                    # top-p sampling

When training a reinforcement learning (RL) model, several key parameters must be tuned to ensure good performance. Here is a list of the most important ones:
update_interval = 10          # how often (in agent steps) the PPO policy is updated
minibatch_size = 2000         # minibatch size for each PPO update
epochs = 20                   # number of optimization epochs per update
gamma = 0.99                  # discount factor for future rewards
lr = 1e-4                     # learning rate
epsilon = 0.2                 # PPO clipping range
entropy_coef = 0.01           # entropy bonus coefficient (encourages exploration)
steps = 1000                  # total number of training steps
eval_interval = 10000         # how often (in steps) evaluation is run
train_max_episode_len = 50    # maximum episode length during training

These parameters need to be tuned carefully, based on the specific problem and environment, to achieve the best performance. It is generally recommended to start with the default values and then adjust them based on the observed learning behavior.
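For orientation, here is a sketch of where the parameters demonstrated in the examples above are passed, continuing from an env and actor set up as shown earlier (the values are placeholders, not recommendations; whether gamma, epsilon, and entropy_coef are also exposed as agent_ppo keyword arguments is not shown in these examples, so they are omitted here):

# PPO agent: update schedule and optimization settings (lr is shown in the flan-t5 example above).
agent = actor.agent_ppo(update_interval=10, minibatch_size=2000, epochs=20, lr=1e-4)

# Training loop settings: total steps, evaluation cadence, and episode length cap.
train_agent_with_evaluation(
    agent,
    env,
    steps=1000,
    eval_n_steps=None,
    eval_n_episodes=1,
    train_max_episode_len=50,
    eval_interval=10000,
    outdir='checkpoint',
)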