TextRL是一個Python函式庫,旨在利用強化學習來改善文本生成,建立在Hugging Face的Transformers、PFRL和OpenAI Gym之上。TextRL設計為易於自定義,可以應用於各種文本生成模型。

TextRL利用強化學習來微調文本生成模型。它建立在以下函式庫之上:
gpt2
import pfrl
from textrl import TextRLEnv , TextRLActor , train_agent_with_evaluation
from transformers import AutoModelForCausalLM , AutoTokenizer
import logging
import sys
# Route INFO-level log records to stdout, passing an empty format string.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format='',
)

# Tokenizer and causal LM are loaded from the same checkpoint name.
checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# NOTE(review): the model is loaded with device_map="auto" and then moved
# again with .cuda() -- confirm that both are intended.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype="auto",
    device_map="auto",
).cuda()
class MyRLEnv(TextRLEnv):
    """Example environment that rewards every sample once generation ends."""

    def get_reward(self, input_item, predicted_list, finish):
        """Return one reward value per entry of ``predicted_list``.

        Args:
            input_item: The observation being generated from (unused here).
            predicted_list: The list of predictions; one reward is emitted
                per entry so the reward length follows the number of samples.
            finish: Truthy once generation has finished.

        Returns:
            A list of ``0`` while generating and of ``1`` once ``finish`` is
            truthy. The list always has at least one element.
        """
        # Placeholder scoring: calculate the real reward score based on
        # predicted_list here.
        score = 1 if finish else 0
        # Keep at least one element so a single-sample setup behaves exactly
        # as before.
        return [score] * max(1, len(predicted_list))
# Single prompt used as the only training observation.
observaton_list = [{"input": "explain how attention work in seq2seq model"}]

# NOTE(review): this instantiates the base TextRLEnv, so the get_reward
# override defined in MyRLEnv is not used by this run -- confirm intent.
env = TextRLEnv(
    model,
    tokenizer,
    observation_input=observaton_list,
    max_length=20,
    compare_sample=2,
)

# Sampling configuration for the actor.
actor = TextRLActor(
    env,
    model,
    tokenizer,
    act_deterministically=False,
    temperature=1.0,
    top_k=0,
    top_p=1.0,
    repetition_penalty=2,
)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

# Prediction before training, for comparison with the one printed afterwards.
print(actor.predict(observaton_list[0]))

train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom—test',
)
print ( actor . predict ( observaton_list [ 0 ]))
flan-t5
Colab示例:google/flan-t5-base
import pfrl
from textrl import TextRLEnv , TextRLActor , train_agent_with_evaluation
from transformers import AutoModelForSeq2SeqLM , AutoTokenizer
import logging
import sys
# `pipeline` is called below but is not among the names this example imports
# from transformers, so bring it into scope here.
from transformers import pipeline

# Route INFO-level log records to stdout, passing an empty format string.
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

# Load the seq2seq model and its tokenizer, switch the model to eval mode and
# move it to CUDA.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model.eval()
model.cuda()

# Sentiment classifier consumed by MyRLEnv.get_reward; it is asked to return
# the scores of all labels.
sentiment = pipeline(
    'sentiment-analysis',
    model="cardiffnlp/twitter-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment",
    device=0,
    return_all_scores=True,
)
class MyRLEnv(TextRLEnv):
    """Environment whose reward comes from the module-level sentiment scorer."""

    def get_reward(self, input_item, predicted_list, finish):
        """Score the first prediction once it is finished or long enough.

        Args:
            input_item: Observation dict; its ``'input'`` text is prepended
                to the generated text before scoring.
            predicted_list: Predictions; only ``predicted_list[0]`` (a list
                of predicted tokens) is used.
            finish: Truthy once generation has finished.

        Returns:
            ``0`` while generation is still running and shorter than
            ``self.env_max_length``; otherwise the first score returned by
            ``sentiment`` multiplied by 10.
        """
        first_sample = predicted_list[0]
        if not finish and len(first_sample) < self.env_max_length:
            return 0

        generated_text = tokenizer.convert_tokens_to_string(first_sample)
        # sentiment classifier
        scores = sentiment(input_item['input'] + generated_text)
        return scores[0][0]['score'] * 10
# Single prompt used as the only training observation.
observaton_list = [{'input': 'i think dogecoin is'}]

env = MyRLEnv(
    model,
    tokenizer,
    observation_input=observaton_list,
    compare_sample=1,
)
actor = TextRLActor(
    env,
    model,
    tokenizer,
    optimizer='adamw',
    temperature=0.8,
    top_k=100,
    top_p=0.85,
)
agent = actor.agent_ppo(
    update_interval=50,
    minibatch_size=3,
    epochs=10,
    lr=3e-4,
)

# Prediction before training, for comparison with the one printed afterwards.
print(actor.predict(observaton_list[0]))

pfrl.experiments.train_agent_with_evaluation(
    agent,
    env,
    steps=3000,
    eval_n_steps=None,
    eval_n_episodes=1,
    train_max_episode_len=100,
    eval_interval=10,
    outdir='checkpoint',
)

# Reload the agent saved under the training output directory.
agent.load("./checkpoint/best")
print ( actor . predict ( observaton_list [ 0 ]))
bigscience/bloomz-7b1-mt
import pfrl
from textrl import TextRLEnv , TextRLActor , train_agent_with_evaluation
from transformers import AutoModelForCausalLM , AutoTokenizer
import logging
import sys
# Route INFO-level log records to stdout, passing an empty format string.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format='',
)

# Tokenizer and causal LM are loaded from the same checkpoint name.
checkpoint = "bigscience/bloomz-7b1-mt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# NOTE(review): the model is loaded with device_map="auto" and then moved
# again with .cuda() -- confirm that both are intended.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype="auto",
    device_map="auto",
).cuda()
class MyRLEnv(TextRLEnv):
    """Example environment that rewards every sample once generation ends."""

    def get_reward(self, input_item, predicted_list, finish):
        """Return one reward value per entry of ``predicted_list``.

        Args:
            input_item: The observation being generated from (unused here).
            predicted_list: The list of predictions; one reward is emitted
                per entry so the reward length follows the number of samples.
            finish: Truthy once generation has finished.

        Returns:
            A list of ``0`` while generating and of ``1`` once ``finish`` is
            truthy. The list always has at least one element.
        """
        # Placeholder scoring: calculate the real reward score based on
        # predicted_list here.
        score = 1 if finish else 0
        # Keep at least one element so a single-sample setup behaves exactly
        # as before.
        return [score] * max(1, len(predicted_list))
# Single prompt used as the only training observation.
observaton_list = [{"input": "explain how attention work in seq2seq model"}]

# NOTE(review): this instantiates the base TextRLEnv, so the get_reward
# override defined in MyRLEnv is not used by this run -- confirm intent.
env = TextRLEnv(
    model,
    tokenizer,
    observation_input=observaton_list,
    max_length=20,
    compare_sample=2,
)

# Sampling configuration for the actor.
actor = TextRLActor(
    env,
    model,
    tokenizer,
    act_deterministically=False,
    temperature=1.0,
    top_k=0,
    top_p=1.0,
)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

# Prediction before training, for comparison with the one printed afterwards.
print(actor.predict(observaton_list[0]))

train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom—test',
)
print ( actor . predict ( observaton_list [ 0 ]))
強烈建議為公共 swarm 貢獻算力,以提升 Petals 的容量
https://github.com/bigscience-workshop/petals
請先安裝:pip install petals -U
import pfrl
from textrl import TextRLEnv , TextRLActor , train_agent_with_evaluation
from transformers import BloomTokenizerFast
from petals import DistributedBloomForCausalLM
import logging
import sys
# Route INFO-level log records to stdout, passing an empty format string.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format='',
)

# Tokenizer and distributed BLOOM model are loaded from the same model name,
# and the model is then moved to CUDA.
MODEL_NAME = "bigscience/bloom-petals"
tokenizer = BloomTokenizerFast.from_pretrained(MODEL_NAME)
model = DistributedBloomForCausalLM.from_pretrained(MODEL_NAME).cuda()
class MyRLEnv(TextRLEnv):
    """Example environment that rewards every sample once generation ends."""

    def get_reward(self, input_item, predicted_list, finish):
        """Return one reward value per entry of ``predicted_list``.

        Args:
            input_item: The observation being generated from (unused here).
            predicted_list: The list of predictions; one reward is emitted
                per entry so the reward length follows the number of samples.
            finish: Truthy once generation has finished.

        Returns:
            A list of ``0`` while generating and of ``1`` once ``finish`` is
            truthy. The list always has at least one element.
        """
        # Placeholder scoring: calculate the real reward score based on
        # predicted_list here.
        score = 1 if finish else 0
        # Keep at least one element so a single-sample setup behaves exactly
        # as before.
        return [score] * max(1, len(predicted_list))
# Single prompt used as the only training observation.
observaton_list = [{"input": "explain how attention work in seq2seq model"}]

# NOTE(review): this instantiates the base TextRLEnv, so the get_reward
# override defined in MyRLEnv is not used by this run -- confirm intent.
env = TextRLEnv(
    model,
    tokenizer,
    observation_input=observaton_list,
    max_length=20,
    compare_sample=2,
)

# Sampling configuration for the actor.
actor = TextRLActor(
    env,
    model,
    tokenizer,
    act_deterministically=False,
    temperature=1.0,
    top_k=0,
    top_p=1.0,
)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

# Prediction before training, for comparison with the one printed afterwards.
print(actor.predict(observaton_list[0]))

train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom—test',
)
print ( actor . predict ( observaton_list [ 0 ]))
[透過RL實現可控生成,讓Elon Musk談論DOGE](https://github.com/voidful/textrl/blob/main/example/2022-12-12-10-textrl-elon-musk.ipynb)
Colab示例:bigscience/bloom-560m
Colab示例:huggingtweets/elonmusk
之前: i think dogecoin is a great idea.
之後: i think dogecoin is a great idea, but I think it is a little overused.
pip install pfrl@git+https://github.com/voidful/pfrl.git
pip install textrl
使用git clone複製本專案,並cd進入專案目錄。
pip install -e .
import torch
from textrl import TextRLEnv , TextRLActor , train_agent_with_evaluation
from transformers import AutoModelForCausalLM , AutoTokenizer
# Tokenizer and causal LM are loaded from the same checkpoint name.
checkpoint = "bigscience/bloomz-7b1-mt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype="auto",
    device_map="auto",
)
model = model.cuda()


class MyRLEnv(TextRLEnv):
    """Template environment: override ``get_reward`` with real scoring."""

    def get_reward(self, input_item, predicted_list, finish):
        """Return the reward list for the current generation step.

        Args:
            input_item: The observation being generated from (unused here).
            predicted_list: The predictions to score (unused by this
                placeholder).
            finish: Truthy once generation has finished.

        Returns:
            ``[0]`` in every case; replace the value assigned in the
            ``finish`` branch with a real score.
        """
        # Assign a default first: previously `reward` was only bound when
        # `finish` was truthy, so every unfinished step raised
        # UnboundLocalError at the return statement.
        reward = [0]
        if finish:
            reward = [0]  # calculate reward score based on predicted_list
        return reward


# observation_list should be a list of all possible input strings for model
# training.
示例:
observation_list = [{"input":'testing sent 1'},{"input":'testing sent 2'}]
env = MyRLEnv(model, tokenizer, observation_input=observation_list)
actor = TextRLActor(env, model, tokenizer)
agent = actor.agent_ppo(update_interval=10, minibatch_size=2000, epochs=20)

n_episodes = 1000
max_episode_len = 200  # max sentence length

for episode in range(1, n_episodes + 1):
    obs = env.reset()
    episode_return = 0
    step_count = 0
    done = reset = False
    # Step until the environment reports completion or the step budget for
    # this episode is used up.
    while not (done or reset):
        action = agent.act(obs)
        obs, reward, done, pred = env.step(action)
        episode_return += reward
        step_count += 1
        reset = step_count == max_episode_len
        agent.observe(obs, reward, done, reset)

    # Periodic progress output.
    if episode % 10 == 0:
        print('episode:', episode, 'R:', episode_return)
    if episode % 50 == 0:
        print('statistics:', agent.get_statistics())
print ( 'Finished.' )
訓練的另一種方法:
import logging
import sys
# Route INFO-level log records to stdout, passing an empty format string.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format='',
)

# Train with the helper instead of the hand-written episode loop.
train_agent_with_evaluation(
    agent,
    env,
    steps=1000,
    eval_n_steps=None,
    eval_n_episodes=1500,
    train_max_episode_len=50,
    eval_interval=10000,
    outdir='somewhere',
)

agent.load("somewhere/best")  # loading the best model
actor . predict ( "input text" )
此更新的用法部分提供了有關如何初始化代理和環境、為環境設置獎勵函數、準備訓練、訓練模型並進行預測的完整指南。它還包括一種使用train_agent_with_evaluation函數訓練模型的替代方法。
textrl-dump --model ./model_path_before_rl --rl ./rl_path --dump ./output_dir
要使用RL微調語言模型,您需要修改獎勵函數:
from textrl import TextRLEnv
class MyRLEnv(TextRLEnv):
    """Template environment showing the contract of ``get_reward``."""

    def get_reward(self, input_item, predicted_list, finish):
        """Return one reward per entry of ``predicted_list``.

        This placeholder returns all zeros; replace it with real scoring.
        """
        # input_item is the prompt input for the model, it will be one of your observation
        # an observation will be a list of sentence of eg: ['inputted sentence','xxx','yyy']
        # only the first input will feed to the model 'inputted sentence', and
        # the remaining can be the reference for reward calculation
        # predicted_list is the list of predicted sentences of RL model generated,
        # it will be used for ranking reward calculation
        # finish is the end of sentences flags, get_reward will be called during generating each word, and
        # when finish is True, it means the sentence is finished, it will use for sentence level reward calculation.
        # reward should be the list equal to the length of predicted_list
        # `reward` was previously returned without ever being assigned
        # (NameError); start from a zero reward per prediction.
        reward = [0] * len(predicted_list)
        return reward


# Parameters for sampling diverse examples:
# NOTE(review): compare_sample is passed to TextRLActor here, while the other
# examples in this file pass it to the environment -- confirm which is right.
actor = TextRLActor(
    env,
    model,
    tokenizer,
    act_deterministically=False,  # select the max probability token for each step or not
    temperature=1,  # temperature for sampling
    compare_sample=2,  # num of sample to rank
    top_k=0,  # top k sampling
    top_p=1.0,  # top p sampling
)
# When training a reinforcement learning (RL) model, several key parameters
# need to be tuned to ensure the best performance. Here is a list of the
# important parameters and their descriptions:
update_interval = 10
minibatch_size = 2000
epochs = 20
gamma = 0.99
lr = 1e-4
epsilon = 0.2
entropy_coef = 0.01
steps = 1000
eval_interval = 10000
train_max_episode_len = 50
這些參數需要根據具體的問題和環境仔細調整,以實現最佳性能。通常建議從默認值開始,然後根據觀察到的學習行為進行調整。