TextRL is a Python library that aims to improve text generation with reinforcement learning, building on Hugging Face's transformers, PFRL, and OpenAI Gym. TextRL is designed to be easily customizable and applicable to a wide range of text generation models.

TextRL uses reinforcement learning to fine-tune text generation models. It is built on top of the following libraries:

- Hugging Face's transformers
- PFRL
- OpenAI Gym
Example: gpt2

```python
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

checkpoint = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
model = model.cuda()


class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward


observation_list = [{"input": "explain how attention work in seq2seq model"}]
env = MyRLEnv(model, tokenizer, observation_input=observation_list, max_length=20, compare_sample=2)
actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,
                    temperature=1.0,
                    top_k=0,
                    top_p=1.0,
                    repetition_penalty=2)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

print(actor.predict(observation_list[0]))

train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom-test',
)

print(actor.predict(observation_list[0]))
```

Example: flan-t5

Colab example: google/flan-t5-base
```python
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model.eval()
model.cuda()

sentiment = pipeline('sentiment-analysis',
                     model="cardiffnlp/twitter-roberta-base-sentiment",
                     tokenizer="cardiffnlp/twitter-roberta-base-sentiment",
                     device=0,
                     return_all_scores=True)


class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = 0
        if finish or len(predicted_list[0]) >= self.env_max_length:
            predicted_text = tokenizer.convert_tokens_to_string(predicted_list[0])
            # sentiment classifier score as the reward
            reward = sentiment(input_item['input'] + predicted_text)[0][0]['score'] * 10
        return reward


observation_list = [{'input': 'i think dogecoin is'}]
env = MyRLEnv(model, tokenizer, observation_input=observation_list, compare_sample=1)
actor = TextRLActor(env, model, tokenizer, optimizer='adamw',
                    temperature=0.8,
                    top_k=100,
                    top_p=0.85)
agent = actor.agent_ppo(update_interval=50, minibatch_size=3, epochs=10, lr=3e-4)

print(actor.predict(observation_list[0]))

pfrl.experiments.train_agent_with_evaluation(
    agent,
    env,
    steps=3000,
    eval_n_steps=None,
    eval_n_episodes=1,
    train_max_episode_len=100,
    eval_interval=10,
    outdir='checkpoint',
)

agent.load("./checkpoint/best")
print(actor.predict(observation_list[0]))
```

Example: bigscience/bloomz-7b1-mt
```python
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

checkpoint = "bigscience/bloomz-7b1-mt"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
model = model.cuda()


class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward


observation_list = [{"input": "explain how attention work in seq2seq model"}]
env = MyRLEnv(model, tokenizer, observation_input=observation_list, max_length=20, compare_sample=2)
actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,
                    temperature=1.0,
                    top_k=0,
                    top_p=1.0)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

print(actor.predict(observation_list[0]))

train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom-test',
)

print(actor.predict(observation_list[0]))
```

Example: bigscience/bloom-petals

To increase Petals capacity, contributing to the public swarm is encouraged:
https://github.com/bigscience-workshop/petals
```bash
pip install petals -U
```
```python
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import BloomTokenizerFast
from petals import DistributedBloomForCausalLM
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

MODEL_NAME = "bigscience/bloom-petals"
tokenizer = BloomTokenizerFast.from_pretrained(MODEL_NAME)
model = DistributedBloomForCausalLM.from_pretrained(MODEL_NAME)
model = model.cuda()


class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):  # predicted_list is the list of predicted tokens
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward


observation_list = [{"input": "explain how attention work in seq2seq model"}]
env = MyRLEnv(model, tokenizer, observation_input=observation_list, max_length=20, compare_sample=2)
actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,
                    temperature=1.0,
                    top_k=0,
                    top_p=1.0)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

print(actor.predict(observation_list[0]))

train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom-test',
)

print(actor.predict(observation_list[0]))
```

[Controllable generation via RL to let Elon Musk speak about DOGE](https://github.com/voidful/textrl/blob/main/example/2022-12-10-textrl-elon-musk.ipynb)
Colab example: bigscience/bloom-560m
Colab example: huggingtweets/elonmusk
Before: i think dogecoin is a great idea.
After: i think dogecoin is a great idea, but I think it is a little overused.
Installation:

```bash
pip install pfrl@git+https://github.com/voidful/pfrl.git
pip install textrl
```

To install from source, clone this project and cd into it, then run:

```bash
pip install -e .
```

Usage:

```python
import torch
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigscience/bloomz-7b1-mt"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
model = model.cuda()


class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):
        reward = [0]
        if finish:
            reward = [1]  # calculate the reward score based on predicted_list
        return reward
```

observation_list should contain all possible input strings for model training, for example:

```python
observation_list = [{"input": 'testing sent 1'}, {"input": 'testing sent 2'}]
```
```python
env = MyRLEnv(model, tokenizer, observation_input=observation_list)
actor = TextRLActor(env, model, tokenizer)
agent = actor.agent_ppo(update_interval=10, minibatch_size=2000, epochs=20)
```

Train the model:

```python
n_episodes = 1000
max_episode_len = 200  # max sentence length

for i in range(1, n_episodes + 1):
    obs = env.reset()
    R = 0  # return (sum of rewards)
    t = 0  # time step
    while True:
        action = agent.act(obs)
        obs, reward, done, pred = env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)
        if done or reset:
            break
    if i % 10 == 0:
        print('episode:', i, 'R:', R)
    if i % 50 == 0:
        print('statistics:', agent.get_statistics())
print('Finished.')
```

Another way to train:
```python
import logging
import sys

logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

train_agent_with_evaluation(
    agent,
    env,
    steps=1000,
    eval_n_steps=None,
    eval_n_episodes=1500,
    train_max_episode_len=50,
    eval_interval=10000,
    outdir='somewhere',
)

agent.load("somewhere/best")  # load the best model
actor.predict("input text")
```

This usage section provides a comprehensive guide to initializing the agent and environment, setting up a reward function for the environment, preparing for training, training the model, and making predictions. It also covers an alternative way to train the model using the train_agent_with_evaluation function.
To export the RL-trained weights as a standalone model, use the textrl-dump command:

```bash
textrl-dump --model ./model_path_before_rl --rl ./rl_path --dump ./output_dir
```
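A rough sketch of using the dumped model, assuming the dump directory is a standard Hugging Face checkpoint (the paths mirror the command above; if the tokenizer is not saved there, load it from the original base checkpoint instead):

```python
# Sketch only: assumes ./output_dir (from the textrl-dump command above) can be
# loaded like a regular Hugging Face checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./output_dir")  # or the original base checkpoint
model = AutoModelForCausalLM.from_pretrained("./output_dir")

inputs = tokenizer("explain how attention work in seq2seq model", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```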
To fine-tune a language model using RL, you need to modify the reward function:

```python
from textrl import TextRLEnv


class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):
        # input_item is the prompt input for the model; it will be one of your observations,
        # e.g. an observation can be a list of sentences such as ['inputted sentence', 'xxx', 'yyy'].
        # Only the first entry ('inputted sentence') is fed to the model;
        # the remaining entries can serve as references for reward calculation.
        # predicted_list is the list of sentences generated by the RL model,
        # and it is used for ranking-based reward calculation.
        # finish is the end-of-sentence flag. get_reward is called while generating each token;
        # when finish is True, the sentence is complete and a sentence-level reward can be computed.
        # reward should be a list whose length equals the length of predicted_list.
        reward = [0] * len(predicted_list)  # placeholder; replace with your own scoring
        return reward
```
Parameters for sampling diverse examples:

```python
actor = TextRLActor(env, model, tokenizer,
                    act_deterministically=False,  # whether to always pick the max-probability token at each step
                    temperature=1,                # sampling temperature
                    compare_sample=2,             # number of samples to rank
                    top_k=0,                      # top-k sampling
                    top_p=1.0)                    # top-p sampling
```

When training a reinforcement learning (RL) model, several key parameters need to be tuned to ensure optimal performance. The important parameters and their descriptions are listed below:
- update_interval = 10: number of environment steps between model updates
- minibatch_size = 2000: number of samples per minibatch in each update
- epochs = 20: number of optimization epochs per update
- gamma = 0.99: discount factor for future rewards
- lr = 1e-4: learning rate
- epsilon = 0.2: PPO clipping range
- entropy_coef = 0.01: entropy regularization coefficient, which encourages exploration
- steps = 1000: total number of training steps
- eval_interval = 10000: number of steps between evaluations
- train_max_episode_len = 50: maximum episode length (generated sentence length) during training

These parameters should be tuned carefully for your specific problem and environment to achieve the best performance. It is generally a good idea to start with the default values and then adjust them based on the observed learning behavior.
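As a reference sketch, the values above plug into the calls already shown in this README roughly as follows. Only the parameters that appear in those calls are passed here; gamma, epsilon, and entropy_coef are PPO settings that are not passed explicitly in these examples:

```python
# Rough sketch: actor, env, and train_agent_with_evaluation come from the usage example above.
agent = actor.agent_ppo(
    update_interval=10,   # steps between PPO updates
    minibatch_size=2000,  # samples per update minibatch
    epochs=20,            # optimization epochs per update
    lr=1e-4,              # learning rate
)

train_agent_with_evaluation(
    agent,
    env,
    steps=1000,                # total training steps
    eval_n_steps=None,
    eval_n_episodes=1,
    train_max_episode_len=50,  # cap on episode (sentence) length during training
    eval_interval=10000,       # steps between evaluations
    outdir='somewhere',
)
```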