# gpn

Version 0.6

Code and resources from the GPN paper and the GPN-MSA paper.
## Installation

```bash
pip install git+https://github.com/songlab-cal/gpn.git
```

## Loading a pre-trained model

```python
import gpn.model  # registers the GPN model classes with transformers
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("songlab/gpn-brassicales")
# or
model = AutoModelForMaskedLM.from_pretrained("songlab/gpn-msa-sapiens")
```
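As a minimal sketch of how a loaded single-sequence checkpoint might be queried for per-position nucleotide probabilities (the tokenizer name is taken from the training command further below; the exact call pattern here is an assumption, see `examples/ss/basic_example.ipynb` for the reference workflow):

```python
import torch
import gpn.model  # registers the GPN model classes with transformers
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Assumption: same tokenizer as referenced in the training command below.
tokenizer = AutoTokenizer.from_pretrained("gonzalobenegas/tokenizer-dna-mlm")
model = AutoModelForMaskedLM.from_pretrained("songlab/gpn-brassicales")
model.eval()

seq = "ACGTACGTACGTACGTACGTACGTACGTACGT"  # toy DNA sequence
input_ids = tokenizer(
    seq,
    return_tensors="pt",
    return_attention_mask=False,
    return_token_type_ids=False,
)["input_ids"]

with torch.no_grad():
    logits = model(input_ids=input_ids).logits  # (1, seq_len, vocab_size)

# Per-position probabilities over the tokenizer vocabulary.
probs = torch.softmax(logits, dim=-1)
print(probs.shape)
```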
## GPN

Can also be referred to as GPN-SS (single sequence).

### Examples

- Basic usage: `examples/ss/basic_example.ipynb`

### Training

Supported encoder architectures: `convnet` (default), `roformer` (a transformer) and `bytenet`. To change the architecture or hyperparameters, pass `--config_overrides`, e.g. `--config_overrides encoder=bytenet,num_hidden_layers=30`.

```bash
WANDB_PROJECT=your_project torchrun --nproc_per_node=$(echo $CUDA_VISIBLE_DEVICES | awk -F',' '{print NF}') -m gpn.ss.run_mlm --do_train --do_eval \
    --report_to wandb --prediction_loss_only True --remove_unused_columns False \
    --dataset_name results/dataset --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
    --soft_masked_loss_weight_train 0.1 --soft_masked_loss_weight_evaluation 0.0 \
    --weight_decay 0.01 --optim adamw_torch \
    --dataloader_num_workers 16 --seed 42 \
    --save_strategy steps --save_steps 10000 --evaluation_strategy steps \
    --eval_steps 10000 --logging_steps 10000 --max_steps 120000 --warmup_steps 1000 \
    --learning_rate 1e-3 --lr_scheduler_type constant_with_warmup \
    --run_name your_run --output_dir your_output_dir --model_type GPN \
    --per_device_train_batch_size 512 --per_device_eval_batch_size 512 \
    --gradient_accumulation_steps 1 --total_batch_size 2048 \
    --torch_compile \
    --ddp_find_unused_parameters False \
    --bf16 --bf16_full_eval
```
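After training finishes, the checkpoint written to `your_output_dir` can be loaded like any other checkpoint and its configuration inspected, e.g. to confirm that a `--config_overrides` value took effect. A small sketch, assuming `your_output_dir` is the output directory from the command above:

```python
import gpn.model  # registers the GPN configuration/model classes with transformers
from transformers import AutoConfig, AutoModelForMaskedLM

# your_output_dir is the placeholder output directory from the training command above.
config = AutoConfig.from_pretrained("your_output_dir")
print(config)  # inspect encoder type, number of hidden layers, etc.

model = AutoModelForMaskedLM.from_pretrained("your_output_dir")
```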
### Extracting embeddings

The input windows file must contain the columns `chrom`, `start`, `end`.

```bash
torchrun --nproc_per_node=$(echo $CUDA_VISIBLE_DEVICES | awk -F',' '{print NF}') -m gpn.ss.get_embeddings windows.parquet genome.fa.gz 100 your_output_dir \
    results.parquet --per_device_batch_size 4000 --is_file --dataloader_num_workers 16
```
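A quick sketch of how such a windows file could be produced with pandas (the coordinates are made-up placeholders; only the required column names matter):

```python
import pandas as pd

# Hypothetical genomic windows; chrom/start/end are the required columns.
windows = pd.DataFrame({
    "chrom": ["1", "1", "2"],
    "start": [0, 100, 200],
    "end": [100, 200, 300],
})
windows.to_parquet("windows.parquet", index=False)
```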
### Variant effect prediction

The input variants file must contain the columns `chrom`, `pos`, `ref`, `alt`.

```bash
torchrun --nproc_per_node=$(echo $CUDA_VISIBLE_DEVICES | awk -F',' '{print NF}') -m gpn.ss.run_vep variants.parquet genome.fa.gz 512 your_output_dir results.parquet \
    --per_device_batch_size 4000 --is_file --dataloader_num_workers 16
```
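A matching sketch for the variants file (placeholder variants; whether `pos` is 0- or 1-based is not stated here, so check the example notebooks for the exact convention):

```python
import pandas as pd

# Hypothetical variants; chrom/pos/ref/alt are the required columns.
variants = pd.DataFrame({
    "chrom": ["1", "1", "2"],
    "pos": [1000, 2000, 1500],
    "ref": ["A", "C", "G"],
    "alt": ["G", "T", "A"],
})
variants.to_parquet("variants.parquet", index=False)

# After running gpn.ss.run_vep, scores can be read back from results.parquet:
# scores = pd.read_parquet("results.parquet")
```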
## GPN-MSA

### Examples

- Basic usage: `examples/msa/basic_example.ipynb`
- Variant effect prediction: `examples/msa/vep.ipynb`
- Training: `examples/msa/training.ipynb`

## Citation

GPN:
```bibtex
@article{benegas2023dna,
  author = {Gonzalo Benegas and Sanjit Singh Batra and Yun S. Song},
  title = {DNA language models are powerful predictors of genome-wide variant effects},
  journal = {Proceedings of the National Academy of Sciences},
  volume = {120},
  number = {44},
  pages = {e2311219120},
  year = {2023},
  doi = {10.1073/pnas.2311219120},
  URL = {https://www.pnas.org/doi/abs/10.1073/pnas.2311219120},
  eprint = {https://www.pnas.org/doi/pdf/10.1073/pnas.2311219120},
}
```

GPN-MSA:
```bibtex
@article{benegas2023gpnmsa,
  author = {Gonzalo Benegas and Carlos Albors and Alan J. Aw and Chengzhong Ye and Yun S. Song},
  title = {GPN-MSA: an alignment-based DNA language model for genome-wide variant effect prediction},
  elocation-id = {2023.10.10.561776},
  year = {2023},
  doi = {10.1101/2023.10.10.561776},
  publisher = {Cold Spring Harbor Laboratory},
  URL = {https://www.biorxiv.org/content/early/2023/10/11/2023.10.10.561776},
  eprint = {https://www.biorxiv.org/content/early/2023/10/11/2023.10.10.561776.full.pdf},
  journal = {bioRxiv}
}
```