ดาวน์โหลด xlstm - ดาวน์โหลดซอร์สโค้ด xlstm

xlstm

ซอร์สโค้ดอื่น ๆ

1.0.0

ดาวน์โหลด

xLSTM: Extended Long Short-Term Memory (หน่วยความจำระยะสั้นแบบยาว ฉบับขยาย)

บทความวิจัย: https://arxiv.org/abs/2405.04517

เกี่ยวกับ

xLSTM เป็นสถาปัตยกรรมโครงข่ายประสาทเทียมรูปแบบใหม่ที่ต่อยอดจากแนวคิดของ LSTM ดั้งเดิม ด้วยการใช้ exponential gating ร่วมกับเทคนิคการนอร์มัลไลซ์และการรักษาเสถียรภาพที่เหมาะสม รวมถึงหน่วยความจำแบบเมทริกซ์ (matrix memory) แบบใหม่ xLSTM จึงเอาชนะข้อจำกัดของ LSTM ดั้งเดิมได้ และแสดงประสิทธิภาพที่น่าสนใจในการสร้างแบบจำลองภาษาเมื่อเปรียบเทียบกับ Transformer หรือ State Space Model

XLSTM ขนาดใหญ่ 7B

เราฝึกอบรมโมเดลภาษา XLSTM 7B

เราได้ปรับสถาปัตยกรรม XLSTM ให้เหมาะสมในแง่ของการฝึกอบรมและเสถียรภาพ รหัสสำหรับสถาปัตยกรรมที่อัปเดตอยู่ใน xlstm/xlstm_large

น้ำหนักของโมเดล (model weights) มีให้ดาวน์โหลดบน HuggingFace ที่ https://huggingface.co/nx-ai/xlstm-7b

การติดตั้งน้อยที่สุด

สร้างสภาพแวดล้อม conda จากไฟล์ environment_pt220cu121.yaml จากนั้นติดตั้งเฉพาะโค้ดของโมเดล (กล่าวคือโมดูล xlstm) เป็นแพ็กเกจ:

ติดตั้งผ่าน PIP:

pip install xlstm

โคลนจาก GitHub:

git clone https://github.com/NX-AI/xlstm.git
cd xlstm
pip install -e .

หากต้องการใช้โมเดล xLSTM 7B ให้ติดตั้ง mlstm_kernels ด้วยคำสั่ง:

 pip install mlstm_kernels

ความต้องการ

แพ็กเกจนี้ใช้ PyTorch และได้รับการทดสอบกับเวอร์ชัน >=1.8 สำหรับ sLSTM เวอร์ชัน CUDA คุณต้องใช้ GPU ที่มี Compute Capability >= 8.0 ดูรายละเอียดที่ https://developer.nvidia.com/cuda-gpus หากต้องการสภาพแวดล้อมที่ผ่านการทดสอบมาอย่างดี ให้ติดตั้งจากไฟล์ environment_pt220cu121.yaml ดังนี้:

conda env create -n xlstm -f environment_pt220cu121.yaml
conda activate xlstm

สำหรับโมเดล xLSTM Large 7B จำเป็นต้องใช้แพ็กเกจ mlstm_kernels ของเรา (TODO เพิ่มลิงก์ GitHub) ซึ่งมีเคอร์เนล (kernel) ที่ทำงานได้รวดเร็วสำหรับ xLSTM

โมเดลจากบทความวิจัย xLSTM

ส่วนนี้อธิบายวิธีการใช้โมเดลจากบทความวิจัย xLSTM

การใช้งาน

สำหรับแอปพลิเคชันที่ไม่ใช่ภาษาหรือเพื่อรวมเข้ากับสถาปัตยกรรมอื่น ๆ คุณสามารถใช้ xLSTMBlockStack และสำหรับการสร้างแบบจำลองภาษาหรือแอปพลิเคชันที่ใช้โทเค็นอื่น ๆ คุณสามารถใช้ xLSTMLMModel

XLSTM บล็อกสแต็ก

xLSTMBlockStack ออกแบบมาเพื่อใช้เป็นแบ็กโบน (backbone) ทางเลือกในโปรเจกต์ที่มีอยู่แล้ว มีลักษณะคล้ายกับสแต็กของบล็อก Transformer แต่ใช้บล็อก xLSTM แทน:

 import torch

from xlstm import (
    xLSTMBlockStack ,
    xLSTMBlockStackConfig ,
    mLSTMBlockConfig ,
    mLSTMLayerConfig ,
    sLSTMBlockConfig ,
    sLSTMLayerConfig ,
    FeedForwardConfig ,
)

cfg = xLSTMBlockStackConfig (
    mlstm_block = mLSTMBlockConfig (
        mlstm = mLSTMLayerConfig (
            conv1d_kernel_size = 4 , qkv_proj_blocksize = 4 , num_heads = 4
        )
    ),
    slstm_block = sLSTMBlockConfig (
        slstm = sLSTMLayerConfig (
            backend = "cuda" ,
            num_heads = 4 ,
            conv1d_kernel_size = 4 ,
            bias_init = "powerlaw_blockdependent" ,
        ),
        feedforward = FeedForwardConfig ( proj_factor = 1.3 , act_fn = "gelu" ),
    ),
    context_length = 256 ,
    num_blocks = 7 ,
    embedding_dim = 128 ,
    slstm_at = [ 1 ],

)

xlstm_stack = xLSTMBlockStack ( cfg )

x = torch . randn ( 4 , 256 , 128 ). to ( "cuda" )
xlstm_stack = xlstm_stack . to ( "cuda" )
y = xlstm_stack ( x )
y . shape == x . shape

หากคุณใช้สตริงหรือไฟล์ YAML ในการกำหนดค่า คุณสามารถใช้ dacite เพื่อสร้าง dataclass ของ config ได้ ตัวอย่างต่อไปนี้ให้ผลเหมือนกับตัวอย่างด้านบน:

 from omegaconf import OmegaConf
from dacite import from_dict
from dacite import Config as DaciteConfig
from xlstm import xLSTMBlockStack , xLSTMBlockStackConfig

xlstm_cfg = """ 
mlstm_block:
  mlstm:
    conv1d_kernel_size: 4
    qkv_proj_blocksize: 4
    num_heads: 4
slstm_block:
  slstm:
    backend: cuda
    num_heads: 4
    conv1d_kernel_size: 4
    bias_init: powerlaw_blockdependent
  feedforward:
    proj_factor: 1.3
    act_fn: gelu
context_length: 256
num_blocks: 7
embedding_dim: 128
slstm_at: [1]
"""
cfg = OmegaConf . create ( xlstm_cfg )
cfg = from_dict ( data_class = xLSTMBlockStackConfig , data = OmegaConf . to_container ( cfg ), config = DaciteConfig ( strict = True ))
xlstm_stack = xLSTMBlockStack ( cfg )

x = torch . randn ( 4 , 256 , 128 ). to ( "cuda" )
xlstm_stack = xlstm_stack . to ( "cuda" )
y = xlstm_stack ( x )
y . shape == x . shape

โมเดลภาษา XLSTM

xLSTMLMModel เป็น wrapper ที่ครอบ xLSTMBlockStack โดยเพิ่มชั้น token embedding และ LM head (หัวสำหรับโมเดลภาษา)

 from omegaconf import OmegaConf
from dacite import from_dict
from dacite import Config as DaciteConfig
from xlstm import xLSTMLMModel , xLSTMLMModelConfig

xlstm_cfg = """ 
vocab_size: 50304
mlstm_block:
  mlstm:
    conv1d_kernel_size: 4
    qkv_proj_blocksize: 4
    num_heads: 4
slstm_block:
  slstm:
    backend: cuda
    num_heads: 4
    conv1d_kernel_size: 4
    bias_init: powerlaw_blockdependent
  feedforward:
    proj_factor: 1.3
    act_fn: gelu
context_length: 256
num_blocks: 7
embedding_dim: 128
slstm_at: [1]
"""
cfg = OmegaConf . create ( xlstm_cfg )
cfg = from_dict ( data_class = xLSTMLMModelConfig , data = OmegaConf . to_container ( cfg ), config = DaciteConfig ( strict = True ))
xlstm_stack = xLSTMLMModel ( cfg )

x = torch . randint ( 0 , 50304 , size = ( 4 , 256 )). to ( "cuda" )
xlstm_stack = xlstm_stack . to ( "cuda" )
y = xlstm_stack ( x )
y . shape [ 1 :] == ( 256 , 50304 )

การทดลอง

การทดลองสังเคราะห์ที่แสดงให้เห็นข้อได้เปรียบของ sLSTM เหนือ mLSTM และในทางกลับกันได้ชัดเจนที่สุด คืองาน Parity และงาน Multi-Query Associative Recall งาน Parity แก้ได้ด้วยความสามารถในการติดตามสถานะ (state tracking) ซึ่งได้มาจากการผสมหน่วยความจำ (memory mixing) ของ sLSTM ส่วนงาน Multi-Query Associative Recall วัดความสามารถในการจดจำ ซึ่งหน่วยความจำแบบเมทริกซ์ (matrix memory) และการขยายสถานะ (state expansion) ของ mLSTM มีประโยชน์อย่างมาก เมื่อใช้ร่วมกัน ทั้งสองแบบทำได้ดีในงานทั้งสองประเภท

หากต้องการรันการทดลองแต่ละรายการ ให้เรียกใช้ main.py ในโฟลเดอร์ experiments เช่น:

 python experiments/main.py --config experiments/parity_xLSTM01.yaml   # xLSTM[0:1], sLSTM only
python experiments/main.py --config experiments/parity_xLSTM10.yaml   # xLSTM[1:0], mLSTM only
python experiments/main.py --config experiments/parity_xLSTM11.yaml   # xLSTM[1:1], mLSTM and sLSTM

โปรดทราบว่าลูปการฝึกไม่มี early stopping และไม่มีการประเมินผลบนชุดทดสอบ

การอ้างอิง

หากคุณใช้ codebase นี้ หรือเห็นว่างานของเรามีประโยชน์ โปรดอ้างอิงบทความวิจัย xLSTM:

 @inproceedings{beck:24xlstm,
      title={xLSTM: Extended Long Short-Term Memory}, 
      author={Maximilian Beck and Korbinian Pöppel and Markus Spanring and Andreas Auer and Oleksandra Prudnikova and Michael Kopp and Günter Klambauer and Johannes Brandstetter and Sepp Hochreiter},
      booktitle = {Thirty-eighth Conference on Neural Information Processing Systems},
      year={2024},
      url={https://arxiv.org/abs/2405.04517}, 
}

ขยาย

ข้อมูลเพิ่มเติม

เวอร์ชัน 1.0.0
ประเภท ซอร์สโค้ดอื่น ๆ
เวลาอัปเดต 2025-04-17
ขนาด 222.32KB
มาจาก Github

แอปที่เกี่ยวข้อง

Google Dorks

2025-03-10
shepherd

2025-06-04
mongo express

2025-06-04
hidusbf

2025-02-14
Free Algorithms Books

2025-05-29
markdownpedia

2025-04-22

แนะนำสำหรับคุณ

chat.petals.dev

ซอร์สโค้ดอื่น ๆ

1.0.0
GPT Prompt Templates

ซอร์สโค้ดอื่น ๆ

1.0.0
GPTyped

ซอร์สโค้ดอื่น ๆ

GPTyped 1.0.5
Google Dorks

ซอร์สโค้ดอื่น ๆ

1.0
shepherd

ซอร์สโค้ดอื่น ๆ

v6.1.6-react-shepherd: Prepare Release (#3063)
mongo express

ซอร์สโค้ดอื่น ๆ

v1.1.0-rc-3
Google Dorks

ซอร์สโค้ดอื่น ๆ

1.0
shepherd

ซอร์สโค้ดอื่น ๆ

v6.1.6-react-shepherd: Prepare Release (#3063)
mongo express

ซอร์สโค้ดอื่น ๆ

v1.1.0-rc-3

ข้อมูลที่เกี่ยวข้อง ทั้งหมด