ดาวน์โหลด tfrecord - ดาวน์โหลดซอร์สโค้ด tfrecord

ตัวอ่านและตัวเขียน TFRecord

ไลบรารีนี้ช่วยให้อ่านและเขียนไฟล์ TFRecord ได้อย่างมีประสิทธิภาพใน Python นอกจากนี้ยังมีตัวอ่านไฟล์ TFRecord แบบ IterableDataset สำหรับ PyTorch ปัจจุบันรองรับทั้ง TFRecord ที่ไม่ได้บีบอัดและที่บีบอัดด้วย gzip

การติดตั้ง

 pip3 install 'tfrecord[torch]'

การใช้งาน

ขอแนะนำให้สร้างไฟล์ดัชนีสำหรับไฟล์ TFRecord แต่ละไฟล์ เมื่อใช้ worker หลายตัวจำเป็นต้องระบุไฟล์ดัชนี มิฉะนั้นตัวโหลดอาจส่งคืนระเบียนที่ซ้ำกัน คุณสามารถสร้างไฟล์ดัชนีสำหรับไฟล์ TFRecord แต่ละไฟล์ด้วยโปรแกรมยูทิลิตี้นี้:

 python3 -m tfrecord.tools.tfrecord2idx <tfrecord path> <index path>

หากต้องการสร้างไฟล์ ".tfindex" สำหรับไฟล์ ".tfrecord" ทั้งหมดในไดเรกทอรี ให้รัน:

 tfrecord2idx <data dir>

การอ่านและการเขียน tf.train.Example

การอ่านบันทึก tf.example ใน pytorch

ใช้ TFRecordDataset เพื่ออ่านไฟล์ TFRecord ใน PyTorch

 import torch
from tfrecord . torch . dataset import TFRecordDataset

# Location of the TFRecord file; no index file is supplied in this example.
tfrecord_path = "/tmp/data.tfrecord"
index_path = None
# Maps each feature name to its type name.
description = {
    "image": "byte",
    "label": "float",
}
dataset = TFRecordDataset(tfrecord_path, index_path, description)
loader = torch.utils.data.DataLoader(dataset, batch_size=32)

# Fetch one batch from the loader and show it.
batch_iterator = iter(loader)
data = next(batch_iterator)
print(data)

ใช้ MultiTFRecordDataset เพื่ออ่านไฟล์ TFRecord หลายไฟล์ คลาสนี้จะสุ่มตัวอย่างจากไฟล์ TFRecord ที่ระบุ ตามความน่าจะเป็นที่กำหนดให้แต่ละไฟล์

 import torch
from tfrecord . torch . dataset import MultiTFRecordDataset

# Path templates for the data and index files.
# NOTE(review): presumably "{}" is filled with each split name -- confirm
# against the library documentation.
tfrecord_pattern = "/tmp/{}.tfrecord"
index_pattern = "/tmp/{}.index"
# Each split name is paired with the probability of sampling from it.
splits = {"dataset1": 0.8, "dataset2": 0.2}
# Maps each feature name to its type name.
description = {
    "image": "byte",
    "label": "int",
}
dataset = MultiTFRecordDataset(
    tfrecord_pattern,
    index_pattern,
    splits,
    description,
)
loader = torch.utils.data.DataLoader(dataset, batch_size=32)

# Fetch one batch from the loader and show it.
batch_iterator = iter(loader)
data = next(batch_iterator)
print(data)

ชุดข้อมูล pytorch ที่ไม่มีที่สิ้นสุดและ จำกัด

โดยค่าเริ่มต้น MultiTFRecordDataset จะไม่มีที่สิ้นสุด กล่าวคือจะสุ่มตัวอย่างข้อมูลไปเรื่อย ๆ โดยไม่หยุด คุณสามารถทำให้มีขนาดจำกัดได้โดยระบุแฟล็ก infinite=False

 dataset = MultiTFRecordDataset(..., infinite=False)  # finite dataset instead of endless sampling

การสับข้อมูล

ทั้ง TFRecordDataset และ MultiTFRecordDataset จะสับ (shuffle) ข้อมูลโดยอัตโนมัติเมื่อคุณระบุขนาดคิว (shuffle_queue_size)

 dataset = TFRecordDataset(..., shuffle_queue_size=1024)  # supplying a queue size turns shuffling on

การแปลงข้อมูลอินพุต

คุณสามารถส่งฟังก์ชันเป็นอาร์กิวเมนต์ transform เพื่อประมวลผลฟีเจอร์ภายหลัง (post-processing) ก่อนที่จะส่งคืนได้ ตัวอย่างเช่น ใช้ถอดรหัสภาพ ปรับค่าสีให้อยู่ในช่วงที่กำหนด หรือเติม (pad) ลำดับที่มีความยาวไม่คงที่

 import tfrecord
import cv2

def decode_image(features):
    """Replace the encoded bytes under ``features["image"]`` with a decoded image.

    The dictionary is modified in place and also returned, so the function
    can be passed as a ``transform`` callback.
    """
    encoded = features["image"]
    # IMREAD_UNCHANGED (-1) decodes the image with its stored channels as-is.
    features["image"] = cv2.imdecode(encoded, cv2.IMREAD_UNCHANGED)
    return features


description = {
    # The type name is "byte" (not "bytes"), as in the other examples here.
    "image": "byte",
}

# decode_image runs on every record before it is returned.
dataset = tfrecord.torch.TFRecordDataset(
    "/tmp/data.tfrecord",
    index_path=None,
    description=description,
    transform=decode_image,
)

# Fetch the first transformed record and show it.
data = next(iter(dataset))
print(data)

เขียนบันทึก tf.example ใน Python

 import tfrecord

writer = tfrecord.TFRecordWriter("/tmp/data.tfrecord")
# Each feature is given as a (value, type name) pair; image_bytes, label and
# index are placeholders for the caller's own data.
record = {
    "image": (image_bytes, "byte"),
    "label": (label, "float"),
    "index": (index, "int"),
}
writer.write(record)
writer.close()

อ่านบันทึก tf.example ใน Python

 import tfrecord

# Maps each feature name to its type name.
description = {
    "image": "byte",
    "label": "float",
    "index": "int",
}
# NOTE(review): the second positional argument (None) is presumably the
# index path, as in the dataset examples -- confirm against the library.
loader = tfrecord.tfrecord_loader("/tmp/data.tfrecord", None, description)
for record in loader:
    label = record["label"]
    print(label)

การอ่านและการเขียน tf.train.SequenceExample

SequenceExample สามารถอ่านและเขียนได้ด้วยวิธีการเดียวกับที่แสดงด้านบน โดยเพิ่มอาร์กิวเมนต์พิเศษ (sequence_description สำหรับการอ่าน และ sequence_datum สำหรับการเขียน) ซึ่งทำให้ฟังก์ชันอ่าน/เขียนที่เกี่ยวข้องถือว่าข้อมูลเป็น SequenceExample

การเขียน SequenceExamples ไปยังไฟล์

 import tfrecord

writer = tfrecord.TFRecordWriter("/tmp/data.tfrecord")
# First argument: context features; second argument: sequence features.
# Every feature is a (value, type name) pair.
writer.write(
    {"length": (3, "int"), "label": (1, "int")},
    {
        "tokens": ([[0, 0, 1], [0, 1, 0], [1, 0, 0]], "int"),
        "seq_labels": ([0, 1, 1], "int"),
    },
)
# This sequence has two steps, so its "length" context feature is 2.
writer.write(
    {"length": (2, "int"), "label": (1, "int")},
    {
        "tokens": ([[0, 0, 1], [1, 0, 0]], "int"),
        "seq_labels": ([0, 1], "int"),
    },
)
writer.close()

การอ่าน sequenceExamples ใน Python

การอ่านจาก SequenceExample จะให้ผลลัพธ์ (yield) เป็น tuple ที่มีสององค์ประกอบ คือฟีเจอร์บริบท (context) และฟีเจอร์ลำดับ (sequence)

 import tfrecord

# Type names for the context features and for the sequence features.
context_description = {
    "length": "int",
    "label": "int",
}
sequence_description = {
    "tokens": "int",
    "seq_labels": "int",
}
loader = tfrecord.tfrecord_loader(
    "/tmp/data.tfrecord",
    None,
    context_description,
    sequence_description=sequence_description,
)

# Each item is a (context, sequence features) pair.
for record in loader:
    context, sequence_feats = record
    print(context["label"])
    print(sequence_feats["seq_labels"])

อ่าน sequenceExamples ใน pytorch

ตามที่อธิบายไว้ในหัวข้อ "การแปลงข้อมูลอินพุต" คุณสามารถส่งฟังก์ชันเป็นอาร์กิวเมนต์ transform เพื่อประมวลผลฟีเจอร์ภายหลังได้ ซึ่งควรใช้โดยเฉพาะกับฟีเจอร์ลำดับ เนื่องจากเป็นลำดับที่มีความยาวไม่คงที่ และต้องเติม (pad) ให้มีความยาวเท่ากันก่อนนำไปรวมเป็นแบตช์

 import torch
import numpy as np
from tfrecord . torch . dataset import TFRecordDataset

# Number of rows every sequence feature is padded to.
PAD_WIDTH = 5


def pad_sequence_feats(data):
    """Zero-pad every sequence feature along its first axis to PAD_WIDTH rows.

    ``data`` is a ``(context, features)`` pair. Each value in ``features``
    must be two-dimensional; rows of zeros are appended at the end and the
    second axis is left untouched. ``features`` is updated in place and the
    pair is returned.
    """
    context, features = data
    for name in features:
        sequence = features[name]
        missing_rows = PAD_WIDTH - len(sequence)
        features[name] = np.pad(
            sequence,
            ((0, missing_rows), (0, 0)),
            mode="constant",
        )
    return context, features

# Type names for the context features and for the sequence features.
context_description = {"length": "int", "label": "int"}
# The type name must be exactly "int"; a trailing space does not match the
# type names used in the other examples.
sequence_description = {"tokens": "int", "seq_labels": "int"}
# pad_sequence_feats pads every record's sequence features before batching.
dataset = TFRecordDataset(
    "/tmp/data.tfrecord",
    index_path=None,
    description=context_description,
    transform=pad_sequence_feats,
    sequence_description=sequence_description,
)
loader = torch.utils.data.DataLoader(dataset, batch_size=32)
data = next(iter(loader))
print(data)

หรือคุณอาจเลือกใช้ collate_fn ที่กำหนดเองเพื่อประกอบแบตช์ ตัวอย่างเช่น เพื่อเติม (pad) แบบไดนามิกตามความยาวสูงสุดของแต่ละแบตช์

 import torch
import numpy as np
from tfrecord . torch . dataset import TFRecordDataset

def collate_fn(batch):
    """Assemble ``(context, sequence features)`` samples into one batch.

    Contexts go through the default collate function. Each sequence feature
    is converted to a tensor per sample and then padded, batch first, to the
    longest sequence in the batch.

    Returns a ``(collated context, dict of padded tensors)`` pair.
    """
    from torch.nn.utils.rnn import pad_sequence
    from torch.utils.data._utils.collate import default_collate

    contexts, sequences = zip(*batch)

    # Convert every sample's value to a tensor, grouped by feature name.
    tensors_by_key = {}
    for key in sequences[0]:
        tensors_by_key[key] = [torch.Tensor(sample[key]) for sample in sequences]

    collated_context = default_collate(contexts)
    padded = {
        key: pad_sequence(tensors, batch_first=True)
        for key, tensors in tensors_by_key.items()
    }
    return collated_context, padded

# Type names for the context features and for the sequence features.
context_description = {"length": "int", "label": "int"}
# The type name must be exactly "int"; a trailing space does not match the
# type names used in the other examples.
sequence_description = {"tokens": "int", "seq_labels": "int"}
# No fixed-width padding transform here: collate_fn pads each batch to its
# own longest sequence, which is the point of this example.
dataset = TFRecordDataset(
    "/tmp/data.tfrecord",
    index_path=None,
    description=context_description,
    sequence_description=sequence_description,
)
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    collate_fn=collate_fn,
)
data = next(iter(loader))
print(data)