
여러분의 손 안에 있는 서버리스 벡터 데이터베이스.
flechasdb 패키지는 Rust로 작성된 FlechasDB 시스템의 핵심 라이브러리입니다.
FlechasDB 시스템은 서버리스 환경에 완벽하게 맞는 벡터 데이터베이스가 되는 것을 목표로 합니다. FlechasDB 시스템의 신조는 간단합니다. 전용 서버를 지속적으로 실행할 필요가 없습니다.
*: 다른 패키지인 flechasdb-s3에서 제공됩니다.
아직 크레이트(crate)가 게시되지 않았습니다. Cargo.toml 파일에 다음 줄을 추가하십시오.
[dependencies]
flechasdb = { git = "https://github.com/codemonger-io/flechasdb.git" }
다음은 무작위로 생성된 벡터로 벡터 데이터베이스를 구축하는 예입니다.
use rand :: Rng ;
use flechasdb :: db :: build :: {
DatabaseBuilder ,
proto :: serialize_database ,
} ;
use flechasdb :: io :: LocalFileSystem ;
use flechasdb :: vector :: BlockVectorSet ;
/// Builds a vector database from randomly generated vectors, tags every
/// vector with a `datum_id` attribute, and serializes the database through a
/// `LocalFileSystem` created with `"testdb"`.
///
/// The time taken to prepare the data, build the database, and serialize it
/// is printed to standard output.
fn main() {
    const M: usize = 100000; // number of vectors
    const N: usize = 1536; // vector size
    const D: usize = 12; // number of subvector divisions
    const P: usize = 100; // number of partitions
    const C: usize = 256; // number of clusters for product quantization

    let time = std::time::Instant::now();
    // Zero-initialize the buffer instead of calling `set_len` on spare
    // capacity: `Vec::set_len` requires the exposed elements to be
    // initialized, and slicing uninitialized `f32`s is undefined behavior.
    // Every element is overwritten by `fill` right below.
    let mut data = vec![0.0f32; M * N];
    let mut rng = rand::thread_rng();
    rng.fill(&mut data[..]);
    let vs = BlockVectorSet::chunk(data, N.try_into().unwrap()).unwrap();
    println!("prepared data in {} s", time.elapsed().as_secs_f32());

    let time = std::time::Instant::now();
    let mut db = DatabaseBuilder::new(vs)
        .with_partitions(P.try_into().unwrap())
        .with_divisions(D.try_into().unwrap())
        .with_clusters(C.try_into().unwrap())
        .build()
        .unwrap();
    println!("built database in {} s", time.elapsed().as_secs_f32());

    // Attach the vector's own index as its `datum_id` attribute.
    for i in 0..M {
        db.set_attribute_at(i, ("datum_id", i as u64)).unwrap();
    }

    let time = std::time::Instant::now();
    serialize_database(&db, &mut LocalFileSystem::new("testdb")).unwrap();
    println!("serialized database in {} s", time.elapsed().as_secs_f32());
}
examples/build-random 폴더에서 전체 예제를 찾을 수 있습니다.
참고: 제 컴퓨터(Apple M1 Pro, 32GB RAM, 1TB SSD)에서 걸린 시간은 다음과 같습니다.
prepared data in 0.9123601 s
built database in 906.51526 s
serialized database in 0.14329213 s
다음은 벡터 데이터베이스를 로드하고 무작위로 생성된 벡터에 대해 k-최근접 이웃(k-NN)을 쿼리하는 예입니다.
use rand :: Rng ;
use std :: env :: args ;
use std :: path :: Path ;
use flechasdb :: db :: stored :: { Database , LoadDatabase } ;
use flechasdb :: io :: LocalFileSystem ;
/// Loads the database whose path is given as the first command-line argument
/// and queries it twice for the k-nearest neighbors of one randomly generated
/// vector.
///
/// Timings for loading, querying, and printing the results (including the
/// `datum_id` attribute of each result) are written to standard output.
///
/// # Panics
///
/// Panics if no database path is given, or if loading, querying, or reading
/// an attribute fails.
fn main() {
    const K: usize = 10; // k-nearest neighbors
    const NPROBE: usize = 5; // number of partitions to query

    let time = std::time::Instant::now();
    let db_path = args().nth(1).expect("no db path given");
    let db_path = Path::new(&db_path);
    let db = Database::<f32, _>::load_database(
        LocalFileSystem::new(db_path.parent().unwrap()),
        db_path.file_name().unwrap().to_str().unwrap(),
    )
    .unwrap();
    println!("loaded database in {} s", time.elapsed().as_secs_f32());

    // Zero-initialize the query vector instead of calling `set_len` on spare
    // capacity: `Vec::set_len` requires the exposed elements to be
    // initialized, and slicing uninitialized `f32`s is undefined behavior.
    // Every element is overwritten by `fill` right below.
    let mut qv = vec![0.0f32; db.vector_size()];
    let mut rng = rand::thread_rng();
    rng.fill(&mut qv[..]);

    for r in 0..2 {
        // second round should run faster
        let time = std::time::Instant::now();
        let results = db
            .query(&qv, K.try_into().unwrap(), NPROBE.try_into().unwrap())
            .unwrap();
        println!("[{}] queried k-NN in {} s", r, time.elapsed().as_secs_f32());

        let time = std::time::Instant::now();
        for (i, result) in results.into_iter().enumerate() {
            // getting attributes will incur additional disk reads
            let attr = result.get_attribute("datum_id").unwrap();
            // Each result line is indented with a tab (`\t`).
            println!(
                "\t{}: partition={}, approx. distance²={}, datum_id={:?}",
                i, result.partition_index, result.squared_distance, attr,
            );
        }
        println!(
            "[{}] printed results in {} s",
            r,
            time.elapsed().as_secs_f32(),
        );
    }
}
examples/query-sync 폴더에서 전체 예제를 찾을 수 있습니다.
참고: 제 컴퓨터(Apple M1 Pro, 32GB RAM, 1TB SSD)에서의 출력은 다음과 같습니다.
loaded database in 0.000142083 s
[0] queried k-NN in 0.0078015 s
0: partition=95, approx. distance²=126.23533, datum_id=Some(Uint64(90884))
1: partition=29, approx. distance²=127.76597, datum_id=Some(Uint64(30864))
2: partition=95, approx. distance²=127.80611, datum_id=Some(Uint64(75236))
3: partition=56, approx. distance²=127.808174, datum_id=Some(Uint64(27890))
4: partition=25, approx. distance²=127.85459, datum_id=Some(Uint64(16417))
5: partition=95, approx. distance²=127.977425, datum_id=Some(Uint64(70910))
6: partition=25, approx. distance²=128.06209, datum_id=Some(Uint64(3237))
7: partition=95, approx. distance²=128.22603, datum_id=Some(Uint64(41942))
8: partition=79, approx. distance²=128.26906, datum_id=Some(Uint64(89799))
9: partition=25, approx. distance²=128.27995, datum_id=Some(Uint64(6593))
[0] printed results in 0.003392833 s
[1] queried k-NN in 0.001475625 s
0: partition=95, approx. distance²=126.23533, datum_id=Some(Uint64(90884))
1: partition=29, approx. distance²=127.76597, datum_id=Some(Uint64(30864))
2: partition=95, approx. distance²=127.80611, datum_id=Some(Uint64(75236))
3: partition=56, approx. distance²=127.808174, datum_id=Some(Uint64(27890))
4: partition=25, approx. distance²=127.85459, datum_id=Some(Uint64(16417))
5: partition=95, approx. distance²=127.977425, datum_id=Some(Uint64(70910))
6: partition=25, approx. distance²=128.06209, datum_id=Some(Uint64(3237))
7: partition=95, approx. distance²=128.22603, datum_id=Some(Uint64(41942))
8: partition=79, approx. distance²=128.26906, datum_id=Some(Uint64(89799))
9: partition=25, approx. distance²=128.27995, datum_id=Some(Uint64(6593))
[1] printed results in 0.0000215 s
다음은 벡터 데이터베이스를 비동기로 로드하고 무작위로 생성된 벡터에 대해 k-NN을 쿼리하는 예입니다.
use rand :: Rng ;
use std :: env :: args ;
use std :: path :: Path ;
use flechasdb :: asyncdb :: io :: LocalFileSystem ;
use flechasdb :: asyncdb :: stored :: { Database , LoadDatabase } ;
/// Asynchronously loads the database whose path is given as the first
/// command-line argument and queries it twice for the k-nearest neighbors of
/// one randomly generated vector.
///
/// Timings for loading, querying, and printing the results (including the
/// `datum_id` attribute of each result) are written to standard output.
///
/// # Panics
///
/// Panics if no database path is given, or if loading, querying, or reading
/// an attribute fails.
#[tokio::main]
async fn main() {
    const K: usize = 10; // k-nearest neighbors
    const NPROBE: usize = 5; // number of partitions to search

    let time = std::time::Instant::now();
    let db_path = args().nth(1).expect("missing db path");
    let db_path = Path::new(&db_path);
    let db = Database::<f32, _>::load_database(
        LocalFileSystem::new(db_path.parent().unwrap()),
        db_path.file_name().unwrap().to_str().unwrap(),
    )
    .await
    .unwrap();
    println!("loaded database in {} s", time.elapsed().as_secs_f32());

    // Zero-initialize the query vector instead of calling `set_len` on spare
    // capacity: `Vec::set_len` requires the exposed elements to be
    // initialized, and slicing uninitialized `f32`s is undefined behavior.
    // Every element is overwritten by `fill` right below.
    let mut qv = vec![0.0f32; db.vector_size()];
    let mut rng = rand::thread_rng();
    rng.fill(&mut qv[..]);

    for r in 0..2 {
        // second round should run faster
        let time = std::time::Instant::now();
        let results = db
            .query(&qv, K.try_into().unwrap(), NPROBE.try_into().unwrap())
            .await
            .unwrap();
        println!("[{}] queried k-NN in {} s", r, time.elapsed().as_secs_f32());

        let time = std::time::Instant::now();
        for (i, result) in results.into_iter().enumerate() {
            // getting attributes will incur additional disk reads
            let attr = result.get_attribute("datum_id").await.unwrap();
            // Each result line is indented with a tab (`\t`).
            println!(
                "\t{}: partition={}, approx. distance²={}, datum_id={:?}",
                i, result.partition_index, result.squared_distance, attr,
            );
        }
        println!(
            "[{}] printed results in {} s",
            r,
            time.elapsed().as_secs_f32(),
        );
    }
}
완전한 예는 examples/query-async 폴더에 있습니다.
참고: 제 컴퓨터(Apple M1 Pro, 32GB RAM, 1TB SSD)에서의 출력은 다음과 같습니다.
loaded database in 0.000170959 s
[0] queried k-NN in 0.008041208 s
0: partition=67, approx. distance²=128.50703, datum_id=Some(Uint64(69632))
1: partition=9, approx. distance²=129.98079, datum_id=Some(Uint64(73093))
2: partition=9, approx. distance²=130.10867, datum_id=Some(Uint64(7536))
3: partition=20, approx. distance²=130.29523, datum_id=Some(Uint64(67750))
4: partition=67, approx. distance²=130.71976, datum_id=Some(Uint64(77054))
5: partition=9, approx. distance²=130.80556, datum_id=Some(Uint64(93180))
6: partition=9, approx. distance²=130.90681, datum_id=Some(Uint64(22473))
7: partition=9, approx. distance²=130.94006, datum_id=Some(Uint64(40167))
8: partition=67, approx. distance²=130.9795, datum_id=Some(Uint64(8590))
9: partition=9, approx. distance²=131.03018, datum_id=Some(Uint64(53138))
[0] printed results in 0.00194175 s
[1] queried k-NN in 0.000789417 s
0: partition=67, approx. distance²=128.50703, datum_id=Some(Uint64(69632))
1: partition=9, approx. distance²=129.98079, datum_id=Some(Uint64(73093))
2: partition=9, approx. distance²=130.10867, datum_id=Some(Uint64(7536))
3: partition=20, approx. distance²=130.29523, datum_id=Some(Uint64(67750))
4: partition=67, approx. distance²=130.71976, datum_id=Some(Uint64(77054))
5: partition=9, approx. distance²=130.80556, datum_id=Some(Uint64(93180))
6: partition=9, approx. distance²=130.90681, datum_id=Some(Uint64(22473))
7: partition=9, approx. distance²=130.94006, datum_id=Some(Uint64(40167))
8: partition=67, approx. distance²=130.9795, datum_id=Some(Uint64(8590))
9: partition=9, approx. distance²=131.03018, datum_id=Some(Uint64(53138))
[1] printed results in 0.000011084 s
보다 현실적인 데이터에 대한 벤치마크가 있습니다.
https://codemonger-io.github.io/flechasdb/api/flechasdb/
flechasdb는 이 기사에 설명된 IndexIVFPQ를 구현합니다.
flechasdb는 naïve k-means 클러스터링의 중심(centroid)을 초기화하기 위해 k-means++를 구현합니다.
TBD
cargo build
cargo doc --lib --no-deps --release
Pinecone
완전히 관리되는 벡터 데이터베이스.
Milvus
많은 기능을 갖춘 오픈 소스 벡터 데이터베이스.
LanceDB
기능 중 하나가 서버리스이며, 코어는 Rust로 작성되어 있습니다!
MIT
codemonger의 다음 자료는 CC BY-SA 4.0에 따라 라이선스가 부여됩니다.