
Um banco de dados vetorial sem servidor em suas mãos.
O pacote flechasdb é a biblioteca principal do sistema FlechasDB, escrita em Rust.
O sistema FlechasDB pretende ser um banco de dados vetorial que se encaixa perfeitamente em ambientes sem servidor. O credo do sistema FlechasDB é simples: ele não requer um servidor dedicado em execução contínua.
*: Fornecido por outro pacote flechasdb-s3 .
Ainda não há um crate publicado. Adicione a seguinte linha ao seu arquivo Cargo.toml:
[dependencies]
flechasdb = { git = "https://github.com/codemonger-io/flechasdb.git" }
Aqui está um exemplo da construção de um banco de dados vetorial a partir de vetores gerados aleatoriamente.
use rand :: Rng ;
use flechasdb :: db :: build :: {
DatabaseBuilder ,
proto :: serialize_database ,
} ;
use flechasdb :: io :: LocalFileSystem ;
use flechasdb :: vector :: BlockVectorSet ;
/// Builds a vector database from randomly generated vectors, tags every
/// vector with a `datum_id` attribute, and serializes the result through
/// `LocalFileSystem::new("testdb")`, printing the elapsed time of each phase.
fn main() {
    const M: usize = 100000; // number of vectors
    const N: usize = 1536; // vector size
    const D: usize = 12; // number of subvector divisions
    const P: usize = 100; // number of partitions
    const C: usize = 256; // number of clusters for product quantization

    // Phase 1: generate M * N random values in one flat buffer.
    let time = std::time::Instant::now();
    // Zero-initialize the buffer rather than `with_capacity` + `unsafe set_len`:
    // handing out `&mut [f32]` over uninitialized memory violates the safety
    // contract of `Vec::set_len`, and `fill` overwrites every element anyway.
    let mut data = vec![0.0f32; M * N];
    let mut rng = rand::thread_rng();
    rng.fill(&mut data[..]);
    let vs = BlockVectorSet::chunk(data, N.try_into().unwrap()).unwrap();
    println!("prepared data in {} s", time.elapsed().as_secs_f32());

    // Phase 2: build the database with the configured partitions, divisions,
    // and clusters.
    let time = std::time::Instant::now();
    let mut db = DatabaseBuilder::new(vs)
        .with_partitions(P.try_into().unwrap())
        .with_divisions(D.try_into().unwrap())
        .with_clusters(C.try_into().unwrap())
        .build()
        .unwrap();
    println!("built database in {} s", time.elapsed().as_secs_f32());

    // Tag each of the M entries with its own index as the `datum_id` attribute.
    for i in 0..M {
        db.set_attribute_at(i, ("datum_id", i as u64)).unwrap();
    }

    // Phase 3: serialize the database.
    let time = std::time::Instant::now();
    serialize_database(&db, &mut LocalFileSystem::new("testdb")).unwrap();
    println!("serialized database in {} s", time.elapsed().as_secs_f32());
}
Você pode encontrar o exemplo completo na pasta examples/build-random .
FYI: Demorou um pouco na minha máquina (Apple M1 Pro, 32 GB de RAM, 1 TB SSD).
prepared data in 0.9123601 s
built database in 906.51526 s
serialized database in 0.14329213 s
Aqui está um exemplo de carregamento de um banco de dados vetorial e de consulta dos k vizinhos mais próximos (k-NN) de um vetor gerado aleatoriamente.
use rand :: Rng ;
use std :: env :: args ;
use std :: path :: Path ;
use flechasdb :: db :: stored :: { Database , LoadDatabase } ;
use flechasdb :: io :: LocalFileSystem ;
/// Loads a stored vector database from the path given as the first
/// command-line argument and queries the k-nearest neighbors of a randomly
/// generated vector twice, printing timings and the results of each round.
///
/// Panics if no database path is given or if any load/query step fails.
fn main() {
    const K: usize = 10; // k-nearest neighbors
    const NPROBE: usize = 5; // number of partitions to query

    // Load the database: the parent directory backs the file system and the
    // file name is passed as the second argument of `load_database`.
    let time = std::time::Instant::now();
    let db_path = args().nth(1).expect("no db path given");
    let db_path = Path::new(&db_path);
    let db = Database::<f32, _>::load_database(
        LocalFileSystem::new(db_path.parent().unwrap()),
        db_path.file_name().unwrap().to_str().unwrap(),
    )
    .unwrap();
    println!("loaded database in {} s", time.elapsed().as_secs_f32());

    // Zero-initialize the query vector rather than `with_capacity` +
    // `unsafe set_len`: handing out `&mut [f32]` over uninitialized memory
    // violates the safety contract of `Vec::set_len`, and `fill` overwrites
    // every element anyway.
    let mut qv = vec![0.0f32; db.vector_size()];
    let mut rng = rand::thread_rng();
    rng.fill(&mut qv[..]);

    for r in 0..2 {
        // second round should run faster
        let time = std::time::Instant::now();
        let results = db
            .query(&qv, K.try_into().unwrap(), NPROBE.try_into().unwrap())
            .unwrap();
        println!("[{}] queried k-NN in {} s", r, time.elapsed().as_secs_f32());

        let time = std::time::Instant::now();
        for (i, result) in results.into_iter().enumerate() {
            // getting attributes will incur additional disk reads
            let attr = result.get_attribute("datum_id").unwrap();
            // The leading escape is a tab; a bare " t " would print a literal
            // `t`, which the sample output does not show.
            println!(
                "\t{}: partition={}, approx. distance²={}, datum_id={:?}",
                i, result.partition_index, result.squared_distance, attr,
            );
        }
        println!(
            "[{}] printed results in {} s",
            r,
            time.elapsed().as_secs_f32(),
        );
    }
}
Você pode encontrar o exemplo completo na pasta examples/query-sync .
FYI: Saídas na minha máquina (Apple M1 Pro, 32 GB de RAM, 1 TB SSD):
loaded database in 0.000142083 s
[0] queried k-NN in 0.0078015 s
0: partition=95, approx. distance²=126.23533, datum_id=Some(Uint64(90884))
1: partition=29, approx. distance²=127.76597, datum_id=Some(Uint64(30864))
2: partition=95, approx. distance²=127.80611, datum_id=Some(Uint64(75236))
3: partition=56, approx. distance²=127.808174, datum_id=Some(Uint64(27890))
4: partition=25, approx. distance²=127.85459, datum_id=Some(Uint64(16417))
5: partition=95, approx. distance²=127.977425, datum_id=Some(Uint64(70910))
6: partition=25, approx. distance²=128.06209, datum_id=Some(Uint64(3237))
7: partition=95, approx. distance²=128.22603, datum_id=Some(Uint64(41942))
8: partition=79, approx. distance²=128.26906, datum_id=Some(Uint64(89799))
9: partition=25, approx. distance²=128.27995, datum_id=Some(Uint64(6593))
[0] printed results in 0.003392833 s
[1] queried k-NN in 0.001475625 s
0: partition=95, approx. distance²=126.23533, datum_id=Some(Uint64(90884))
1: partition=29, approx. distance²=127.76597, datum_id=Some(Uint64(30864))
2: partition=95, approx. distance²=127.80611, datum_id=Some(Uint64(75236))
3: partition=56, approx. distance²=127.808174, datum_id=Some(Uint64(27890))
4: partition=25, approx. distance²=127.85459, datum_id=Some(Uint64(16417))
5: partition=95, approx. distance²=127.977425, datum_id=Some(Uint64(70910))
6: partition=25, approx. distance²=128.06209, datum_id=Some(Uint64(3237))
7: partition=95, approx. distance²=128.22603, datum_id=Some(Uint64(41942))
8: partition=79, approx. distance²=128.26906, datum_id=Some(Uint64(89799))
9: partition=25, approx. distance²=128.27995, datum_id=Some(Uint64(6593))
[1] printed results in 0.0000215 s
Aqui está um exemplo de carregamento assíncrono de um banco de dados vetorial e de consulta dos k-NN de um vetor gerado aleatoriamente.
use rand :: Rng ;
use std :: env :: args ;
use std :: path :: Path ;
use flechasdb :: asyncdb :: io :: LocalFileSystem ;
use flechasdb :: asyncdb :: stored :: { Database , LoadDatabase } ;
/// Asynchronously loads a stored vector database from the path given as the
/// first command-line argument and queries the k-nearest neighbors of a
/// randomly generated vector twice, printing timings and the results of each
/// round.
///
/// Panics if no database path is given or if any load/query step fails.
#[tokio::main]
async fn main() {
    const K: usize = 10; // k-nearest neighbors
    const NPROBE: usize = 5; // number of partitions to search

    // Load the database: the parent directory backs the file system and the
    // file name is passed as the second argument of `load_database`.
    let time = std::time::Instant::now();
    let db_path = args().nth(1).expect("missing db path");
    let db_path = Path::new(&db_path);
    let db = Database::<f32, _>::load_database(
        LocalFileSystem::new(db_path.parent().unwrap()),
        db_path.file_name().unwrap().to_str().unwrap(),
    )
    .await
    .unwrap();
    println!("loaded database in {} s", time.elapsed().as_secs_f32());

    // Zero-initialize the query vector rather than `with_capacity` +
    // `unsafe set_len`: handing out `&mut [f32]` over uninitialized memory
    // violates the safety contract of `Vec::set_len`, and `fill` overwrites
    // every element anyway.
    let mut qv = vec![0.0f32; db.vector_size()];
    let mut rng = rand::thread_rng();
    rng.fill(&mut qv[..]);

    for r in 0..2 {
        // second round should run faster
        let time = std::time::Instant::now();
        let results = db
            .query(&qv, K.try_into().unwrap(), NPROBE.try_into().unwrap())
            .await
            .unwrap();
        println!("[{}] queried k-NN in {} s", r, time.elapsed().as_secs_f32());

        let time = std::time::Instant::now();
        for (i, result) in results.into_iter().enumerate() {
            // getting attributes will incur additional disk reads
            let attr = result.get_attribute("datum_id").await.unwrap();
            // The leading escape is a tab; a bare " t " would print a literal
            // `t`, which the sample output does not show.
            println!(
                "\t{}: partition={}, approx. distance²={}, datum_id={:?}",
                i, result.partition_index, result.squared_distance, attr,
            );
        }
        println!(
            "[{}] printed results in {} s",
            r,
            time.elapsed().as_secs_f32(),
        );
    }
}
O exemplo completo está na pasta examples/query-async .
FYI: Saídas na minha máquina (Apple M1 Pro, 32 GB de RAM, 1 TB SSD):
loaded database in 0.000170959 s
[0] queried k-NN in 0.008041208 s
0: partition=67, approx. distance²=128.50703, datum_id=Some(Uint64(69632))
1: partition=9, approx. distance²=129.98079, datum_id=Some(Uint64(73093))
2: partition=9, approx. distance²=130.10867, datum_id=Some(Uint64(7536))
3: partition=20, approx. distance²=130.29523, datum_id=Some(Uint64(67750))
4: partition=67, approx. distance²=130.71976, datum_id=Some(Uint64(77054))
5: partition=9, approx. distance²=130.80556, datum_id=Some(Uint64(93180))
6: partition=9, approx. distance²=130.90681, datum_id=Some(Uint64(22473))
7: partition=9, approx. distance²=130.94006, datum_id=Some(Uint64(40167))
8: partition=67, approx. distance²=130.9795, datum_id=Some(Uint64(8590))
9: partition=9, approx. distance²=131.03018, datum_id=Some(Uint64(53138))
[0] printed results in 0.00194175 s
[1] queried k-NN in 0.000789417 s
0: partition=67, approx. distance²=128.50703, datum_id=Some(Uint64(69632))
1: partition=9, approx. distance²=129.98079, datum_id=Some(Uint64(73093))
2: partition=9, approx. distance²=130.10867, datum_id=Some(Uint64(7536))
3: partition=20, approx. distance²=130.29523, datum_id=Some(Uint64(67750))
4: partition=67, approx. distance²=130.71976, datum_id=Some(Uint64(77054))
5: partition=9, approx. distance²=130.80556, datum_id=Some(Uint64(93180))
6: partition=9, approx. distance²=130.90681, datum_id=Some(Uint64(22473))
7: partition=9, approx. distance²=130.94006, datum_id=Some(Uint64(40167))
8: partition=67, approx. distance²=130.9795, datum_id=Some(Uint64(8590))
9: partition=9, approx. distance²=131.03018, datum_id=Some(Uint64(53138))
[1] printed results in 0.000011084 s
Há uma referência em dados mais realistas.
https://codemonger-io.github.io/flechasdb/api/flechasdb/
flechasdb implementa o IndexIVFPQ descrito neste artigo.
flechasdb implementa o k-means++ para inicializar os centroides do agrupamento k-means ingênuo (naïve).
TBD
cargo build
cargo doc --lib --no-deps --release
Pinecone
Banco de dados vetorial totalmente gerenciado.
Milvus
Banco de dados vetorial de código aberto com muitos recursos.
Lancedb
Uma de suas características também é ser sem servidor, e seu núcleo é escrito em Rust!
MIT
O seguinte material do Codemonger é licenciado sob CC BY-SA 4.0: