【Faiss】基础使用:聚类,降维,量化
聚类import faissimport pickleimport numpy as npimport timex = np.random.random((100000, 2048)).astype('float32')ncentroids = 1000niter = 500verbose = Trued = x.shape[1]start_time = time.time()'''d:向量维度n
·
聚类
import faiss
import pickle
import numpy as np
import time
# Synthetic dataset: 100,000 random float32 vectors of dimension 2048.
x = np.random.random((100000, 2048)).astype('float32')
ncentroids = 1000  # number of cluster centres
niter = 500        # number of k-means iterations
verbose = True     # print progress while iterating
d = x.shape[1]     # vector dimensionality

# Only ask for the GPU when this Faiss build reports at least one device;
# hard-coding gpu=True makes the script fail on CPU-only installs.
use_gpu = hasattr(faiss, "get_num_gpus") and faiss.get_num_gpus() > 0

start_time = time.time()
# Kmeans arguments:
#   d          - vector dimensionality
#   ncentroids - number of cluster centres
#   niter      - number of iterations
#   verbose    - whether to print per-iteration progress
#   gpu        - whether to run on the GPU
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose, gpu=use_gpu)
kmeans.train(x)
train_time = time.time()
print(train_time - start_time)  # training time in seconds

cluster_cents = kmeans.centroids  # the learned cluster centres
cluster_wucha = kmeans.obj        # clustering error ("wucha") reported by Faiss

# Assign every vector to its single nearest centroid: I holds the centroid
# id for each row of x, D the corresponding distance.
D, I = kmeans.index.search(x, 1)
# Distinct centroid ids that received at least one vector (at most
# ncentroids values) -- this is a 1-D array of ids, not the centroid matrix.
print(np.unique(I))
search_time = time.time()
print(search_time - train_time)  # assignment time in seconds

# Alternatively, build a flat L2 index over the data and look up the 15
# vectors closest to each centroid:
# index = faiss.IndexFlatL2(d)
# index.add(x)
# D, I = index.search(kmeans.centroids, 15)
降维(PCA)
将向量从40维降低到10维。
import faiss
import numpy as np
# random training data
# Random training set: 1000 vectors of dimension 40.
train_vecs = np.random.rand(1000, 40).astype('float32')

# PCA transform taking 40-dimensional input to 10-dimensional output.
pca = faiss.PCAMatrix(40, 10)
pca.train(train_vecs)
assert pca.is_trained

# Project the training vectors through the trained transform.
projected = pca.apply_py(train_vecs)

# Sum of squares of each output column; printing it shows that the
# magnitude of the projected columns is decreasing.
column_energy = (projected ** 2).sum(0)
print(column_energy)
如何从PCA对象中得到Numpy中的PCA矩阵?
参见 get_matrix_from_PCA.ipynb 中获取矩阵的示例。这适用于任何
LinearTransform
对象。import faiss import numpy as np # training data xt = np.random.rand(1000, 20).astype('float32') # test data x = np.random.rand(10, 20).astype('float32') # make the PCA matrix pca = faiss.PCAMatrix(20, 10) pca.train(xt) # apply it to test data yref = pca.apply_py(x) # extract matrix + bias from the PCA object # works for any linear transform (OPQ, random rotation, etc.) b = faiss.vector_to_array(pca.b) A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in) # apply transformation ynew = x @ A.T + b # are the vectors the same? print(np.allclose(yref, ynew))
量化
其实就是将数据进行编码,然后用这个编码代替这个数据,从而降低数据对资源的负担。
PQ encoding / decoding
# The ProductQuantizer object can be used to encode vectors into codes and
# to decode codes back into (approximate) vectors.
import numpy as np
import faiss

d = 32     # data dimension
cs = 4     # code size (bytes); passed below as the number of sub-quantizers
nbits = 8  # bits per sub-quantizer index, so each sub-code fills one byte

# train set
nt = 10000
xt = np.random.rand(nt, d).astype('float32')

# dataset to encode (could be same as train)
n = 20000
x = np.random.rand(n, d).astype('float32')

# The third constructor argument is the number of bits used for each
# sub-quantizer index: cs sub-quantizers * 8 bits = cs bytes per vector.
pq = faiss.ProductQuantizer(d, cs, nbits)
pq.train(xt)

# Round trip: encode to compact codes, then decode to reconstructions.
codes = pq.compute_codes(x)
x2 = pq.decode(codes)

# Reconstruction error relative to the energy of the original data.
residual = x - x2
avg_relative_error = (residual ** 2).sum() / (x ** 2).sum()
print(avg_relative_error)
如何从ProductQuantizer对象获取/更改质心?
scalar quantizer
import numpy as np
import faiss
d = 32  # data dimension

# train set
nt = 10000
xt = np.random.rand(nt, d).astype('float32')

# dataset to encode (could be same as train)
n = 20000
x = np.random.rand(n, d).astype('float32')

# QT_8bit allocates 8 bits per dimension (QT_4bit also works)
quantizer_type = faiss.ScalarQuantizer.QT_8bit
sq = faiss.ScalarQuantizer(d, quantizer_type)
sq.train(xt)

# Round trip: encode to compact codes, then decode to reconstructions.
codes = sq.compute_codes(x)
x2 = sq.decode(codes)

# Reconstruction error relative to the energy of the original data.
residual = x - x2
avg_relative_error = (residual ** 2).sum() / (x ** 2).sum()
print(avg_relative_error)
更多推荐
所有评论(0)