Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Python native SDK (without SQL) #528

Open
gaocegege opened this issue Jul 12, 2024 · 1 comment
Open

feat: Python native SDK (without SQL) #528

gaocegege opened this issue Jul 12, 2024 · 1 comment
Assignees

Comments

@gaocegege
Copy link
Member

No description provided.

@cutecutecat
Copy link
Member

Design of native SDK

Manage Databases

Abilities

  • Connect to database
from sdk import PGVectoClient 
client = PGVectoClient(host="127.0.0.1", port=19530, user_name="postgres", db_name="postgres", password="")

Manage Schema

Concept

Supported data types:

  • basic types
    • str -> TEXT
    • int -> INTEGER
    • float -> REAL
    • datetime -> TIMESTAMP
    • dict -> jsonb
  • vector types
    • vector
    • svector
    • bvector
    • vecf16

Column attributes:

  • is_primary for Field
  • dim for VectorField

Abilities

  • Create a Field (the schema for a column)
  • Collect all Fields into a Schema
  • A Schema can have an optional Partition
from sdk import Field, VectorField, Schema, DataType

id_field = Field(name="id", dtype=DataType.INT, is_primary=True, description="primary id")
age_field = Field(name="age", dtype=DataType.INT, description="age")
embedding_field = VectorField(name="embedding", dtype=DataType.VECTOR, dim=128, description="vector")
position_field = Field(name="position", dtype=DataType.TEXT)

schema = Schema(fields=[id_field, age_field, embedding_field], auto_id=False, description="desc of a collection", partition=None)

Manage Collections

Concept

  • Quick setup mode: creates only the id, vector, and $meta columns

Abilities

  • Quick setup mode and custom mode
  • Drop a collection
# Quick setup mode without schema, with columns: id(int), vector(Vector) and meta(jsonb)
client.create_basic_collection(
    collection_name="quick_setup",
    dimension=5,
)

# Custom mode: create columns by schema
client.create_collection(
    collection_name="customized_setup",
    schema=schema,
)

client.drop_collection(
    collection_name="customized_setup"
)

Data Insert

Abilities

  • Insert list[dict[str, Any]] into collection
data=[
    {"id": 0, "vector": [0.3580376395471989, -0.6023495712049978, 0.18414012509913835, -0.26286205330961354, 0.9029438446296592], "color": "pink_8682"},
    {"id": 1, "vector": [0.19886812562848388, 0.06023560599112088, 0.6976963061752597, 0.2614474506242501, 0.838729485096104], "color": "red_7025"},
]

client.insert(
    collection_name="quick_setup",
    data=data
)

Update and Delete

Abilities

  • Upsert, as global data replacement, like qdrant and milvus
  • Update where condition like filter='color like "blue%"'
  • Delete by filter
    • by id condition like filter="id in [4,5,6]"
    • by string filter='color like "blue%"'
# UPDATE table SET ... WHERE id=3;
# INSERT INTO table (id, ...)
#        SELECT ...
#        WHERE NOT EXISTS (SELECT 1 FROM table WHERE id=3);

# Insert if id doesn't exist, else update
res = client.upsert(
    collection_name='quick_setup',
    data=data
)

# UPDATE table SET ... WHERE color = 'pink_8682';
res = client.update(
    collection_name='quick_setup',
    data= {"vector": [0.3580376395471989, -0.6023495712049978, 0.18414012509913835, -0.26286205330961354, 0.9029438446296592], "color": "pink_8682"},
    filter="color = \"pink_8682\"",
)

# DELETE FROM quick_setup WHERE id = ANY('{18, 19}'::int[])
res = client.delete(
    collection_name="quick_setup",
    ids=[18, 19],
)
res = client.delete(
    collection_name='quick_setup',
    filter='color like "blue%"'
)

Create Index

Concept

  • Metric Types: L2 / IP / COS
  • Index Type: FLAT / IVF_FLAT / HNSW

Abilities

  • Create index for vector columns
  • Creating non-vector indexes is not supported
client.create_vector_index(
    collection_name="customized_setup",
    field_name="my_vector",
    metric_type="IP",
    option=IndexOption(...)
)

client.drop_index(
    index_name="idx"
)

Search

Single-Vector Search

  • Default: returns a JSON object with id and distance for each hit
{
    "id": 0,
    "distance": 1.4093276262283325,
    "entity": {}
},
{
    "id": 4,
    "distance": 0.9902134537696838,
    "entity": {}
},
from sdk import ANNSearchRequest

req = ANNSearchRequest(
    data: Vector | SparseVector | ...,
    field: str,
    metric_type: str,
    limit: int | None,
    filter: str | None,
    range: float | None,
    group_by_field: str | None,
    outputs: List[str] | None,
    distance_alias: str = "distance",
)

# Single-vector search
# SELECT id, emb <-> [1, 1, 1] as distance from t ORDER BY emb <-> [1, 1, 1] LIMIT 5
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", limit=5)

# Search with extra output fields
# SELECT id, emb <-> [1, 1, 1] as distance, color from t ORDER BY emb <-> [1, 1, 1] LIMIT 5
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", limit=5, outputs=["color"])
# SELECT id, emb <-> [1, 1, 1] as dis, distance from t ORDER BY emb <-> [1, 1, 1] LIMIT 5
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", limit=5, outputs=["distance"], distance_alias="dis")

# Filtered search
# SELECT id, emb <-> [1, 1, 1] as distance from t WHERE age > 5 ORDER BY emb <-> [1, 1, 1]
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", filter="age > 5")

# Range search
# SELECT id, emb <-> [1, 1, 1] as distance from t WHERE emb <<->> sphere([1, 1, 1], 0.2) ORDER BY emb <-> [1, 1, 1] LIMIT 5
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", range=0.2, limit=5)

# Group search: https://milvus.io/docs/single-vector-search.md#Grouping-search
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", limit=10, group_by_field="doc_id",
    outputs=["doc_id", "passage_id"])

res = client.search(req)

Hybrid search

from sdk import RRFRanker

rerank = RRFRanker()
reqs = [request_1, request_2]

client.hybrid_search(
    reqs,
    rerank,
    limit=2
)

Iterative Search

# Create iterator
iterator = client.search_iterator(req, batch_size=10)

results = []

# Iter until end
while True:
    result = iterator.next()
    if not result:
        iterator.close()
        break
        
    results.extend(result)

Manage Partitions

Concept

  • Partitions should be created with schema, before collection is created

Abilities

  • multi-tenancy: Hash partition
    • Arbitrary user-id
    • Data Isolation
    • Don't care about partition details
  • User-defined partition: Group & Range
    • Fine-grained control over partition creation and insertion
from sdk.partition import Partition, Hash, In, Range

# Hash partition - Random split inserted rows
# CREATE TABLE partitionA PARTITION OF documents FOR VALUES WITH (MODULUS 3, REMAINDER 0);
p = Partition(
    partition_name="partitionA",
    partition_field="id",
    partition_by=Hash(3, 0)
)

# Group partition - Split discrete data based on distribution
# CREATE TABLE partitionA PARTITION OF documents FOR VALUES IN ('A', 'B');
p = Partition(
    partition_name="partitionA",
    partition_field="alpha",
    partition_by=In(('A', 'B'))
)

# Range partition - Split continuous data based on distribution
# CREATE TABLE partitionA PARTITION OF documents FOR VALUES FROM ('2023-03-01') TO ('2023-04-01');
p = Partition(
    partition_name="partitionA",
    partition_field="day",
    partition_by=Range('2023-03-01', '2023-04-01')
)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants