diff --git a/gateway/main.py b/gateway/main.py
index f196249..27b0913 100644
--- a/gateway/main.py
+++ b/gateway/main.py
@@ -246,6 +246,7 @@ async def vectors_search_file(
     hnsw_ef: Optional[int] = Form(None),
     exact: bool = Form(False),
     indexed_only: bool = Form(False),
+    filter_metadata_json: Optional[str] = Form(None),
 ):
     data = await file.read()
     fields: Dict[str, Any] = {"limit": int(limit), "exact": exact, "indexed_only": indexed_only}
@@ -255,6 +256,8 @@ async def vectors_search_file(
         fields["collection"] = collection
     if hnsw_ef is not None:
         fields["hnsw_ef"] = int(hnsw_ef)
+    if filter_metadata_json is not None:
+        fields["filter_metadata_json"] = filter_metadata_json
 
     async with httpx.AsyncClient(timeout=VISION_TIMEOUT) as client:
         return await _post_file(client, f"{QDRANT_SVC_URL}/search/file", data, fields)
diff --git a/qdrant/main.py b/qdrant/main.py
index 4206e77..a7276ce 100644
--- a/qdrant/main.py
+++ b/qdrant/main.py
@@ -20,6 +20,9 @@ from qdrant_client.models import (
     OptimizersConfigDiff,
     SearchParams,
     PayloadSchemaType,
+    ScalarQuantization,
+    ScalarQuantizationConfig,
+    ScalarType,
 )
 
 # ---------------------------------------------------------------------------
@@ -298,9 +301,13 @@ def create_collection(req: CollectionRequest):
     if req.name in collections:
         raise HTTPException(409, f"Collection '{req.name}' already exists")
 
+    # Apply the same production defaults as _ensure_collection so all
+    # collections start with tuned HNSW and optimizer settings.
     client.create_collection(
         collection_name=req.name,
         vectors_config=VectorParams(size=req.vector_dim, distance=dist),
+        hnsw_config=HnswConfigDiff(m=16, ef_construct=200, on_disk=False),
+        optimizers_config=OptimizersConfigDiff(indexing_threshold=20000, default_segment_number=4),
     )
     return {"created": req.name, "vector_dim": req.vector_dim, "distance": req.distance}
 
@@ -460,11 +467,22 @@ async def search_file(
     hnsw_ef: Optional[int] = Form(None),
     exact: bool = Form(False),
     indexed_only: bool = Form(False),
+    filter_metadata_json: Optional[str] = Form(None),
 ):
     """Embed an uploaded image via CLIP, then search Qdrant for similar vectors."""
+    import json
+    filter_metadata: Dict[str, Any] = {}
+    if filter_metadata_json:
+        try:
+            filter_metadata = json.loads(filter_metadata_json)
+        except json.JSONDecodeError as exc:
+            raise HTTPException(400, "filter_metadata_json must be valid JSON") from exc
+        # json.loads also accepts arrays, scalars and null; the filter is a dict.
+        if not isinstance(filter_metadata, dict):
+            raise HTTPException(400, "filter_metadata_json must be a JSON object")
     data = await file.read()
     vector = await _embed_bytes(data)
-    return _do_search(vector, int(limit), score_threshold, collection, {}, hnsw_ef, exact, indexed_only)
+    return _do_search(vector, int(limit), score_threshold, collection, filter_metadata, hnsw_ef, exact, indexed_only)
 
 
 @app.post("/search/vector")
@@ -685,6 +703,12 @@ class CollectionConfigRequest(BaseModel):
     hnsw_on_disk: Optional[bool] = Field(default=None, description="Store HNSW graph on disk (saves RAM, slightly slower queries).")
     indexing_threshold: Optional[int] = Field(default=None, ge=0, description="Min payload changes before a segment is indexed.")
     default_segment_number: Optional[int] = Field(default=None, ge=1, le=32, description="Target number of segments for parallelism.")
+    # Scalar quantization — reduces RAM ~4x, often speeds up search on large collections.
+    # Set quantization_type='int8' to enable. Use always_ram=True to keep quantized
+    # vectors in RAM (recommended on VPS with limited memory but fast disk).
+    quantization_type: Optional[str] = Field(default=None, description="Enable scalar quantization: 'int8'. Set to null to keep current setting.")
+    quantization_quantile: float = Field(default=0.99, ge=0.5, le=1.0, description="Quantile of vector component values used to compute quantization bounds; 0.99 excludes the most extreme 1% of values (0.99 recommended).")
+    quantization_always_ram: bool = Field(default=True, description="Keep quantized vectors in RAM even when raw vectors are on disk.")
 
 
 @app.post("/collections/{name}/configure")
@@ -705,7 +729,22 @@ def configure_collection(name: str, req: CollectionConfigRequest):
         "default_segment_number": req.default_segment_number,
     }.items() if v is not None}
 
-    if not hnsw_kwargs and not opt_kwargs:
+    # Build optional scalar quantization config
+    quant_config = None
+    if req.quantization_type is not None:
+        if req.quantization_type.lower() != "int8":
+            raise HTTPException(400, f"Unsupported quantization_type '{req.quantization_type}'. Only 'int8' is supported.")
+        # update_collection takes the ScalarQuantization wrapper model; the
+        # bare ScalarQuantizationConfig only describes its `scalar` field.
+        quant_config = ScalarQuantization(
+            scalar=ScalarQuantizationConfig(
+                type=ScalarType.INT8,
+                quantile=req.quantization_quantile,
+                always_ram=req.quantization_always_ram,
+            ),
+        )
+
+    if not hnsw_kwargs and not opt_kwargs and quant_config is None:
         raise HTTPException(400, "No configuration fields provided")
 
     try:
@@ -713,12 +752,14 @@ def configure_collection(name: str, req: CollectionConfigRequest):
             collection_name=name,
             hnsw_config=HnswConfigDiff(**hnsw_kwargs) if hnsw_kwargs else None,
             optimizers_config=OptimizersConfigDiff(**opt_kwargs) if opt_kwargs else None,
+            quantization_config=quant_config,
         )
         return {
             "collection": name,
             "status": "updated",
             "hnsw_changes": hnsw_kwargs,
             "optimizer_changes": opt_kwargs,
+            "quantization": {"type": req.quantization_type, "quantile": req.quantization_quantile, "always_ram": req.quantization_always_ram} if quant_config else None,
         }
     except Exception as exc:
         raise HTTPException(500, str(exc))