@@ -114,6 +114,14 @@ def prepare_table( # type: ignore[override]
114114 connection = connection ,
115115 )
116116 return table
117+ # To make table reflection work properly with pgvector,
118+ # the module needs to be imported beforehand.
119+ try :
120+ from pgvector .sqlalchemy import Vector # noqa: F401
121+ except ImportError :
122+ self .logger .debug (
123+ "Unable to handle pgvector's `Vector` type. Please install `pgvector`."
124+ )
117125 meta .reflect (connection , only = [table_name ])
118126 table = meta .tables [
119127 full_table_name
@@ -277,6 +285,51 @@ def pick_individual_type(jsonschema_type: dict):
277285 if "object" in jsonschema_type ["type" ]:
278286 return JSONB ()
279287 if "array" in jsonschema_type ["type" ]:
288+ # Select between different kinds of `ARRAY` data types.
289+ #
290+ # This currently leverages an unspecified definition for the Singer SCHEMA,
291+ # using the `additionalProperties` attribute to convey additional type
292+ # information, agnostic of the target database.
293+ #
294+ # In this case, it is about telling different kinds of `ARRAY` types apart:
295+ # Either it is a vanilla `ARRAY`, to be stored into a `jsonb[]` type, or,
296+ # alternatively, it can be a "vector" kind `ARRAY` of floating point
297+ # numbers, effectively what pgvector is storing in its `VECTOR` type.
298+ #
 299+        # Still, `type: "vector"` is only a surrogate label here: other
 300+        # database systems may implement the same concept with different
 301+        # types, so each adapter needs to translate it accordingly.
302+ """
303+ Schema override rule in `meltano.yml`:
304+
305+ type: "array"
306+ items:
307+ type: "number"
308+ additionalProperties:
309+ storage:
310+ type: "vector"
311+ dim: 4
312+
313+ Produced schema annotation in `catalog.json`:
314+
315+ {"type": "array",
316+ "items": {"type": "number"},
317+ "additionalProperties": {"storage": {"type": "vector", "dim": 4}}}
318+ """
319+ if (
320+ "additionalProperties" in jsonschema_type
321+ and "storage" in jsonschema_type ["additionalProperties" ]
322+ ):
323+ storage_properties = jsonschema_type ["additionalProperties" ]["storage" ]
324+ if (
325+ "type" in storage_properties
326+ and storage_properties ["type" ] == "vector"
327+ ):
328+ # On PostgreSQL/pgvector, use the corresponding type definition
329+ # from its SQLAlchemy dialect.
330+ from pgvector .sqlalchemy import Vector
331+
332+ return Vector (storage_properties ["dim" ])
280333 return ARRAY (JSONB ())
281334 if jsonschema_type .get ("format" ) == "date-time" :
282335 return TIMESTAMP ()
@@ -310,6 +363,13 @@ def pick_best_sql_type(sql_type_array: list):
310363 NOTYPE ,
311364 ]
312365
366+ try :
367+ from pgvector .sqlalchemy import Vector
368+
369+ precedence_order .append (Vector )
370+ except ImportError :
371+ pass
372+
313373 for sql_type in precedence_order :
314374 for obj in sql_type_array :
315375 if isinstance (obj , sql_type ):
@@ -516,7 +576,7 @@ def _adapt_column_type( # type: ignore[override]
516576 return
517577
518578 # Not the same type, generic type or compatible types
519- # calling merge_sql_types for assistnace
579+ # calling merge_sql_types for assistance.
520580 compatible_sql_type = self .merge_sql_types ([current_type , sql_type ])
521581
522582 if str (compatible_sql_type ) == str (current_type ):
0 commit comments