Skip to content

Commit

Permalink
add q parameter for free-text search (#295)
Browse files Browse the repository at this point in the history
* add q parameter for free-text search

* handle quoted statements, add more tests
  • Loading branch information
hrodmn committed Aug 19, 2024
1 parent 15f42d8 commit 2f00dbe
Show file tree
Hide file tree
Showing 10 changed files with 5,406 additions and 46 deletions.
132 changes: 91 additions & 41 deletions CHANGELOG.md

Large diffs are not rendered by default.

477 changes: 477 additions & 0 deletions src/pgstac/migrations/pgstac.0.9.1-unreleased.sql

Large diffs are not rendered by default.

4,564 changes: 4,564 additions & 0 deletions src/pgstac/migrations/pgstac.unreleased.sql

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/pgstac/pgstac.sql
68 changes: 68 additions & 0 deletions src/pgstac/sql/004_search.sql
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,59 @@ CREATE OR REPLACE FUNCTION partition_query_view(
$$ LANGUAGE SQL IMMUTABLE;


CREATE OR REPLACE FUNCTION q_to_tsquery (input text)
RETURNS tsquery
AS $$
DECLARE
processed_text text;
temp_text text;
quote_array text[];
placeholder text := '@QUOTE@';
BEGIN
-- Extract all quoted phrases and store in array
quote_array := regexp_matches(input, '"[^"]*"', 'g');

-- Replace each quoted part with a unique placeholder if there are any quoted phrases
IF array_length(quote_array, 1) IS NOT NULL THEN
processed_text := input;
FOR i IN array_lower(quote_array, 1) .. array_upper(quote_array, 1) LOOP
processed_text := replace(processed_text, quote_array[i], placeholder || i || placeholder);
END LOOP;
ELSE
processed_text := input;
END IF;

-- Replace non-quoted text using regular expressions

-- , -> |
processed_text := regexp_replace(processed_text, ',(?=(?:[^"]*"[^"]*")*[^"]*$)', ' | ', 'g');

-- and -> &
processed_text := regexp_replace(processed_text, '\s+AND\s+', ' & ', 'gi');

-- or -> |
processed_text := regexp_replace(processed_text, '\s+OR\s+', ' | ', 'gi');

-- +term -> & term
processed_text := regexp_replace(processed_text, '\+([a-zA-Z0-9_]+)', '& \1', 'g');

-- -term -> ! term
processed_text := regexp_replace(processed_text, '\-([a-zA-Z0-9_]+)', '& ! \1', 'g');

-- Replace placeholders back with quoted phrases if there were any
IF array_length(quote_array, 1) IS NOT NULL THEN
FOR i IN array_lower(quote_array, 1) .. array_upper(quote_array, 1) LOOP
processed_text := replace(processed_text, placeholder || i || placeholder, '''' || substring(quote_array[i] from 2 for length(quote_array[i]) - 2) || '''');
END LOOP;
END IF;

-- Print processed_text to the console for debugging purposes
RAISE NOTICE 'processed_text: %', processed_text;

RETURN to_tsquery(processed_text);
END;
$$
LANGUAGE plpgsql;


CREATE OR REPLACE FUNCTION stac_search_to_where(j jsonb) RETURNS text AS $$
Expand All @@ -140,6 +193,7 @@ DECLARE
edate timestamptz;
filterlang text;
filter jsonb := j->'filter';
ft_query tsquery;
BEGIN
IF j ? 'ids' THEN
where_segments := where_segments || format('id = ANY (%L) ', to_text_array(j->'ids'));
Expand All @@ -161,6 +215,20 @@ BEGIN
);
END IF;

IF j ? 'q' THEN
ft_query := q_to_tsquery(j->>'q');
where_segments := where_segments || format(
$quote$
(
to_tsvector('english', content->'properties'->>'description') ||
to_tsvector('english', content->'properties'->>'title') ||
to_tsvector('english', content->'properties'->'keywords')
) @@ %L
$quote$,
ft_query
);
END IF;

geom := stac_geom(j);
IF geom IS NOT NULL THEN
where_segments := where_segments || format('st_intersects(geometry, %L)',geom);
Expand Down
2 changes: 1 addition & 1 deletion src/pgstac/sql/999_version.sql
Original file line number Diff line number Diff line change
@@ -1 +1 @@
SELECT set_version('0.9.1');
SELECT set_version('unreleased');
92 changes: 92 additions & 0 deletions src/pgstac/tests/basic/free_text.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@

SET pgstac.context TO 'on';
SET pgstac."default_filter_lang" TO 'cql2-json';

CREATE TEMP TABLE temp_collections (
id SERIAL PRIMARY KEY,
title TEXT,
description TEXT,
keywords TEXT,
minx NUMERIC,
miny NUMERIC,
maxx NUMERIC,
maxy NUMERIC,
sdt TIMESTAMPTZ,
edt TIMESTAMPTZ
);

INSERT INTO temp_collections (
title, description, keywords, minx, miny, maxx, maxy, sdt, edt
) VALUES
(
'Stranger Things',
'Some teenagers drop out of school to fight monsters',
'monster, scary, dark, 80s',
-180, -90, 180, 90,
'2016-01-01T00:00:00Z',
'2025-12-31T23:59:59Z'
),
(
'The Bear',
'Another story about why you should not start a restaurant',
'restaurant, funny, sad, great',
-180, -90, 180, 90,
'2022-01-01T00:00:00Z',
'2025-12-31T23:59:59Z'
),
(
'Godzilla',
'A large lizard takes its revenge',
'scary, lizard, monster',
-180, -90, 180, 90,
'1954-01-01T00:00:00Z',
null
),
(
'Chefs Table',
'Another great story that make you wonder if you should go to a restaurant',
'restaurant, food, michelin',
-180, -90, 180, 90,
'2019-01-01T00:00:00Z',
'2025-12-31T23:59:59Z'
);

SELECT
create_collection(jsonb_build_object(
'id', format('testcollection_%s', id),
'type', 'Collection',
'title', title,
'description', description,
'extent', jsonb_build_object(
'spatial', jsonb_build_array(jsonb_build_array(minx, miny, maxx, maxy)),
'temporal', jsonb_build_array(jsonb_build_array(sdt, edt))
),
'stac_extensions', jsonb_build_array(),
'keywords', string_to_array(keywords, ', ')
))
FROM temp_collections;

select collection_search('{"q": "monsters"}');

select collection_search('{"q": "lizard"}');

select collection_search('{"q": "scary OR funny"}');

select collection_search('{"q": "(scary AND revenge) OR (funny AND sad)"}');

select collection_search('{"q": "\"great story\""}');

select collection_search('{"q": "monster -school"}');

select collection_search('{"q": "bear or stranger"}');

select collection_search('{"q": "bear OR stranger"}');

select collection_search('{"q": "bear, stranger"}');

select collection_search('{"q": "bear AND stranger"}');

select collection_search('{"q": "bear and stranger"}');

select collection_search('{"q": "\"bear or stranger\""}');

109 changes: 109 additions & 0 deletions src/pgstac/tests/basic/free_text.sql.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
SET pgstac.context TO 'on';
SET
SET pgstac."default_filter_lang" TO 'cql2-json';
SET
CREATE TEMP TABLE temp_collections (
id SERIAL PRIMARY KEY,
title TEXT,
description TEXT,
keywords TEXT,
minx NUMERIC,
miny NUMERIC,
maxx NUMERIC,
maxy NUMERIC,
sdt TIMESTAMPTZ,
edt TIMESTAMPTZ
);
CREATE TABLE
INSERT INTO temp_collections (
title, description, keywords, minx, miny, maxx, maxy, sdt, edt
) VALUES
(
'Stranger Things',
'Some teenagers drop out of school to fight monsters',
'monster, scary, dark, 80s',
-180, -90, 180, 90,
'2016-01-01T00:00:00Z',
'2025-12-31T23:59:59Z'
),
(
'The Bear',
'Another story about why you should not start a restaurant',
'restaurant, funny, sad, great',
-180, -90, 180, 90,
'2022-01-01T00:00:00Z',
'2025-12-31T23:59:59Z'
),
(
'Godzilla',
'A large lizard takes its revenge',
'scary, lizard, monster',
-180, -90, 180, 90,
'1954-01-01T00:00:00Z',
null
),
(
'Chefs Table',
'Another great story that make you wonder if you should go to a restaurant',
'restaurant, food, michelin',
-180, -90, 180, 90,
'2019-01-01T00:00:00Z',
'2025-12-31T23:59:59Z'
);
INSERT 0 4
SELECT
create_collection(jsonb_build_object(
'id', format('testcollection_%s', id),
'type', 'Collection',
'title', title,
'description', description,
'extent', jsonb_build_object(
'spatial', jsonb_build_array(jsonb_build_array(minx, miny, maxx, maxy)),
'temporal', jsonb_build_array(jsonb_build_array(sdt, edt))
),
'stac_extensions', jsonb_build_array(),
'keywords', string_to_array(keywords, ', ')
))
FROM temp_collections;




select collection_search('{"q": "monsters"}');
{"links": [], "collections": [{"id": "testcollection_1", "type": "Collection", "title": "Stranger Things", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2016-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["monster", "scary", "dark", "80s"], "description": "Some teenagers drop out of school to fight monsters", "stac_extensions": []}, {"id": "testcollection_3", "type": "Collection", "title": "Godzilla", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["1954-01-01T00:00:00+00:00", null]]}, "keywords": ["scary", "lizard", "monster"], "description": "A large lizard takes its revenge", "stac_extensions": []}], "numberMatched": 2, "numberReturned": 2}


select collection_search('{"q": "lizard"}');
{"links": [], "collections": [{"id": "testcollection_3", "type": "Collection", "title": "Godzilla", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["1954-01-01T00:00:00+00:00", null]]}, "keywords": ["scary", "lizard", "monster"], "description": "A large lizard takes its revenge", "stac_extensions": []}], "numberMatched": 1, "numberReturned": 1}


select collection_search('{"q": "scary OR funny"}');
{"links": [], "collections": [{"id": "testcollection_1", "type": "Collection", "title": "Stranger Things", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2016-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["monster", "scary", "dark", "80s"], "description": "Some teenagers drop out of school to fight monsters", "stac_extensions": []}, {"id": "testcollection_2", "type": "Collection", "title": "The Bear", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2022-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["restaurant", "funny", "sad", "great"], "description": "Another story about why you should not start a restaurant", "stac_extensions": []}, {"id": "testcollection_3", "type": "Collection", "title": "Godzilla", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["1954-01-01T00:00:00+00:00", null]]}, "keywords": ["scary", "lizard", "monster"], "description": "A large lizard takes its revenge", "stac_extensions": []}], "numberMatched": 3, "numberReturned": 3}

select collection_search('{"q": "(scary AND revenge) OR (funny AND sad)"}');
{"links": [], "collections": [{"id": "testcollection_2", "type": "Collection", "title": "The Bear", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2022-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["restaurant", "funny", "sad", "great"], "description": "Another story about why you should not start a restaurant", "stac_extensions": []}, {"id": "testcollection_3", "type": "Collection", "title": "Godzilla", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["1954-01-01T00:00:00+00:00", null]]}, "keywords": ["scary", "lizard", "monster"], "description": "A large lizard takes its revenge", "stac_extensions": []}], "numberMatched": 2, "numberReturned": 2}

select collection_search('{"q": "\"great story\""}');
{"links": [], "collections": [{"id": "testcollection_4", "type": "Collection", "title": "Chefs Table", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2019-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["restaurant", "food", "michelin"], "description": "Another great story that make you wonder if you should go to a restaurant", "stac_extensions": []}], "numberMatched": 1, "numberReturned": 1}

select collection_search('{"q": "monster -school"}');
{"links": [], "collections": [{"id": "testcollection_3", "type": "Collection", "title": "Godzilla", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["1954-01-01T00:00:00+00:00", null]]}, "keywords": ["scary", "lizard", "monster"], "description": "A large lizard takes its revenge", "stac_extensions": []}], "numberMatched": 1, "numberReturned": 1}

select collection_search('{"q": "bear or stranger"}');
{"links": [], "collections": [{"id": "testcollection_1", "type": "Collection", "title": "Stranger Things", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2016-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["monster", "scary", "dark", "80s"], "description": "Some teenagers drop out of school to fight monsters", "stac_extensions": []}, {"id": "testcollection_2", "type": "Collection", "title": "The Bear", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2022-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["restaurant", "funny", "sad", "great"], "description": "Another story about why you should not start a restaurant", "stac_extensions": []}], "numberMatched": 2, "numberReturned": 2}

select collection_search('{"q": "bear OR stranger"}');
{"links": [], "collections": [{"id": "testcollection_1", "type": "Collection", "title": "Stranger Things", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2016-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["monster", "scary", "dark", "80s"], "description": "Some teenagers drop out of school to fight monsters", "stac_extensions": []}, {"id": "testcollection_2", "type": "Collection", "title": "The Bear", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2022-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["restaurant", "funny", "sad", "great"], "description": "Another story about why you should not start a restaurant", "stac_extensions": []}], "numberMatched": 2, "numberReturned": 2}

select collection_search('{"q": "bear, stranger"}');
{"links": [], "collections": [{"id": "testcollection_1", "type": "Collection", "title": "Stranger Things", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2016-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["monster", "scary", "dark", "80s"], "description": "Some teenagers drop out of school to fight monsters", "stac_extensions": []}, {"id": "testcollection_2", "type": "Collection", "title": "The Bear", "extent": {"spatial": [[-180, -90, 180, 90]], "temporal": [["2022-01-01T00:00:00+00:00", "2025-12-31T23:59:59+00:00"]]}, "keywords": ["restaurant", "funny", "sad", "great"], "description": "Another story about why you should not start a restaurant", "stac_extensions": []}], "numberMatched": 2, "numberReturned": 2}

select collection_search('{"q": "bear AND stranger"}');
{"links": [], "collections": [], "numberMatched": 0, "numberReturned": 0}

select collection_search('{"q": "bear and stranger"}');
{"links": [], "collections": [], "numberMatched": 0, "numberReturned": 0}

select collection_search('{"q": "\"bear or stranger\""}');
{"links": [], "collections": [], "numberMatched": 0, "numberReturned": 0}

4 changes: 2 additions & 2 deletions src/pypgstac/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "pypgstac"
version = "0.9.1"
version = "0.9.1-dev"
description = "Schema, functions and a python library for storing and accessing STAC collections and items in PostgreSQL"
readme = "README.md"
requires-python = ">=3.8"
Expand Down Expand Up @@ -46,7 +46,7 @@ dev = [
"black>=21.7b0",
"mypy>=0.910",
"types-orjson==0.1.1",
"types-pkg-resources",
"types-setuptools",
"ruff==0.0.231",
"pre-commit",
]
Expand Down
2 changes: 1 addition & 1 deletion src/pypgstac/python/pypgstac/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""Version."""
__version__ = "0.9.1"
__version__ = "0.9.1-dev"

0 comments on commit 2f00dbe

Please sign in to comment.