Skip to content

Commit

Permalink
DR-1951: Remove hardcoded hashmap from SearchService (#998)
Browse files Browse the repository at this point in the history
* Remove hardcoded hashmap from SearchService

* Add imports to fix SpotlessCheck

* Add updated SQL query
  • Loading branch information
fboulnois authored Jul 16, 2021
1 parent 9bd7ad9 commit 471ac49
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 69 deletions.
38 changes: 19 additions & 19 deletions ops/query.sql
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
WITH links AS
(SELECT project.datarepo_row_id AS project_datarepo_row_id, links.datarepo_row_id AS links_datarepo_row_id, links_id, links.project_id, project.content AS project_content, links.content AS links_content
FROM
`broad-jade-dev-data.hca_dev_20210513_search_api_snapshot.project` AS project,
`broad-jade-dev-data.hca_dev_20210513_search_api_snapshot.links` AS links
`broad-jade-dev-data.hca_dev_20210614_secret_search_api_snapshot.project` AS project,
`broad-jade-dev-data.hca_dev_20210614_secret_search_api_snapshot.links` AS links
WHERE JSON_VALUE(project.content, '$.project_core.project_short_name') = 'PulmonaryFibrosisGSE135893'
AND project.project_id = links.project_id)
SELECT
GENERATE_UUID() uuid,
links.project_datarepo_row_id AS project_datarepo_row_id,
links.project_id AS project_id,
JSON_EXTRACT_SCALAR(project_content, '$.project_core.project_short_name') AS project_short_name,
JSON_EXTRACT_SCALAR(project_content, '$.project_core.project_title') AS project_title,
JSON_EXTRACT_SCALAR(project_content, '$.project_core.project_description') AS project_description,
JSON_EXTRACT_SCALAR(project_content, '$.project_core.project_short_name') AS tim__rdfsc__label,
JSON_EXTRACT_SCALAR(project_content, '$.project_core.project_title') AS tim__dctc__title,
JSON_EXTRACT_SCALAR(project_content, '$.project_core.project_description') AS tim__dctc__description,
links.links_datarepo_row_id AS links_datarepo_row_id,
links.links_id AS links_id,
JSON_EXTRACT_SCALAR(link, '$.link_type') AS link_type,
Expand All @@ -26,29 +26,29 @@ JSON_EXTRACT_SCALAR(link, '$.entity.entity_id') AS entity_id,
JSON_EXTRACT_SCALAR(files, '$.file_type') AS file_type,
JSON_EXTRACT_SCALAR(files, '$.file_id') AS file_id,
cell_suspension_id AS cell_suspension_id,
JSON_EXTRACT_SCALAR(cell_suspension.content, '$.biomaterial_core.biomaterial_id') AS biosample_id,
JSON_EXTRACT_SCALAR(cell_suspension.content, '$.selected_cell_type') AS cell_type,
JSON_EXTRACT_SCALAR(donor_organism.content, '$.biomaterial_core.biomaterial_id') AS donor_id,
JSON_EXTRACT_SCALAR(donor_organism.content, '$.sex') AS sex,
JSON_EXTRACT_SCALAR(cell_suspension.content, '$.biomaterial_core.biomaterial_id') AS tim__dctc__identifier,
JSON_EXTRACT_SCALAR(cell_suspension.content, '$.selected_cell_type') AS tim__a__terraa__corec__hasa__selecteda__cella__type,
JSON_EXTRACT_SCALAR(donor_organism.content, '$.biomaterial_core.biomaterial_id') AS tim__provc__wasa__deriveda__from,
JSON_EXTRACT_SCALAR(donor_organism.content, '$.sex') AS tim__a__terraa__corec__hasa__sex,
JSON_EXTRACT_SCALAR(donor_organism.content, '$.organism_age') AS organism_age,
JSON_EXTRACT_SCALAR(donor_organism.content, '$.organism_age_unit.text') AS organism_age_unit,
JSON_EXTRACT_SCALAR(donor_organism_species, '$.text') AS genus_species,
JSON_EXTRACT_SCALAR(specimen_from_organism.content, '$.organ.text') AS organ,
JSON_EXTRACT_SCALAR(diseases, '$.text') AS disease,
JSON_EXTRACT_SCALAR(library_preparation_protocol.content, '$.library_construction_method.text') AS library_construction_method_text,
JSON_EXTRACT_SCALAR(donor_organism.content, '$.organism_age_unit.text') AS tim__a__terraa__corec__hasa__agea__unit,
JSON_EXTRACT_SCALAR(donor_organism_species, '$.text') AS tim__a__terraa__corec__hasa__organisma__type,
JSON_EXTRACT_SCALAR(specimen_from_organism.content, '$.organ.text') AS tim__a__terraa__corec__hasa__anatomicala__site,
JSON_EXTRACT_SCALAR(diseases, '$.text') AS tim__a__terraa__corec__hasa__disease,
JSON_EXTRACT_SCALAR(library_preparation_protocol.content, '$.library_construction_method.text') AS tim__a__terraa__corec__hasa__librarya__prep,
JSON_EXTRACT_SCALAR(library_preparation_protocol.content, '$.library_construction_method.ontology') AS library_construction_method_ontology,
JSON_EXTRACT_SCALAR(library_preparation_protocol.content, '$.library_construction_method.ontology_label') AS library_construction_method_ontology_label
FROM links
LEFT JOIN UNNEST(JSON_EXTRACT_ARRAY(links.links_content, '$.links')) AS link
LEFT JOIN UNNEST(JSON_EXTRACT_ARRAY(link, '$.inputs')) AS input
LEFT JOIN UNNEST(JSON_EXTRACT_ARRAY(link, '$.outputs')) AS output
LEFT JOIN UNNEST(JSON_EXTRACT_ARRAY(link, '$.files')) AS files
LEFT JOIN `broad-jade-dev-data.hca_dev_20210513_search_api_snapshot.cell_suspension` AS cell_suspension ON JSON_EXTRACT_SCALAR(input, '$.input_id') = cell_suspension_id
LEFT JOIN `broad-jade-dev-data.hca_dev_20210513_search_api_snapshot.donor_organism` AS donor_organism ON JSON_EXTRACT_SCALAR(input, '$.input_id') = donor_organism_id
LEFT JOIN `broad-jade-dev-data.hca_dev_20210614_secret_search_api_snapshot.cell_suspension` AS cell_suspension ON JSON_EXTRACT_SCALAR(input, '$.input_id') = cell_suspension_id
LEFT JOIN `broad-jade-dev-data.hca_dev_20210614_secret_search_api_snapshot.donor_organism` AS donor_organism ON JSON_EXTRACT_SCALAR(input, '$.input_id') = donor_organism_id
LEFT JOIN UNNEST(JSON_EXTRACT_ARRAY(donor_organism.content, '$.genus_species')) AS donor_organism_species
LEFT JOIN `broad-jade-dev-data.hca_dev_20210513_search_api_snapshot.specimen_from_organism` AS specimen_from_organism ON JSON_EXTRACT_SCALAR(input, '$.input_id') = specimen_from_organism_id
LEFT JOIN `broad-jade-dev-data.hca_dev_20210614_secret_search_api_snapshot.specimen_from_organism` AS specimen_from_organism ON JSON_EXTRACT_SCALAR(input, '$.input_id') = specimen_from_organism_id
LEFT JOIN UNNEST(JSON_EXTRACT_ARRAY(specimen_from_organism.content, '$.diseases')) AS diseases
LEFT JOIN UNNEST(JSON_EXTRACT_ARRAY(link, '$.protocols')) AS protocols
LEFT JOIN `broad-jade-dev-data.hca_dev_20210513_search_api_snapshot.library_preparation_protocol` AS library_preparation_protocol ON JSON_EXTRACT_SCALAR(protocols, '$.protocol_id') = library_preparation_protocol_id
LEFT JOIN `broad-jade-dev-data.hca_dev_20210513_search_api_snapshot.sequence_file` AS sequence_file ON JSON_EXTRACT_SCALAR(output, '$.output_id') = sequence_file_id
LEFT JOIN `broad-jade-dev-data.hca_dev_20210614_secret_search_api_snapshot.library_preparation_protocol` AS library_preparation_protocol ON JSON_EXTRACT_SCALAR(protocols, '$.protocol_id') = library_preparation_protocol_id
LEFT JOIN `broad-jade-dev-data.hca_dev_20210614_secret_search_api_snapshot.sequence_file` AS sequence_file ON JSON_EXTRACT_SCALAR(output, '$.output_id') = sequence_file_id
;
21 changes: 5 additions & 16 deletions src/main/java/bio/terra/app/utils/TimUtils.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package bio.terra.app.utils;

import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up
@@ -50,19 +48,14 @@ public static String encode(String s) {
* Encode the AS columns of a SQL query into corresponding TIM property names
*
* @param sql the SQL string containing AS columns to encode
* @param columnReplacements a map which stores column names as keys and TIM property names as
* values
* @return a modified SQL string containing encoded TIM property names
*/
public static String encodeSqlColumns(String sql, Map<String, String> columnReplacements) {
Pattern regex = Pattern.compile("( [aA][sS] )(\\w+)");
public static String encodeSqlColumns(String sql) {
Pattern regex = Pattern.compile("( [aA][sS] )\\[([\\w.:]+)\\]");
Matcher matches = regex.matcher(sql);
StringBuilder sb = new StringBuilder(sql.length());
while (matches.find()) {
String replacement = columnReplacements.get(matches.group(2));
if (replacement != null) {
matches.appendReplacement(sb, matches.group(1) + TimUtils.encode(replacement));
}
matches.appendReplacement(sb, matches.group(1) + TimUtils.encode(matches.group(2)));
}
matches.appendTail(sb);
return sb.toString();
Expand All @@ -72,18 +65,14 @@ public static String encodeSqlColumns(String sql, Map<String, String> columnRepl
* Replace the TIM fields of a search query with encoded property names.
*
* @param query the search query containing TIM fields to replace
* @param properties a set of TIM property names to validate against
* @return a modified search query containing encoded TIM property names
*/
public static String encodeQueryFields(String query, Set<String> properties) {
public static String encodeQueryFields(String query) {
Pattern regex = Pattern.compile("\\[([\\w.:]+)\\]");
Matcher matches = regex.matcher(query);
StringBuilder sb = new StringBuilder(query.length());
while (matches.find()) {
String match = matches.group(1);
if (properties.contains(match)) {
matches.appendReplacement(sb, TimUtils.encode(match));
}
matches.appendReplacement(sb, TimUtils.encode(matches.group(1)));
}
matches.appendTail(sb);
return sb.toString();
Expand Down
24 changes: 2 additions & 22 deletions src/main/java/bio/terra/service/search/SearchService.java
Original file line number Diff line number Diff line change
Expand Up
@@ -8,12 +8,10 @@
import bio.terra.service.search.exception.SearchException;
import bio.terra.service.snapshot.Snapshot;
import bio.terra.service.tabulardata.google.BigQueryPdao;
import com.google.common.collect.ImmutableMap;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand Down Expand Up
@@ -50,22 +48,6 @@ public class SearchService {
private final BigQueryPdao bigQueryPdao;
private final RestHighLevelClient client;

private static final Map<String, String> columnReplacements =
new ImmutableMap.Builder<String, String>()
.put("biosample_id", "dct:identifier")
.put("donor_id", "prov:wasDerivedFrom")
.put("disease", "TerraCore:hasDisease")
.put("genus_species", "TerraCore:hasOrganismType")
.put("organ", "TerraCore:hasAnatomicalSite")
.put("library_construction_method_text", "TerraCore:hasLibraryPrep")
.put("sex", "TerraCore:hasSex")
.put("project_title", "dct:title")
.put("project_description", "dct:description")
.put("project_short_name", "rdfs:label")
.put("cell_type", "TerraCore:hasSelectedCellType")
.put("organism_age_unit", "TerraCore:hasAgeUnit")
.build();

@Value("${elasticsearch.numShards}")
private int NUM_SHARDS;

Expand Down Expand Up
@@ -148,7 +130,7 @@ private SearchIndexModel getIndexSummary(String indexName) {
public SearchIndexModel indexSnapshot(Snapshot snapshot, SearchIndexRequest searchIndexRequest)
throws InterruptedException {

String sql = TimUtils.encodeSqlColumns(searchIndexRequest.getSql(), columnReplacements);
String sql = TimUtils.encodeSqlColumns(searchIndexRequest.getSql());
List<Map<String, Object>> values = bigQueryPdao.getSnapshotTableData(snapshot, sql);
validateSnapshotDataNotEmpty(values);
String indexName = createEmptyIndex(snapshot);
Expand Down Expand Up
@@ -177,9 +159,7 @@ public SearchQueryResultModel querySnapshot(
searchSourceBuilder.size(limit);
// see
// https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-wrapper-query.html
String query =
TimUtils.encodeQueryFields(
searchQueryRequest.getQuery(), new HashSet<>(columnReplacements.values()));
String query = TimUtils.encodeQueryFields(searchQueryRequest.getQuery());
WrapperQueryBuilder wrapperQuery = QueryBuilders.wrapperQuery(query);
searchSourceBuilder.query(wrapperQuery);

Expand Down
20 changes: 8 additions & 12 deletions src/test/java/bio/terra/service/search/SearchServiceTest.java
Original file line number Diff line number Diff line change
Expand Up
@@ -16,11 +16,9 @@
import bio.terra.service.snapshot.Snapshot;
import bio.terra.service.snapshot.SnapshotTable;
import bio.terra.service.tabulardata.google.BigQueryPdao;
import com.google.common.collect.ImmutableMap;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand Down Expand Up
@@ -48,19 +46,18 @@
@RunWith(MockitoJUnitRunner.StrictStubs.class)
@Category(Unit.class)
public class SearchServiceTest {
private static final String sqlQuery =
"SELECT GENERATE_UUID() uuid, CURRENT_TIMESTAMP() AS example_now"
+ " FROM UNNEST(GENERATE_ARRAY(1, 3));";

private static final String timPropertyName = "example:identifier.now";
private static final String timEncodedName = TimUtils.encode(timPropertyName);

private static final String sqlQuery =
String.format(
"SELECT GENERATE_UUID() uuid, CURRENT_TIMESTAMP() AS [%s]"
+ " FROM UNNEST(GENERATE_ARRAY(1, 3));",
timPropertyName);

private static final String searchQuery =
String.format("{\"query_string\": {\"query\": \"([%s]:0)\"}}", timPropertyName);

private static final Map<String, String> columnReplacements =
new ImmutableMap.Builder<String, String>().put("example_now", timPropertyName).build();

private static final String indexName = "idx-mock";

@Mock private BigQueryPdao bigQueryPdao;
Expand Down Expand Up
@@ -93,16 +90,15 @@ public void timColumnEncodingTest() {
"SELECT GENERATE_UUID() uuid, CURRENT_TIMESTAMP() AS %s"
+ " FROM UNNEST(GENERATE_ARRAY(1, 3));",
timEncodedName);
String actualSql = TimUtils.encodeSqlColumns(sqlQuery, columnReplacements);
String actualSql = TimUtils.encodeSqlColumns(sqlQuery);
assertEquals(expectedSql, actualSql);
}

@Test
public void timFieldEncodingTest() {
String expectedQuery =
String.format("{\"query_string\": {\"query\": \"(%s:0)\"}}", timEncodedName);
String actualQuery =
TimUtils.encodeQueryFields(searchQuery, new HashSet<>(columnReplacements.values()));
String actualQuery = TimUtils.encodeQueryFields(searchQuery);
assertEquals(expectedQuery, actualQuery);
}

Expand Down

0 comments on commit 471ac49

Please sign in to comment.