Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
AWS_ACCESS_KEY_ID= # get from an account admin
AWS_SECRET_ACCESS_KEY= # get from an account admin

ELASTICSEARCH_PASSWORD= # get from terminal output after installing elasticsearch (see docs/installation.rst)
ELASTICSEARCH_API_KEY= # get from terminal output after installing elasticsearch (see docs/installation.rst)
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,5 @@ dmypy.json
racial_covenants_processor/data/*
racial_covenants_processor/staticfiles/
.DS_Store
.env
elastic-start-local/
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ django-compressor = "*"
typing-extensions = "*"
django-libsass = "*"
django-cotton = "*"
django-elasticsearch-dsl = "*"

[dev-packages]
csvkit = "*"
Expand Down
80 changes: 80 additions & 0 deletions apps/deed/documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry

from apps.deed.models import DeedPage


@registry.register_document
class DeedPageDocument(Document):
    """Define the Elasticsearch index of ``DeedPage`` objects.

    Every indexed field is declared explicitly as a class attribute; values
    that live on related models (``workflow``, ``matched_terms``) are
    flattened into nested objects by the ``prepare_*`` hooks below.
    """

    # Identifier fields - use KeywordField for exact matches and filtering
    s3_lookup = fields.KeywordField()
    doc_num = fields.KeywordField()
    doc_alt_id = fields.KeywordField()
    book_id = fields.KeywordField()
    public_uuid = fields.KeywordField()

    # Numeric field
    page_num = fields.IntegerField()

    # Boolean field for filtering
    bool_match = fields.BooleanField()

    # Text field for full-text search
    doc_type = fields.TextField()

    workflow = fields.NestedField(properties={
        'zoon_id': fields.IntegerField(),
        'workflow_name': fields.TextField(),
    })

    matched_terms = fields.NestedField(properties={
        'term': fields.TextField(),
    })

    def prepare_workflow(self, instance):
        """Return the nested ``workflow`` payload for ``instance``.

        Relies on ``get_queryset()`` having applied ``select_related`` so
        that reading ``instance.workflow`` does not issue an extra query.

        Returns:
            A dict with ``zoon_id`` and ``workflow_name``, or ``None`` when
            the deed page has no workflow.
        """
        workflow = instance.workflow
        if not workflow:
            return None
        return {
            'zoon_id': workflow.zoon_id,
            'workflow_name': workflow.workflow_name,
        }

    def prepare_matched_terms(self, instance):
        """Return the nested ``matched_terms`` payload for ``instance``.

        Relies on ``get_queryset()`` having applied ``prefetch_related`` so
        that ``.all()`` is served from the prefetch cache.
        """
        return [{'term': term.term} for term in instance.matched_terms.all()]

    class Index:
        name = "deed_pages"

    class Django:
        model = DeedPage
        # All fields are explicitly defined as class attributes above

        # Assuming that during batch indexing, we do not need to make
        # documents available immediately.
        # Auto refresh slows down batch indexing.
        auto_refresh = False

        # The number of objects to query in a batch and process during
        # indexing. We might have to play with this value to balance indexing
        # time and memory usage (larger value means fewer queries, but more
        # memory usage).
        queryset_pagination = 500

    def get_queryset(self, *args, **kwargs):
        """Optimize the indexing queryset to avoid N+1 queries.

        This is probably the biggest performance win of the indexing setup:
        all needed columns, including those on related models, are gathered
        up front instead of being fetched again for every row indexed.

        The queryset is built on top of the library's own ``get_queryset()``
        (the pattern shown in the django-elasticsearch-dsl documentation) and
        any arguments are forwarded unchanged, so the override keeps working
        if the base method is called with extra parameters.
        """
        # NOTE(review): whether ``prefetch_related`` is honoured during bulk
        # indexing depends on how the installed library/Django versions
        # iterate the queryset - confirm with a query count on a real run.
        return (
            super().get_queryset(*args, **kwargs)
            .select_related('workflow')  # Optimize ForeignKey access
            .prefetch_related('matched_terms')  # Optimize ManyToMany access
            .only(
                'workflow',
                's3_lookup',
                'doc_num',
                'doc_alt_id',
                'book_id',
                'page_num',
                'doc_type',
                'public_uuid',
                'bool_match',
            )
        )
26 changes: 26 additions & 0 deletions apps/deed/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from rest_framework import serializers

from .models import DeedPage
from apps.zoon.models import ZooniverseWorkflow


class WorkflowSerializer(serializers.ModelSerializer):
    """Serialize a ``ZooniverseWorkflow`` for embedding inside other payloads."""

    class Meta:
        model = ZooniverseWorkflow
        # Fields exposed for each workflow in the API output.
        fields = [
            'id',
            'zoon_id',
            'workflow_name',
            'version',
            'slug',
        ]


# Serializers define the API representation.
class DeedPageSerializer(serializers.ModelSerializer):
    """Serialize a ``DeedPage`` with its workflow embedded as a nested object."""

    # Read-only: the nested workflow is output only and is never written
    # through this serializer.
    workflow = WorkflowSerializer(read_only=True)

    class Meta:
        model = DeedPage
        # Fields exposed for each deed page in the API output.
        fields = [
            's3_lookup',
            'thumbnail_preview',
            'workflow',
            'record_link',
            'bool_match',
        ]
203 changes: 174 additions & 29 deletions apps/deed/views.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,186 @@
import copy
from abc import abstractmethod

from elasticsearch_dsl import Document, Q
from rest_framework.decorators import action
from rest_framework.pagination import LimitOffsetPagination
from rest_framework.request import Request
from rest_framework.response import Response
from rest_framework.viewsets import ModelViewSet
from django.shortcuts import render
from django import forms
from django.db.models import Case, When

from haystack.query import SearchQuerySet
from haystack.generic_views import SearchView
from haystack.forms import SearchForm
from .documents import DeedPageDocument
from .serializers import DeedPageSerializer
from .models import DeedPage

from apps.zoon.models import ZooniverseWorkflow
class PaginatedElasticSearchAPIView(ModelViewSet, LimitOffsetPagination):
    """Viewset base class that adds a paginated Elasticsearch ``search`` action.

    Subclasses set ``document_class`` and ``serializer_class`` and implement
    ``generate_search_query()``.
    """

    # Elasticsearch document class whose index is searched.
    document_class: Document = None

    # Page size used when the request carries no ``limit`` parameter.
    _DEFAULT_SEARCH_LIMIT = 10

    @abstractmethod
    def generate_search_query(self, search_terms_list, param_filters):
        """Build the Elasticsearch query for a search request.

        This method should be overridden and return a Q() expression.

        Args:
            search_terms_list: List of non-blank ``search`` values from the
                query string, or ``None`` when no search text was supplied.
            param_filters: The remaining query parameters.

        Raises:
            NotImplementedError: Always, unless overridden. The class is not
                an ABC, so ``@abstractmethod`` alone would not stop a
                subclass from silently inheriting a method returning None.
        """
        raise NotImplementedError(
            "Subclasses must implement generate_search_query()."
        )

    @classmethod
    def _parse_pagination(cls, request):
        """Return ``(limit, offset)`` parsed from the query string.

        Raises:
            ValueError: If either value is not a non-negative integer. The
                message is safe to show to the client.
        """
        parsed = []
        for name, default in (("limit", cls._DEFAULT_SEARCH_LIMIT), ("offset", 0)):
            raw_value = request.query_params.get(name, default)
            try:
                value = int(raw_value)
            except (TypeError, ValueError):
                raise ValueError(f"'{name}' must be an integer.") from None
            if value < 0:
                raise ValueError(f"'{name}' must not be negative.")
            parsed.append(value)
        limit, offset = parsed
        return limit, offset

    @action(methods=["GET"], detail=False, url_path="search")
    def search(self, request: Request):
        """Search the index and return one page of serialized DB objects.

        Query parameters:
            search: Free-text search terms (optional).
            bool_match: ``"true"`` restricts results to matching documents.
            limit / offset: Pagination window (defaults: 10 / 0).

        Returns:
            A ``Response`` with ``count``, ``limit``, ``offset``, ``next``,
            ``previous`` and ``results``; a 400 response for invalid
            pagination parameters; a 500 response if the search fails.
        """
        import logging

        # Bad client input is a 400, not a server error.
        try:
            limit, offset = self._parse_pagination(request)
        except ValueError as exc:
            return Response({"error": str(exc)}, status=400)

        try:
            # QueryDict.copy() yields a mutable deep copy we can pop from.
            params = request.query_params.copy()

            raw_search = params.pop("search", None)
            if raw_search is None:
                candidates = []
            elif isinstance(raw_search, (list, tuple)):
                candidates = list(raw_search)
            else:
                candidates = [raw_search]
            # Blank values (e.g. "?search=") count as "no search text".
            search_terms = [
                term for term in candidates if term and term.strip()
            ] or None

            query = self.generate_search_query(
                search_terms_list=search_terms, param_filters=params
            )

            search = self.document_class.search().query(query)

            # Apply filters from param_filters
            if params.get("bool_match") == "true":
                search = search.filter("term", bool_match=True)

            # Slice the Search object BEFORE execution so Elasticsearch only
            # returns the requested page.
            response = search[offset:offset + limit].execute()

            # ``hits.total`` is either an object exposing ``.value`` or a
            # plain number, depending on the response format.
            hits_total = response.hits.total
            total = getattr(hits_total, "value", hits_total)

            # STEP 1: Extract IDs
            ids = [hit.meta.id for hit in response.hits]

            results = []
            if ids:
                # STEP 2: Hydrate from DB, preserving the ES ordering
                ordering = Case(
                    *[When(id=pk, then=pos) for pos, pk in enumerate(ids)]
                )
                qs = (
                    DeedPage.objects
                    .filter(id__in=ids)
                    .select_related('workflow')
                    .order_by(ordering)
                )

                # STEP 3: Serialize
                results = self.serializer_class(qs, many=True).data

            next_offset = offset + limit if offset + limit < total else None
            # Clamp to 0 so a partial first page is still reachable.
            previous_offset = max(offset - limit, 0) if offset > 0 else None

            # The same response shape is used for empty pages, so an offset
            # past the end still reports the real total.
            return Response({
                "count": total,
                "limit": limit,
                "offset": offset,
                "next": next_offset,
                "previous": previous_offset,
                "results": results,
            })

        except Exception:
            # Top-level boundary: keep the details in the server log instead
            # of echoing internal error text back to the client.
            logging.getLogger(__name__).exception("Search request failed")
            return Response({"error": "Search failed."}, status=500)


class DeedSearchView(SearchView):
template_name = 'search/search.html'
# queryset = SearchQuerySet().all()
form_class = DeedSearchForm
class DeedPageViewSet(PaginatedElasticSearchAPIView):
    """API endpoints for ``DeedPage``, including Elasticsearch-backed search."""

    serializer_class = DeedPageSerializer
    queryset = DeedPage.objects.all()
    document_class = DeedPageDocument

    # Largest value an Elasticsearch ``integer`` field can hold.
    _MAX_ES_INTEGER = 2**31 - 1

    # Escapes the characters that are special inside a wildcard pattern.
    _WILDCARD_ESCAPES = str.maketrans({"\\": "\\\\", "*": "\\*", "?": "\\?"})

    def generate_search_query(self, search_terms_list: list[str], param_filters: dict):
        """Build the OR-combined query used to search deed pages.

        We can play with this function if we don't like the number of
        results returned.

        The search text is matched against:
          * ``doc_type`` (fuzzy full-text match),
          * the keyword identifier fields (case-insensitive substring match),
          * the nested ``workflow.workflow_name`` and ``matched_terms.term``,
          * ``page_num`` when the text is a plain integer.

        Args:
            search_terms_list: ``search`` values from the query string; may
                be ``None`` or empty. Multiple values are joined with spaces.
            param_filters: Remaining query parameters (currently unused).

        Returns:
            ``Q("match_all")`` when there is no usable search text, otherwise
            a ``bool`` query whose ``should`` clauses are the matches above.
        """
        if not search_terms_list:
            return Q("match_all")

        search_terms = (
            " ".join(search_terms_list)
            .replace("\x00", "")
            .replace(",", " ")
            .strip()
        )
        if not search_terms:
            return Q("match_all")

        keyword_fields = [
            "s3_lookup",
            "doc_num",
            "doc_alt_id",
            "book_id",
            "public_uuid",
        ]
        text_fields = ["doc_type"]

        # Escape user-typed wildcard characters so they are matched
        # literally; only the surrounding "*" act as wildcards.
        pattern = f"*{search_terms.translate(self._WILDCARD_ESCAPES)}*"

        queries = [
            # Text field query (only doc_type)
            Q(
                "multi_match",
                query=search_terms,
                fields=text_fields,
                fuzziness="auto",
                minimum_should_match="70%",
            ),
            # Keyword field queries (wildcard for partial matches). Keyword
            # values are indexed verbatim, so match case-insensitively
            # rather than lower-casing only the query side.
            Q(
                "bool",
                should=[
                    Q("wildcard", **{
                        field: {"value": pattern, "case_insensitive": True}
                    })
                    for field in keyword_fields
                ],
                minimum_should_match=1,
            ),
            # Workflow nested query
            Q(
                "nested",
                path="workflow",
                query=Q("match", **{"workflow.workflow_name": search_terms}),
            ),
            # Matched terms nested query
            Q(
                "nested",
                path="matched_terms",
                query=Q("match", **{"matched_terms.term": search_terms}),
            ),
        ]

        # Numeric field query: only for plain ASCII integers that fit the
        # ``integer`` mapping, so Elasticsearch never sees an unparsable or
        # out-of-range number.
        if search_terms.isascii() and search_terms.isdigit():
            page_num = int(search_terms)
            if page_num <= self._MAX_ES_INTEGER:
                queries.append(Q("term", page_num=page_num))

        # Combine all queries with OR
        return Q("bool", should=queries, minimum_should_match=1)
Loading
Loading