UMNLibraries · Kat-Alo · May 21, 2026 · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/.env.example b/.env.example
@@ -0,0 +1,5 @@
+# AWS keys: get from an account admin. Elasticsearch values: get from terminal output after installing elasticsearch (see docs/installation.rst).
+AWS_ACCESS_KEY_ID=
+AWS_SECRET_ACCESS_KEY=
+ELASTICSEARCH_PASSWORD=
+ELASTICSEARCH_API_KEY=
diff --git a/.gitignore b/.gitignore
@@ -135,3 +135,5 @@ dmypy.json
 racial_covenants_processor/data/*
 racial_covenants_processor/staticfiles/
 .DS_Store
+.env
+elastic-start-local/
diff --git a/Pipfile b/Pipfile
@@ -44,6 +44,7 @@ django-compressor = "*"
 typing-extensions = "*"
 django-libsass = "*"
 django-cotton = "*"
+django-elasticsearch-dsl = "*"
 
 [dev-packages]
 csvkit = "*"

diff --git a/apps/deed/documents.py b/apps/deed/documents.py
@@ -0,0 +1,80 @@
+from django_elasticsearch_dsl import Document, fields
+from django_elasticsearch_dsl.registries import registry
+
+from apps.deed.models import DeedPage
+
+
+@registry.register_document
+class DeedPageDocument(Document):
+    """Elasticsearch document describing how DeedPage rows are indexed."""
+
+    # Identifier fields - use KeywordField for exact matches and filtering
+    s3_lookup = fields.KeywordField()
+    doc_num = fields.KeywordField()
+    doc_alt_id = fields.KeywordField()
+    book_id = fields.KeywordField()
+    public_uuid = fields.KeywordField()
+
+    # Numeric field
+    page_num = fields.IntegerField()
+
+    # Boolean field for filtering
+    bool_match = fields.BooleanField()
+
+    # Text field for full-text search
+    doc_type = fields.TextField()
+
+    workflow = fields.NestedField(properties={
+        'zoon_id': fields.IntegerField(),
+        'workflow_name': fields.TextField(),
+    })
+
+    matched_terms = fields.NestedField(properties={
+        'term': fields.TextField(),
+    })
+
+    def get_queryset(self, *args, **kwargs):
+        """Return the indexing queryset with related rows loaded up front.
+
+        Must live on the Document itself: django-elasticsearch-dsl calls
+        ``Document.get_queryset()`` and never looks for it on ``Django``.
+        """
+        return (
+            super().get_queryset(*args, **kwargs)
+            .select_related('workflow')  # Optimize ForeignKey access
+            .prefetch_related('matched_terms')  # Optimize ManyToMany access
+            .only(
+                'workflow', 's3_lookup', 'doc_num', 'doc_alt_id', 'book_id',
+                'page_num', 'doc_type', 'public_uuid', 'bool_match',
+            )
+        )
+
+    def prepare_workflow(self, instance):
+        """Return the workflow sub-document, or None when no workflow is set."""
+        if instance.workflow:
+            return {
+                'zoon_id': instance.workflow.zoon_id,
+                'workflow_name': instance.workflow.workflow_name,
+            }
+        return None
+
+    def prepare_matched_terms(self, instance):
+        """Return one ``{'term': ...}`` entry per related matched term."""
+        # Uses the rows prefetched by get_queryset() during bulk indexing
+        return [{'term': term.term} for term in instance.matched_terms.all()]
+
+    class Index:
+        name = "deed_pages"
+
+    class Django:
+        model = DeedPage
+        # All fields are explicitly defined as class attributes above
+
+        # Assuming that during batch indexing, we do not need to make documents available immediately.
+        # Auto refresh slows down batch indexing.
+        auto_refresh = False
+
+        # The number of objects to query in a batch and process during indexing
+        # We might have to play with this value to balance indexing time and memory usage
+        # (larger value means fewer queries, but more memory usage)
+        queryset_pagination = 500
diff --git a/apps/deed/serializers.py b/apps/deed/serializers.py
@@ -0,0 +1,26 @@
+from rest_framework import serializers
+
+from .models import DeedPage
+from apps.zoon.models import ZooniverseWorkflow
+
+
+class WorkflowSerializer(serializers.ModelSerializer):
+    """Workflow fields embedded inside deed page API responses."""
+    class Meta:
+        model = ZooniverseWorkflow
+        fields = ('id', 'zoon_id', 'workflow_name', 'version', 'slug')
+
+
+# Serializers define the API representation.
+class DeedPageSerializer(serializers.ModelSerializer):
+    """Serialize a DeedPage with its workflow details nested."""
+
+    # Nested representation; not writable through this serializer.
+    workflow = WorkflowSerializer(read_only=True)
+
+    class Meta:
+        model = DeedPage
+        fields = (
+            's3_lookup', 'thumbnail_preview', 'workflow',
+            'record_link', 'bool_match',
+        )
diff --git a/apps/deed/views.py b/apps/deed/views.py
@@ -1,41 +1,186 @@
+import copy
+from abc import abstractmethod
+
+from elasticsearch_dsl import Document, Q
+from rest_framework.decorators import action
+from rest_framework.pagination import LimitOffsetPagination
+from rest_framework.request import Request
+from rest_framework.response import Response
+from rest_framework.viewsets import ModelViewSet
 from django.shortcuts import render
-from django import forms
+from django.db.models import Case, When
 
-from haystack.query import SearchQuerySet
-from haystack.generic_views import SearchView
-from haystack.forms import SearchForm
+from .documents import DeedPageDocument
+from .serializers import DeedPageSerializer
+from .models import DeedPage
 
-from apps.zoon.models import ZooniverseWorkflow
+class PaginatedElasticSearchAPIView(ModelViewSet, LimitOffsetPagination):
+    """Base viewset adding a paginated, Elasticsearch-backed ``search`` action.
+
+    Subclasses set ``document_class`` and implement ``generate_search_query``.
+    """
+    document_class: Document = None
 
+    @abstractmethod
+    def generate_search_query(self, search_terms_list, param_filters):
+        """Build the Elasticsearch query for a search request.
+
+        Must be overridden. ``search_terms_list`` is the list of ``search``
+        values from the query string (None when absent); ``param_filters``
+        holds the remaining query params. Returns a Q() expression.
+        """
 
-class DeedSearchForm(SearchForm):
-    # bool_match = forms.BooleanField(required=False, widget=forms.CheckboxInput(attrs={'class':'col-12-small'}))
-    bool_match = forms.BooleanField(required=False)
-    # end_date = forms.DateField(required=False)
-    workflow = forms.ModelChoiceField(queryset=ZooniverseWorkflow.objects.all(), to_field_name="workflow_name", required=False)
+    @action(methods=["GET"], detail=False, url_path="search")
+    def search(self, request: Request):
+        """Search the index and return one page of rows hydrated from the database.
+
+        Query params: ``search``, ``bool_match`` ("true" filters to matches),
+        ``limit`` (default 10) and ``offset`` (default 0). The body carries
+        ``count``, ``limit``, ``offset``, ``next``, ``previous`` (offsets or
+        None) and ``results``; invalid paging values get a 400.
+        """
+        # Bad paging input is a client error, not a server failure.
+        try:
+            limit = int(request.query_params.get("limit", 10))
+            offset = int(request.query_params.get("offset", 0))
+        except ValueError:
+            return Response(
+                {"error": "limit and offset must be integers"}, status=400
+            )
+        if limit < 1 or offset < 0:
+            return Response(
+                {"error": "limit must be >= 1 and offset must be >= 0"}, status=400
+            )
 
-    def search(self):
-        # First, store the SearchQuerySet received from other processing.
-        sqs = super().search()
+        try:
+            params = copy.deepcopy(request.query_params)
+            raw_search = params.pop("search", None)
+            search_terms = raw_search if isinstance(raw_search, list) else [raw_search] if raw_search else None
 
-        if not self.is_valid():
-            return self.no_query_found()
-
-        if self.cleaned_data['workflow']:
-            sqs = sqs.filter(workflow=self.cleaned_data['workflow'])
+            query = self.generate_search_query(
+                search_terms_list=search_terms, param_filters=params
+            )
+            search = self.document_class.search().query(query)
+            if params.get("bool_match") == "true":
+                search = search.filter("term", bool_match=True)
+
+            # Slice before executing so only this page is requested.
+            response = search[offset:offset + limit].execute()
+            total = (
+                response.hits.total.value
+                if hasattr(response.hits.total, "value")
+                else response.hits.total
+            )
+
+            # Hydrate hits from the database, preserving ES ordering.
+            ids = [hit.meta.id for hit in response.hits]
+            if ids:
+                order = Case(*[When(id=pk, then=pos) for pos, pk in enumerate(ids)])
+                qs = DeedPage.objects.filter(id__in=ids).select_related('workflow')
+                results = self.serializer_class(qs.order_by(order), many=True).data
+            else:
+                # Empty page (no matches, or offset past the end): keep the real
+                # total and the same envelope as a populated page.
+                results = []
 
-        if self.cleaned_data['bool_match']:
-            sqs = sqs.filter(bool_match=self.cleaned_data['bool_match'])
+            next_offset = offset + limit if offset + limit < total else None
+            # A short first page (0 < offset < limit) still has a previous page.
+            previous_offset = max(offset - limit, 0) if offset > 0 else None
 
-        return sqs
+            return Response({
+                "count": total,
+                "limit": limit,
+                "offset": offset,
+                "next": next_offset,
+                "previous": previous_offset,
+                "results": results,
+            })
+        except Exception as e:
+            # NOTE(review): echoes internal error text to the client.
+            return Response({"error": str(e)}, status=500)
 
 
-class DeedSearchView(SearchView):
-    template_name = 'search/search.html'
-    # queryset = SearchQuerySet().all()
-    form_class = DeedSearchForm
+class DeedPageViewSet(PaginatedElasticSearchAPIView):
+    """Search API for deed pages, backed by the DeedPageDocument index."""
+    serializer_class = DeedPageSerializer
+    queryset = DeedPage.objects.all()
+    document_class = DeedPageDocument
 
-    def get_context_data(self, **kwargs):
-        data = super().get_context_data(**kwargs)
-        data['all_workflows'] = ZooniverseWorkflow.objects.all()
-        return data
+    def generate_search_query(self, search_terms_list: list[str], param_filters: dict):
+        """Build an OR query matching the search term against the indexed fields.
+
+        Only the first entry of ``search_terms_list`` is used, and
+        ``param_filters`` is accepted for the base-class interface but not
+        read here. Returns ``match_all`` when the term is missing or blank.
+        """
+        if not search_terms_list:
+            return Q("match_all")
+
+        # Drop NUL bytes and treat commas as separators.
+        search_terms = search_terms_list[0].replace("\x00", "")
+        search_terms = search_terms.replace(",", " ").strip()
+        if not search_terms:
+            # A blank term would otherwise turn into the wildcard "**".
+            return Q("match_all")
+
+        # Separate fields by type
+        keyword_fields = [
+            "s3_lookup",
+            "doc_num",
+            "doc_alt_id",
+            "book_id",
+            "public_uuid",
+        ]
+        text_fields = ["doc_type"]
+        numeric_fields = ["page_num"]
+
+        # Fuzzy full-text match on the analyzed text field(s)
+        queries = [Q(
+            "multi_match",
+            query=search_terms,
+            fields=text_fields,
+            fuzziness="auto",
+            minimum_should_match="70%",
+        )]
+
+        # Keyword fields are stored verbatim, so lowercasing the term would miss
+        # mixed-case values: match case-insensitively instead. Wildcard
+        # metacharacters in the user's input are escaped to match literally.
+        escaped = search_terms.translate(
+            str.maketrans({"\\": "\\\\", "*": "\\*", "?": "\\?"})
+        )
+        keyword_queries = [
+            Q("wildcard", **{field: {"value": f"*{escaped}*", "case_insensitive": True}})
+            for field in keyword_fields
+        ]
+        queries.append(Q("bool", should=keyword_queries, minimum_should_match=1))
+
+        # Workflow nested query
+        queries.append(Q(
+            "nested",
+            path="workflow",
+            query=Q("match", **{"workflow.workflow_name": search_terms})
+        ))
+
+        # Matched terms nested query
+        queries.append(Q(
+            "nested",
+            path="matched_terms",
+            query=Q("match", **{"matched_terms.term": search_terms})
+        ))
+
+        # Numeric field query (only for all-digit terms). ``lenient`` keeps
+        # digits that do not fit the integer field from failing the request.
+        if search_terms.isdigit():
+            queries.append(Q(
+                "multi_match",
+                query=search_terms,
+                fields=numeric_fields,
+                lenient=True,
+            ))
+
+        # Combine all queries with OR
+        query = queries[0]
+        for q in queries[1:]:
+            query = query | q
+        return query