-
Notifications
You must be signed in to change notification settings - Fork 1
Elasticsearch index setup #142
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
d5dc8a7
b0fe093
2d7b47d
81700cd
4e6b1fd
5e5132a
06fea68
e8a3061
9631daf
0e8ef12
e9383ed
65f3780
b156708
617a486
e1df6dd
ae834dd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| AWS_ACCESS_KEY_ID= # get from an account admin | ||
| AWS_SECRET_ACCESS_KEY= # get from an account admin | ||
|
|
||
| ELASTICSEARCH_PASSWORD= # get from terminal output after installing elasticsearch (see docs/installation.rst) | ||
| ELASTICSEARCH_API_KEY= # get from terminal output after installing elasticsearch (see docs/installation.rst) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| from django_elasticsearch_dsl import Document, fields | ||
| from django_elasticsearch_dsl.registries import registry | ||
|
|
||
| from apps.deed.models import DeedPage | ||
|
|
||
|
|
||
@registry.register_document
class DeedPageDocument(Document):
    """Elasticsearch document for ``DeedPage`` rows, stored in ``deed_pages``.

    Every indexed field is declared explicitly below. The two nested fields
    (``workflow`` and ``matched_terms``) are filled by the ``prepare_*``
    methods from related objects that ``get_queryset()`` loads up front.
    """

    # Identifier fields - use KeywordField for exact matches and filtering
    s3_lookup = fields.KeywordField()
    doc_num = fields.KeywordField()
    doc_alt_id = fields.KeywordField()
    book_id = fields.KeywordField()
    public_uuid = fields.KeywordField()

    # Numeric field
    page_num = fields.IntegerField()

    # Boolean field for filtering
    bool_match = fields.BooleanField()

    # Text field for full-text search
    doc_type = fields.TextField()

    # Related workflow, flattened to the two attributes that are searched.
    workflow = fields.NestedField(properties={
        'zoon_id': fields.IntegerField(),
        'workflow_name': fields.TextField(),
    })

    # One nested entry per related matched term.
    matched_terms = fields.NestedField(properties={
        'term': fields.TextField(),
    })

    def prepare_workflow(self, instance):
        """Return the workflow sub-document for ``instance``.

        Reads ``instance.workflow``, which ``get_queryset()`` loads via
        ``select_related``. Returns ``None`` when the page has no workflow.
        """
        if instance.workflow:
            return {
                'zoon_id': instance.workflow.zoon_id,
                'workflow_name': instance.workflow.workflow_name,
            }
        return None

    def prepare_matched_terms(self, instance):
        """Return one ``{'term': ...}`` entry per matched term of ``instance``.

        Iterates ``instance.matched_terms.all()``, which ``get_queryset()``
        prefetches.
        """
        return [{'term': term.term} for term in instance.matched_terms.all()]

    class Index:
        name = "deed_pages"

    class Django:
        model = DeedPage
        # All fields are explicitly defined as class attributes above

        # Assuming that during batch indexing, we do not need to make documents available immediately.
        # Auto refresh slows down batch indexing.
        auto_refresh = False

        # The number of objects to query in a batch and process during indexing
        # We might have to play with this value to balance indexing time and memory usage
        # (larger value means fewer queries, but more memory usage)
        queryset_pagination = 500

    def get_queryset(self, *args, **kwargs):
        """Optimize queryset to avoid N+1 queries during indexing.

        All fields needed for a document, including those on related models,
        are gathered at the start so they are not fetched again for every
        row being indexed. The PR author expects this to be the biggest
        indexing performance win.

        The base queryset comes from ``super().get_queryset()`` rather than
        from a model attribute on the document. Any arguments are forwarded
        unchanged, so the override keeps working if the base class is called
        with extra (e.g. filtering) arguments.
        """
        return (
            super().get_queryset(*args, **kwargs)
            .select_related('workflow')  # Optimize ForeignKey access
            .prefetch_related('matched_terms')  # Optimize ManyToMany access
            .only(
                'workflow',
                's3_lookup',
                'doc_num',
                'doc_alt_id',
                'book_id',
                'page_num',
                'doc_type',
                'public_uuid',
                'bool_match',
            )
        )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| from rest_framework import serializers | ||
|
|
||
| from .models import DeedPage | ||
| from apps.zoon.models import ZooniverseWorkflow | ||
|
|
||
|
|
||
class WorkflowSerializer(serializers.ModelSerializer):
    """Read representation of a ``ZooniverseWorkflow``, for nesting in other serializers."""

    class Meta:
        model = ZooniverseWorkflow
        fields = [
            'id',
            'zoon_id',
            'workflow_name',
            'version',
            'slug',
        ]
|
|
||
|
|
||
| # Serializers define the API representation. | ||
class DeedPageSerializer(serializers.ModelSerializer):
    """API representation of a ``DeedPage``.

    The related workflow is embedded read-only through ``WorkflowSerializer``
    instead of being exposed as a bare foreign-key value.
    """

    workflow = WorkflowSerializer(read_only=True)

    class Meta:
        model = DeedPage
        fields = [
            's3_lookup',
            'thumbnail_preview',
            'workflow',
            'record_link',
            'bool_match',
        ]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,41 +1,186 @@ | ||
| import copy | ||
| from abc import abstractmethod | ||
|
|
||
| from elasticsearch_dsl import Document, Q | ||
| from rest_framework.decorators import action | ||
| from rest_framework.pagination import LimitOffsetPagination | ||
| from rest_framework.request import Request | ||
| from rest_framework.response import Response | ||
| from rest_framework.viewsets import ModelViewSet | ||
| from django.shortcuts import render | ||
| from django import forms | ||
| from django.db.models import Case, When | ||
|
|
||
| from haystack.query import SearchQuerySet | ||
| from haystack.generic_views import SearchView | ||
| from haystack.forms import SearchForm | ||
| from .documents import DeedPageDocument | ||
| from .serializers import DeedPageSerializer | ||
| from .models import DeedPage | ||
|
|
||
| from apps.zoon.models import ZooniverseWorkflow | ||
def _parse_non_negative_int(raw_value, default):
    """Return ``raw_value`` as an int >= 0, or ``default`` when it is ``None``.

    Raises:
        ValueError: if the value is not an integer or is negative.
    """
    if raw_value is None:
        return default
    number = int(raw_value)
    if number < 0:
        raise ValueError(f"expected a non-negative integer, got {raw_value!r}")
    return number


class PaginatedElasticSearchAPIView(ModelViewSet, LimitOffsetPagination):
    """Model viewset with an extra ``search`` list action backed by Elasticsearch.

    Subclasses set ``document_class`` and ``serializer_class`` and implement
    ``generate_search_query()``.
    """

    # Elasticsearch document class whose ``search()`` is queried; set by subclasses.
    document_class: Document = None

    @abstractmethod
    def generate_search_query(self, search_terms_list, param_filters):
        """This method should be overridden
        and return a Q() expression."""

    @action(methods=["GET"], detail=False, url_path="search")
    def search(self, request: Request):
        """Run a paginated Elasticsearch search and return serialized DB rows.

        Query parameters:
            search: search term(s), passed to ``generate_search_query()``.
            bool_match: when exactly ``"true"``, only hits with
                ``bool_match=True`` are returned.
            limit: page size, default 10.
            offset: index of the first hit, default 0.

        Returns:
            A ``Response`` with ``count``, ``limit``, ``offset``, ``next``,
            ``previous`` and ``results``. ``results`` holds the matching
            ``DeedPage`` rows in Elasticsearch hit order. The status is 400
            when ``limit``/``offset`` are not non-negative integers, and 500
            with the error message if anything else fails.
        """
        # Validate paging input first so bad values are reported as a client
        # error instead of surfacing as a 500 from the slice below.
        try:
            limit = _parse_non_negative_int(request.query_params.get("limit"), 10)
            offset = _parse_non_negative_int(request.query_params.get("offset"), 0)
        except ValueError:
            return Response(
                {"error": "limit and offset must be non-negative integers"},
                status=400,
            )

        try:
            # Work on a copy so popping "search" leaves the request untouched.
            params = copy.deepcopy(request.query_params)

            raw_search = params.pop("search", None)
            if isinstance(raw_search, list):
                search_terms = raw_search
            elif raw_search:
                search_terms = [raw_search]
            else:
                search_terms = None

            query = self.generate_search_query(
                search_terms_list=search_terms, param_filters=params
            )

            search = self.document_class.search().query(query)

            # Apply filters from param_filters
            if params.get("bool_match") == "true":
                search = search.filter("term", bool_match=True)

            # Slice the Search object BEFORE execution so only one page is fetched.
            response = search[offset:offset + limit].execute()

            # hits.total is read as an object with ``.value`` when it has one,
            # otherwise it is used directly as the count.
            total = (
                response.hits.total.value
                if hasattr(response.hits.total, "value")
                else response.hits.total
            )

            # STEP 1: Extract IDs
            ids = [hit.meta.id for hit in response.hits]

            if ids:
                # STEP 2: Hydrate from DB, preserving ES ordering
                order = Case(
                    *[When(id=pk, then=pos) for pos, pk in enumerate(ids)]
                )
                qs = (
                    DeedPage.objects.filter(id__in=ids)
                    .select_related('workflow')
                    .order_by(order)
                )

                # STEP 3: Serialize
                results = self.serializer_class(qs, many=True).data
            else:
                # An empty page (e.g. offset past the end) still reports the
                # real total and the same response keys as a non-empty page.
                results = []

            next_offset = offset + limit if offset + limit < total else None
            # Any offset above zero has an earlier page; clamp it to zero so a
            # partial first step (offset < limit) is not reported as missing.
            previous_offset = max(offset - limit, 0) if offset > 0 else None

            return Response({
                "count": total,
                "limit": limit,
                "offset": offset,
                "next": next_offset,
                "previous": previous_offset,
                "results": results,
            })

        # NOTE(review): this catches everything and echoes the exception text
        # to the client; kept for compatibility, but consider logging the
        # error and returning a generic message instead.
        except Exception as e:
            return Response({"error": str(e)}, status=500)


class DeedSearchForm(SearchForm):
    """Haystack search form with optional ``workflow`` and ``bool_match`` filters."""

    # bool_match = forms.BooleanField(required=False, widget=forms.CheckboxInput(attrs={'class':'col-12-small'}))
    bool_match = forms.BooleanField(required=False)
    # end_date = forms.DateField(required=False)
    workflow = forms.ModelChoiceField(queryset=ZooniverseWorkflow.objects.all(), to_field_name="workflow_name", required=False)

    def search(self):
        """Return the parent form's search results narrowed by the chosen filters."""
        # First, store the SearchQuerySet received from other processing.
        sqs = super().search()

        if not self.is_valid():
            return self.no_query_found()

        if self.cleaned_data['workflow']:
            sqs = sqs.filter(workflow=self.cleaned_data['workflow'])

        if self.cleaned_data['bool_match']:
            sqs = sqs.filter(bool_match=self.cleaned_data['bool_match'])

        return sqs
|
|
||
|
|
||
class DeedSearchView(SearchView):
    """Haystack search page that uses ``DeedSearchForm``."""

    template_name = 'search/search.html'
    # queryset = SearchQuerySet().all()
    form_class = DeedSearchForm

    def get_context_data(self, **kwargs):
        """Add every ``ZooniverseWorkflow`` to the context as ``all_workflows``."""
        data = super().get_context_data(**kwargs)
        data['all_workflows'] = ZooniverseWorkflow.objects.all()
        return data


class DeedPageViewSet(PaginatedElasticSearchAPIView):
    """Deed page API whose search action queries ``DeedPageDocument``."""

    serializer_class = DeedPageSerializer
    queryset = DeedPage.objects.all()
    document_class = DeedPageDocument

    def generate_search_query(self, search_terms_list: list[str], param_filters: dict):
        """Build the Elasticsearch query for a deed-page search.

        Only the first entry of ``search_terms_list`` is used. NUL characters
        are removed and commas are turned into spaces. The resulting term is
        then matched with an OR across:

        * ``doc_type``: fuzzy ``multi_match``;
        * the keyword identifier fields: ``*term*`` wildcard, lower-cased;
        * ``workflow.workflow_name`` and ``matched_terms.term``: nested ``match``;
        * ``page_num``: only when the term consists solely of digits.

        The clauses here are the place to tune if the number of results
        returned is not what is wanted.

        Args:
            search_terms_list: search terms from the request, or ``None``.
            param_filters: remaining query parameters (not used here).

        Returns:
            A ``Q`` expression; ``match_all`` when there is no usable term.
        """
        # Covers both None and an empty list, so the [0] below cannot fail.
        if not search_terms_list:
            return Q("match_all")

        search_terms = search_terms_list[0].replace("\x00", "")
        search_terms = search_terms.replace(",", " ")

        # A blank term would otherwise become a "**" wildcard clause.
        if not search_terms.strip():
            return Q("match_all")

        # Check if search term is numeric
        is_numeric = search_terms.strip().isdigit()

        # Separate fields by type
        keyword_fields = [
            "s3_lookup",
            "doc_num",
            "doc_alt_id",
            "book_id",
            "public_uuid",
        ]
        text_fields = [
            "doc_type",
        ]
        numeric_fields = [
            "page_num",
        ]

        # Text field query (only doc_type)
        queries = [
            Q(
                "multi_match",
                query=search_terms,
                fields=text_fields,
                fuzziness="auto",
                minimum_should_match="70%"
            ),
        ]

        # Keyword field queries (wildcard for partial matches)
        # NOTE(review): the term is inserted into the wildcard pattern as-is,
        # so "*" and "?" typed by the user act as wildcards; it is also
        # lower-cased - confirm the indexed keyword values are lower-case.
        keyword_queries = [
            Q("wildcard", **{field: f"*{search_terms.lower()}*"})
            for field in keyword_fields
        ]
        queries.append(Q("bool", should=keyword_queries, minimum_should_match=1))

        # Nested field queries
        # Workflow nested query
        queries.append(Q(
            "nested",
            path="workflow",
            query=Q("match", **{"workflow.workflow_name": search_terms})
        ))

        # Matched terms nested query
        queries.append(Q(
            "nested",
            path="matched_terms",
            query=Q("match", **{"matched_terms.term": search_terms})
        ))

        # Numeric field query (if numeric search)
        if is_numeric:
            queries.append(Q(
                "multi_match",
                query=search_terms,
                fields=numeric_fields
            ))

        # Combine all queries with OR (the list always holds at least the
        # text, keyword and nested clauses).
        combined = queries[0]
        for clause in queries[1:]:
            combined = combined | clause
        return combined
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Defines the actual elasticsearch index of DeedPage objects