"""Source code for bigdata_client.search."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Generator, Iterable, Iterator, Optional, Union

from bigdata_client.advanced_search_query import AdvancedSearchQuery
from bigdata_client.api.search import (
    QueryChunksResponse,
    SavedSearchResponse,
    SaveSearchRequest,
    ShareSavedSearchRequest,
    UpdateSearchRequest,
    UserQueryShareCompanyContext,
)
from bigdata_client.connection import BigdataConnection
from bigdata_client.constants import (
    MAX_SEARCH_PAGES,
    PAGE_SIZE_BE_LIMIT,
    SEARCH_PAGE_DEFAULT_SIZE,
)
from bigdata_client.daterange import AbsoluteDateRange, RollingDateRange
from bigdata_client.document import Document
from bigdata_client.exceptions import BigdataClientError
from bigdata_client.models.advanced_search_query import QueryComponent
from bigdata_client.models.comentions import Comentions
from bigdata_client.models.search import (
    DocumentType,
    SearchPaginationByCursor,
    SearchPaginationByOffset,
    SortBy,
)
from bigdata_client.models.sharing import SharePermission






# To be changed. It shouldn't be a dataclass, but for now it's fine
@dataclass
class SearchResults:
    """
    A search with a limit. It allows you to get the count of documents,
    and/or get an iterator over the results.
    """

    def __init__(self, search: Search, limit: Union[int, ChunkLimit]):
        """
        :param search: The search whose results will be iterated.
        :param limit: Either a maximum number of documents (int) or a
            ChunkLimit controlling pagination by chunks.
        :raises ValueError: If ``limit`` is an int that is not positive.
        """
        self.search = search
        # Cached first page, set externally when the user asks for the
        # count first so iteration can skip one request.
        self._first_page: Optional[QueryChunksResponse] = None
        if isinstance(limit, int) and limit <= 0:
            raise ValueError("The limit must be a positive number.")
        self._limit = limit

    # Fixed annotation: __iter__ must return an iterator (and iter(...) is
    # exactly what we return), so Iterator[Document], not Iterable[Document].
    def __iter__(self) -> Iterator[Document]:
        if isinstance(self._limit, int):
            return iter(
                SearchResultsIteratorWithDocuments(
                    self.search, self._limit, self._first_page
                )
            )
        elif isinstance(self._limit, ChunkLimit):
            return iter(
                SearchResultsIteratorChunks(self.search, self._limit, self._first_page)
            )
        else:
            raise NotImplementedError(
                "The limit must be an int or a ChunkLimit object."
            )
class SearchResultsIteratorWithDocuments:
    """
    Iterate over the documents of a search, following cursor pagination
    across all pages. If the first page was already fetched (e.g. because
    the user asked for the count first), it can be supplied to skip the
    initial request.
    """

    def __init__(
        self,
        search: Search,
        limit: int,
        first_page: Optional[QueryChunksResponse],
    ):
        self.search = search
        self.current_page = first_page or None
        self._doc_limit = limit
        self._page_num = 0

    def __iter__(self) -> Iterator[Document]:
        # Fetch the first page unless the caller already provided it.
        if self.current_page is None:
            self.current_page = self.search._get_query_chunks_page(
                SearchPaginationByCursor()
            )
        yielded = 0
        # Bounded loop: effectively while(True), but safe against runaway paging.
        for _ in range(MAX_SEARCH_PAGES):
            page = self.current_page
            for story in page.stories:
                if yielded >= self._doc_limit:
                    return
                yielded += 1
                yield Document.from_response(story, api=self.search._api)
            # If the page had no elements, don't trust next_cursor.
            cursor = page.next_cursor if page.stories else None
            if not cursor:
                break
            self._page_num = cursor
            self.current_page = self.search._get_query_chunks_page(
                SearchPaginationByCursor(cursor=cursor)
            )
class SearchResultsIteratorChunks:
    """
    Iterate over the documents of a search paginated by chunk offsets
    (driven by a ChunkLimit). If the first page was already fetched
    (e.g. because the user asked for the count first), it can be supplied
    to skip the initial request.
    """

    def __init__(
        self,
        search: Search,
        limit: ChunkLimit,
        first_page: Optional[QueryChunksResponse],
    ):
        self.search = search
        self.current_page = first_page or None
        self._chunk_limit = limit

    def __iter__(self) -> Iterator[Document]:
        pages = self._chunk_limit._get_paginator()
        # Fetch the first page unless the caller already provided it.
        if self.current_page is None:
            self.current_page = self.search._get_query_chunks_page(next(pages))
        # Bounded loop: effectively while(True), but safe against runaway paging.
        for _ in range(MAX_SEARCH_PAGES):
            page = self.current_page
            for story in page.stories:
                yield Document.from_response(story, api=self.search._api)
            # Another page exists only if this one had chunks and the total
            # count exceeds what the ChunkLimit has requested so far.
            has_more = page.chunks_count and (
                page.count > self._chunk_limit.chunks_requested
            )
            if not has_more:
                break
            try:
                self.current_page = self.search._get_query_chunks_page(next(pages))
            except StopIteration:
                # Paginator exhausted: stop cleanly instead of letting
                # StopIteration escape the generator (PEP 479).
                break
class ChunkLimit:
    """
    Control the pagination by Chunks.

    A ChunkLimit is single-use: once its paginator has been consumed,
    requesting a new paginator raises BigdataClientError.
    """

    def __init__(self, /, limit: int):
        # Original limit, kept to detect reuse of the object (see
        # _get_paginator).
        self._initial_limit = limit
        # Remaining chunks still to request; decremented page by page.
        self.limit = limit
        # Chunks requested so far; doubles as the next page's offset.
        self.chunks_requested = 0

    def _get_paginator(self) -> Generator[SearchPaginationByOffset, None, None]:
        """
        Yield pages of at most PAGE_SIZE_BE_LIMIT chunks until the full
        limit has been requested.

        >>> paginator = ChunkLimit(3007)._get_paginator()
        >>> next(paginator)
        SearchPaginationByOffset(limit=1000, offset=0)
        >>> next(paginator)
        SearchPaginationByOffset(limit=1000, offset=1000)
        >>> next(paginator)
        SearchPaginationByOffset(limit=1000, offset=2000)
        >>> next(paginator)
        SearchPaginationByOffset(limit=7, offset=3000)

        :raises BigdataClientError: If this ChunkLimit was already used.
        """
        # Reuse detection. Checking only ``_initial_limit != limit`` missed
        # limits that fit in a single page (limit <= PAGE_SIZE_BE_LIMIT),
        # where ``self.limit`` is never mutated — so we also treat any
        # previously requested chunks as proof of prior use.
        if self._initial_limit != self.limit or self.chunks_requested:
            raise BigdataClientError(
                "The ChunkLimit object has already been used and can't be reused. Please create a new one."
            )
        while True:
            if self.limit <= PAGE_SIZE_BE_LIMIT:
                # Last (or only) page: whatever remains fits in one request.
                offset = self.chunks_requested
                self.chunks_requested += self.limit
                yield SearchPaginationByOffset(limit=self.limit, offset=offset)
                return
            else:
                # Full-size page; carve it off the remaining limit.
                self.limit -= PAGE_SIZE_BE_LIMIT
                offset = self.chunks_requested
                self.chunks_requested += PAGE_SIZE_BE_LIMIT
                yield SearchPaginationByOffset(limit=PAGE_SIZE_BE_LIMIT, offset=offset)