# Source code for bigdata_client.document

import datetime
from contextlib import suppress
from dataclasses import dataclass
from functools import cached_property
from typing import ForwardRef, Optional

from pydantic import BaseModel

from bigdata_client.api.knowledge_graph import ByIdsRequest
from bigdata_client.api.search import ChunkedDocumentResponse
from bigdata_client.connection_protocol import BigdataConnectionProtocol
from bigdata_client.models.advanced_search_query import TranscriptTypes
from bigdata_client.models.document import (
    DocumentChunk,
    DocumentScope,
    DocumentSentence,
    DocumentSentenceEntity,
    DocumentSource,
)
from bigdata_client.query_type import QueryType

# Forward reference so the class body below can annotate the self-referential
# `cluster: Optional[list[Document]]` field before `Document` itself exists;
# resolved later by `Document.model_rebuild()` at the bottom of the module.
Document = ForwardRef("Document")


class Document(BaseModel):
    """A document object returned by a Bigdata search.

    Wraps one search hit: its metadata (headline, source, timestamp,
    sentiment, ...) plus the matched content chunks. Holds a private
    reference to the API connection (``_api``) so lazy lookups
    (``resolved_reporting_entities``, ``download_annotated_dict``) can hit
    the backend on demand.
    """

    id: str
    headline: str
    sentiment: float
    document_scope: DocumentScope
    source: DocumentSource
    timestamp: datetime.datetime
    chunks: list[DocumentChunk]
    language: str
    # Keeps track of the connection to Bigdata
    _api: BigdataConnectionProtocol
    cluster: Optional[list[Document]] = None
    reporting_period: Optional[list[str]] = None
    document_type: Optional[str] = None
    reporting_entities: Optional[list[str]] = None
    url: Optional[str] = None

    # NOTE(review): pydantic v1-style config; the rest of the class uses v2
    # APIs (model_post_init / model_validate), where
    # `model_config = ConfigDict(arbitrary_types_allowed=True)` is preferred.
    # Left as-is to avoid changing model behavior.
    class Config:
        arbitrary_types_allowed = True

    def model_post_init(self, __context):
        """All returned timestamps are in UTC"""
        # Stamps UTC onto the (presumably naive) timestamp; an already-aware
        # timestamp would have its tzinfo overwritten -- assumes the API
        # always returns naive UTC values.
        self.timestamp = self.timestamp.replace(tzinfo=datetime.timezone.utc)

    def __init__(self, **data):
        super().__init__(**data)
        # Private attributes are dropped by pydantic's validation, so the
        # connection object must be copied over manually after init.
        if "_api" in data:
            self._api = data["_api"]

    @cached_property
    def resolved_reporting_entities(self):
        """Resolve reporting-entity ids to names via the knowledge graph.

        Returns ``None`` when the document lists no reporting entities.
        Cached: the by-ids lookup is performed at most once per instance.
        """
        if not self.reporting_entities:
            return None
        by_ids_results = self._api.by_ids(
            ByIdsRequest.model_validate(
                (
                    {"key": entity_id, "queryType": QueryType.ENTITY}
                    for entity_id in self.reporting_entities
                )
            )
        )
        return [entity.name for entity in by_ids_results.root.values()]

    @classmethod
    def from_response(
        cls, response: ChunkedDocumentResponse, api: BigdataConnectionProtocol
    ) -> "Document":
        """Build a ``Document`` (and, recursively, its cluster) from an API response.

        Sentiment values arrive as percentages and are rescaled to [-1, 1]
        by dividing by 100.
        """
        source = DocumentSource(
            key=response.source_key,
            name=response.source_name,
            rank=response.source_rank,
        )
        chunks = [
            DocumentChunk(
                text=s.text,
                chunk=s.cnum,
                entities=[
                    DocumentSentenceEntity(
                        key=e.key, start=e.start, end=e.end, query_type=e.queryType
                    )
                    for e in s.entities
                ],
                sentences=[
                    DocumentSentence(paragraph=e.pnum, sentence=e.snum)
                    for e in s.sentences
                ],
                relevance=s.relevance,
                sentiment=s.sentiment / 100.0,
                section_metadata=s.section_metadata,
                speaker=s.speaker,
                _api=api,
            )
            for s in response.chunks
        ]
        return cls(
            id=response.id,
            headline=response.headline,
            sentiment=response.sentiment / 100.0,
            document_scope=response.document_scope,
            document_type=response.document_type,
            source=source,
            timestamp=response.timestamp,
            chunks=chunks,
            language=response.language,
            reporting_entities=response.reporting_entities,
            # Clustered documents are themselves full responses; recurse.
            cluster=(
                [
                    Document.from_response(doc_chunk, api)
                    for doc_chunk in response.cluster
                ]
                if response.cluster
                else None
            ),
            _api=api,
            url=response.url,
        )

    def download_annotated_dict(self) -> dict:
        """Returns annotated document as a dictionary."""
        return self._api.download_annotated_dict(self.id)

    def __str__(self) -> str:
        """
        Returns a string representation of the document.
        """

        def _format_section(section_name: str, value: str, left_padding=50):
            # Right-align the value so every "name: value" row lines up in a
            # column `left_padding` wide (no padding if the name is longer).
            section_name = str(section_name)
            value = str(value)
            dynamic_padding = (
                left_padding - len(section_name)
                if left_padding > len(section_name)
                else 0
            )
            padded_value = value.rjust(dynamic_padding)
            return f"{section_name}: {padded_value}"

        def _get_document_type_repr():
            # Only render the row when document_type is a known transcript
            # type; otherwise TranscriptTypes() raises ValueError, the
            # suppress swallows it, and None is returned (row filtered out).
            with suppress(ValueError):
                TranscriptTypes(self.document_type)
                return _format_section("Document Type", self.document_type.title())

        def _get_chunk_repr(chunk_: DocumentChunk):
            # (section, speaker, text) rows; section/speaker may be None and
            # are dropped by the filter(None, ...) below.
            section = (
                _format_section("Section", str(chunk_.section_metadata))
                if chunk_.section_metadata
                else None
            )
            speaker = (
                _format_section("Speaker", chunk_.resolved_speaker)
                if chunk_.speaker and chunk_.resolved_speaker
                else None
            )
            return section, speaker, f"*{chunk_.text}\n--"

        chunks_repr = [
            chunk_row_repr
            for chunk in self.chunks
            for chunk_row_repr in _get_chunk_repr(chunk)
        ]
        # NOTE: triggers the cached by-ids lookup (network call) on first use.
        reporting_entities_repr = (
            _format_section("Reporting Entity", str(self.resolved_reporting_entities))
            if self.reporting_entities
            else None
        )
        document_id = _format_section("Document ID", self.id)
        timestamp = _format_section(
            "Timestamp", self.timestamp.strftime("%Y-%m-%d %H:%M:%S")
        )
        scope = _format_section("Scope", self.document_scope.value.title())
        document_type = _get_document_type_repr()
        source = _format_section(
            "Source (Rank)", f"{self.source.name} ({self.source.rank})"
        )
        title = _format_section("Title", self.headline)
        document_url = _format_section("Document Url", self.url) if self.url else None
        language = _format_section("Language", self.language)
        sentiment = _format_section("Sentiment", str(self.sentiment))
        # Fix: was an f-string with no placeholders (lint F541).
        chunks_separator = "====Sentence matches===="
        return "\n".join(
            filter(
                None,
                (
                    document_id,
                    timestamp,
                    scope,
                    document_type,
                    source,
                    title,
                    document_url,
                    reporting_entities_repr,
                    language,
                    sentiment,
                    chunks_separator,
                    *chunks_repr,
                ),
            )
        )
# Resolve the `Document` forward reference used by the `cluster` field now
# that the class is fully defined.
Document.model_rebuild()