# blob: fb0428f4e479d9ff0dbec5b7539c8673fc14083c [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Document module providing Document and Metadata classes for document handling.
This module implements classes for representing documents and their associated metadata
in the HugeGraph LLM system.
"""
from typing import Any, Dict, Optional, Union
class Metadata:
    """Free-form container for document metadata.

    Every keyword argument given at construction time becomes an instance
    attribute of the same name (for example ``source``, ``author`` or
    ``page``).
    """

    def __init__(self, **kwargs):
        """Store each keyword argument as an attribute on this instance.

        Args:
            **kwargs: Arbitrary metadata fields; each key becomes an
                attribute name and each value the attribute's value.
        """
        for field_name in kwargs:
            setattr(self, field_name, kwargs[field_name])

    def as_dict(self) -> Dict[str, Any]:
        """Return the stored metadata as a new dictionary.

        Returns:
            Dict[str, Any]: A shallow copy of the instance attributes, so
            adding or removing keys on the result does not affect this
            object.
        """
        return vars(self).copy()
class Document:
"""A class representing a document with content and metadata.
This class stores document content along with its associated metadata.
"""
def __init__(self, content: str, metadata: Optional[Union[Dict[str, Any], Metadata]] = None):
"""Initialize a document with content and metadata.
Args:
content: The text content of the document.
metadata: Metadata associated with the document. Can be a dictionary or Metadata object.
Raises:
ValueError: If content is None or empty string.
"""
if not content:
raise ValueError("Document content cannot be None or empty")
self.content = content
if metadata is None:
self.metadata = {}
elif isinstance(metadata, Metadata):
self.metadata = metadata.as_dict()
else:
self.metadata = metadata