| import email.feedparser |
| import email.header |
| import email.message |
| import email.parser |
| import email.policy |
| import sys |
| import typing |
| from typing import Dict, List, Optional, Tuple, Union, cast |
| |
# TypedDict lives in the stdlib ``typing`` module only on Python 3.8+.
# On older interpreters we prefer ``typing_extensions`` and, as a last
# resort, fall back to a do-nothing stub so this module still imports
# when ``typing_extensions`` is not installed.
if sys.version_info >= (3, 8):  # pragma: no cover
    from typing import TypedDict
else:  # pragma: no cover
    if typing.TYPE_CHECKING:
        # Static type checkers should always resolve the real TypedDict.
        from typing_extensions import TypedDict
    else:
        try:
            from typing_extensions import TypedDict
        except ImportError:

            # Runtime-only stand-in: it accepts the
            # ``class Foo(TypedDict, total=False)`` subclassing syntax
            # (via __init_subclass__) and otherwise does nothing.
            class TypedDict:
                def __init_subclass__(*_args, **_kwargs):
                    pass
| |
| |
# The RawMetadata class attempts to make as few assumptions about the underlying
# serialization formats as possible. The idea is that as long as a serialization
# format offers some very basic primitives in *some* way, then we can support
# serializing to and from that format.
class RawMetadata(TypedDict, total=False):
    """A dictionary of raw core metadata.

    Each field in core metadata maps to a key of this dictionary (when data is
    provided). The key is lower-case and underscores are used instead of dashes
    compared to the equivalent core metadata field. Any core metadata field
    that can be specified multiple times or can hold multiple values in a
    single field has a key with a plural name.

    Core metadata fields that can be specified multiple times are stored as a
    list or dict depending on which is appropriate for the field. Any fields
    which hold multiple values in a single field are stored as a list.

    """

    # Metadata 1.0 - PEP 241
    metadata_version: str
    name: str
    version: str
    platforms: List[str]
    summary: str
    description: str
    keywords: List[str]
    home_page: str
    author: str
    author_email: str
    license: str

    # Metadata 1.1 - PEP 314
    supported_platforms: List[str]
    download_url: str
    classifiers: List[str]
    requires: List[str]
    provides: List[str]
    obsoletes: List[str]

    # Metadata 1.2 - PEP 345
    maintainer: str
    maintainer_email: str
    requires_dist: List[str]
    provides_dist: List[str]
    obsoletes_dist: List[str]
    requires_python: str
    requires_external: List[str]
    project_urls: Dict[str, str]

    # Metadata 2.0
    # PEP 426 attempted to completely revamp the metadata format
    # but got stuck without ever being able to build consensus on
    # it and ultimately ended up withdrawn.
    #
    # However, a number of tools had started emitting METADATA with
    # `2.0` Metadata-Version, so for historical reasons, this version
    # was skipped.

    # Metadata 2.1 - PEP 566
    description_content_type: str
    provides_extra: List[str]

    # Metadata 2.2 - PEP 643
    dynamic: List[str]

    # Metadata 2.3 - PEP 685
    # No new fields were added in PEP 685, just some edge cases were
    # tightened up to provide better interoperability.
| |
| |
| _STRING_FIELDS = { |
| "author", |
| "author_email", |
| "description", |
| "description_content_type", |
| "download_url", |
| "home_page", |
| "license", |
| "maintainer", |
| "maintainer_email", |
| "metadata_version", |
| "name", |
| "requires_python", |
| "summary", |
| "version", |
| } |
| |
| _LIST_STRING_FIELDS = { |
| "classifiers", |
| "dynamic", |
| "obsoletes", |
| "obsoletes_dist", |
| "platforms", |
| "provides", |
| "provides_dist", |
| "provides_extra", |
| "requires", |
| "requires_dist", |
| "requires_external", |
| "supported_platforms", |
| } |
| |
| |
| def _parse_keywords(data: str) -> List[str]: |
| """Split a string of comma-separate keyboards into a list of keywords.""" |
| return [k.strip() for k in data.split(",")] |
| |
| |
| def _parse_project_urls(data: List[str]) -> Dict[str, str]: |
| """Parse a list of label/URL string pairings separated by a comma.""" |
| urls = {} |
| for pair in data: |
| # Our logic is slightly tricky here as we want to try and do |
| # *something* reasonable with malformed data. |
| # |
| # The main thing that we have to worry about, is data that does |
| # not have a ',' at all to split the label from the Value. There |
| # isn't a singular right answer here, and we will fail validation |
| # later on (if the caller is validating) so it doesn't *really* |
| # matter, but since the missing value has to be an empty str |
| # and our return value is dict[str, str], if we let the key |
| # be the missing value, then they'd have multiple '' values that |
| # overwrite each other in a accumulating dict. |
| # |
| # The other potentional issue is that it's possible to have the |
| # same label multiple times in the metadata, with no solid "right" |
| # answer with what to do in that case. As such, we'll do the only |
| # thing we can, which is treat the field as unparseable and add it |
| # to our list of unparsed fields. |
| parts = [p.strip() for p in pair.split(",", 1)] |
| parts.extend([""] * (max(0, 2 - len(parts)))) # Ensure 2 items |
| |
| # TODO: The spec doesn't say anything about if the keys should be |
| # considered case sensitive or not... logically they should |
| # be case-preserving and case-insensitive, but doing that |
| # would open up more cases where we might have duplicate |
| # entries. |
| label, url = parts |
| if label in urls: |
| # The label already exists in our set of urls, so this field |
| # is unparseable, and we can just add the whole thing to our |
| # unparseable data and stop processing it. |
| raise KeyError("duplicate labels in project urls") |
| urls[label] = url |
| |
| return urls |
| |
| |
| def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str: |
| """Get the body of the message.""" |
| # If our source is a str, then our caller has managed encodings for us, |
| # and we don't need to deal with it. |
| if isinstance(source, str): |
| payload: str = msg.get_payload() |
| return payload |
| # If our source is a bytes, then we're managing the encoding and we need |
| # to deal with it. |
| else: |
| bpayload: bytes = msg.get_payload(decode=True) |
| try: |
| return bpayload.decode("utf8", "strict") |
| except UnicodeDecodeError: |
| raise ValueError("payload in an invalid encoding") |
| |
| |
| # The various parse_FORMAT functions here are intended to be as lenient as |
| # possible in their parsing, while still returning a correctly typed |
| # RawMetadata. |
| # |
| # To aid in this, we also generally want to do as little touching of the |
| # data as possible, except where there are possibly some historic holdovers |
| # that make valid data awkward to work with. |
| # |
| # While this is a lower level, intermediate format than our ``Metadata`` |
| # class, some light touch ups can make a massive difference in usability. |
| |
| # Map METADATA fields to RawMetadata. |
# Keys are the lower-cased email header names; values are the RawMetadata
# keys. Values swap dashes for underscores, and multi-use headers map to
# plural keys (e.g. "classifier" -> "classifiers", "platform" -> "platforms").
_EMAIL_TO_RAW_MAPPING = {
    "author": "author",
    "author-email": "author_email",
    "classifier": "classifiers",
    "description": "description",
    "description-content-type": "description_content_type",
    "download-url": "download_url",
    "dynamic": "dynamic",
    "home-page": "home_page",
    "keywords": "keywords",
    "license": "license",
    "maintainer": "maintainer",
    "maintainer-email": "maintainer_email",
    "metadata-version": "metadata_version",
    "name": "name",
    "obsoletes": "obsoletes",
    "obsoletes-dist": "obsoletes_dist",
    "platform": "platforms",
    "project-url": "project_urls",
    "provides": "provides",
    "provides-dist": "provides_dist",
    "provides-extra": "provides_extra",
    "requires": "requires",
    "requires-dist": "requires_dist",
    "requires-external": "requires_external",
    "requires-python": "requires_python",
    "summary": "summary",
    "supported-platform": "supported_platforms",
    "version": "version",
}
| |
| |
def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[str]]]:
    """Parse a distribution's metadata.

    This function returns a two-item tuple of dicts. The first dict is of
    recognized fields from the core metadata specification. Fields that can be
    parsed and translated into Python's built-in types are converted
    appropriately. All other fields are left as-is. Fields that are allowed to
    appear multiple times are stored as lists.

    The second dict contains all other fields from the metadata. This includes
    any unrecognized fields. It also includes any fields which are expected to
    be parsed into a built-in type but were not formatted appropriately. Finally,
    any fields that are expected to appear only once but are repeated are
    included in this dict.

    """
    raw: Dict[str, Union[str, List[str], Dict[str, str]]] = {}
    unparsed: Dict[str, List[str]] = {}

    if isinstance(data, str):
        parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data)
    else:
        parsed = email.parser.BytesParser(policy=email.policy.compat32).parsebytes(data)

    # We have to wrap parsed.keys() in a set, because in the case of multiple
    # values for a key (a list), the key will appear multiple times in the
    # list of keys, but we're avoiding that by using get_all().
    for name in frozenset(parsed.keys()):
        # Header names in RFC are case insensitive, so we'll normalize to all
        # lower case to make comparisons easier.
        name = name.lower()

        # We use get_all() here, even for fields that aren't multiple use,
        # because otherwise someone could have e.g. two Name fields, and we
        # would just silently ignore it rather than doing something about it.
        headers = parsed.get_all(name)

        # The way the email module works when parsing bytes is that it
        # unconditionally decodes the bytes as ascii using the surrogateescape
        # handler. When you pull that data back out (such as with get_all() ),
        # it looks to see if the str has any surrogate escapes, and if it does
        # it wraps it in a Header object instead of returning the string.
        #
        # As such, we'll look for those Header objects, and fix up the encoding.
        value = []
        # Flag if we have run into any issues processing the headers, thus
        # signalling that the data belongs in 'unparsed'.
        valid_encoding = True
        for h in headers:
            # It's unclear if this can return more types than just a Header or
            # a str, so we'll just assert here to make sure.
            assert isinstance(h, (email.header.Header, str))

            # If it's a header object, we need to do our little dance to get
            # the real data out of it. In cases where there is invalid data
            # we're going to end up with mojibake, but there's no obvious, good
            # way around that without reimplementing parts of the Header object
            # ourselves.
            #
            # That should be fine since, if mojibake happens, this key is
            # going into the unparsed dict anyways.
            if isinstance(h, email.header.Header):
                # The Header object stores its data as chunks, and each chunk
                # can be independently encoded, so we'll need to check each
                # of them.
                chunks: List[Tuple[bytes, Optional[str]]] = []
                # NOTE: named ``raw_chunk`` rather than ``bin`` so we do not
                #       shadow the builtin of that name.
                for raw_chunk, encoding in email.header.decode_header(h):
                    try:
                        raw_chunk.decode("utf8", "strict")
                    except UnicodeDecodeError:
                        # Enable mojibake.
                        encoding = "latin1"
                        valid_encoding = False
                    else:
                        encoding = "utf8"
                    chunks.append((raw_chunk, encoding))

                # Turn our chunks back into a Header object, then let that
                # Header object do the right thing to turn them into a
                # string for us.
                value.append(str(email.header.make_header(chunks)))
            # This is already a string, so just add it.
            else:
                value.append(h)

        # We've processed all of our values to get them into a list of str,
        # but we may have mojibake data, in which case this is an unparsed
        # field.
        if not valid_encoding:
            unparsed[name] = value
            continue

        raw_name = _EMAIL_TO_RAW_MAPPING.get(name)
        if raw_name is None:
            # This is a bit of a weird situation, we've encountered a key that
            # we don't know what it means, so we don't know whether it's meant
            # to be a list or not.
            #
            # Since we can't really tell one way or another, we'll just leave it
            # as a list, even though it may be a single item list, because that's
            # what makes the most sense for email headers.
            unparsed[name] = value
            continue

        # If this is one of our string fields, then we'll check to see if our
        # value is a list of a single item. If it is then we'll assume that
        # it was emitted as a single string, and unwrap the str from inside
        # the list.
        #
        # If it's any other kind of data, then we haven't the faintest clue
        # what we should parse it as, and we have to just add it to our list
        # of unparsed stuff.
        if raw_name in _STRING_FIELDS and len(value) == 1:
            raw[raw_name] = value[0]
        # If this is one of our list of string fields, then we can just assign
        # the value, since email *only* has strings, and our get_all() call
        # above ensures that this is a list.
        elif raw_name in _LIST_STRING_FIELDS:
            raw[raw_name] = value
        # Special Case: Keywords
        # The keywords field is implemented in the metadata spec as a str,
        # but it conceptually is a list of strings, and is serialized using
        # ", ".join(keywords), so we'll do some light data massaging to turn
        # this into what it logically is.
        elif raw_name == "keywords" and len(value) == 1:
            raw[raw_name] = _parse_keywords(value[0])
        # Special Case: Project-URL
        # The project urls is implemented in the metadata spec as a list of
        # specially-formatted strings that represent a key and a value, which
        # is fundamentally a mapping, however the email format doesn't support
        # mappings in a sane way, so it was crammed into a list of strings
        # instead.
        #
        # We will do a little light data massaging to turn this into a map as
        # it logically should be.
        elif raw_name == "project_urls":
            try:
                raw[raw_name] = _parse_project_urls(value)
            except KeyError:
                unparsed[name] = value
        # Nothing that we've done has managed to parse this, so it'll just
        # throw it in our unparseable data and move on.
        else:
            unparsed[name] = value

    # We need to support getting the Description from the message payload in
    # addition to getting it from the headers. This does mean, though, there
    # is the possibility of it being set both ways, in which case we put both
    # in 'unparsed' since we don't know which is right.
    try:
        payload = _get_payload(parsed, data)
    except ValueError:
        unparsed.setdefault("description", []).append(
            parsed.get_payload(decode=isinstance(data, bytes))
        )
    else:
        if payload:
            # Check to see if we've already got a description, if so then both
            # it, and this body move to unparseable.
            if "description" in raw:
                description_header = cast(str, raw.pop("description"))
                unparsed.setdefault("description", []).extend(
                    [description_header, payload]
                )
            elif "description" in unparsed:
                unparsed["description"].append(payload)
            else:
                raw["description"] = payload

    # We need to cast our `raw` to a metadata, because a TypedDict only support
    # literal key names, but we're computing our key names on purpose, but the
    # way this function is implemented, our `TypedDict` can only have valid key
    # names.
    return cast(RawMetadata, raw), unparsed