| #!/usr/bin/env python3 |
| # -*- coding: utf-8 -*- |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| r""" |
| ======== |
| DKIM-IDs |
| ======== |
| |
| The recommended Ponymail ID generator is the DKIM-ID generator. It |
| simplifies a message using an algorithm based on DKIM relaxed/simple |
| canonicalisation, hashes it with an SHA-256 HMAC, and then encodes the |
| truncated digest using base32 with the custom alphabet ``0-9 b-d f-h |
| j-t v-z`` and the padding stripped. |
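
For orientation, the whole pipeline can be sketched in terms of the
functions exercised by this suite; the function name below is ours,
and the empty HMAC key reflects the no list ID case checked near the
end of the suite::

    from hmac import digest as hmac_digest
    from dkim_id import pibble32, rfc6376_rascal

    def dkim_id_sketch(data: bytes) -> str:
        # Reform and canonicalise ("rascal") the message.
        _lid, rascal = rfc6376_rascal(data)
        # HMAC the result, truncating the digest to 160 bits; the
        # empty key matches the no list ID property tested below.
        digest = hmac_digest(b"", rascal, "sha256")[:160 // 8]
        # Encode with the custom base32 alphabet; no padding arises
        # because 160 bits is a whole number of five byte groups.
        return pibble32(digest)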
| |
| |
| DKIM-IDs test suite |
| =================== |
| |
| As well as plain Python doctests, we also use the hypothesis package |
| to check properties of the DKIM-ID generator algorithm. This has the |
| advantage of providing a kind of partial specification as well as |
| testing the code. The suite can be run using:: |
| |
| PYTHONPATH=../tools python3 dkim_id_test.py |
| |
| And exported to HTML using docutils and the command:: |
| |
| HTML=1 PYTHONPATH=../tools \ |
| python3 dkim_id_test.py > dkim_id_test.html |
| |
| |
| RFC5322 line ending normalisation |
| --------------------------------- |
| |
| The first step of generating a DKIM-ID is to convert all line endings |
| of the input to CRLF by upgrading bare CR and LF characters. |
| |
| If the message is submitted to the Signer with any local encoding |
| that will be modified before transmission, that modification to |
| canonical [RFC5322] form MUST be done before signing. In particular, |
| bare CR or LF characters (used by some systems as a local line |
| separator convention) MUST be converted to the SMTP-standard CRLF |
| sequence before the message is signed. |
| |
| https://tools.ietf.org/html/rfc6376#section-5.3 |
| |
| We follow the algorithm used in dkim_header in dkim.c in version 2.10 |
| of libopendkim, the implementation of which is this, reformatted for |
| brevity:: |
| |
| for (p = hdr; p < q && *p != '\0'; p++) { |
| if (*p == '\n' && prev != '\r') { /* bare LF */ |
| dkim_dstring_catn(tmphdr, CRLF, 2); |
| } else if (prev == '\r' && *p != '\n') { /* bare CR */ |
| dkim_dstring_cat1(tmphdr, '\n'); |
| dkim_dstring_cat1(tmphdr, *p); |
| } else { /* other */ |
| dkim_dstring_cat1(tmphdr, *p); |
| } |
| prev = *p; |
| } |
| if (prev == '\r') { /* end CR */ |
| dkim_dstring_cat1(tmphdr, '\n'); |
| } |
| |
| Our version of this algorithm is called ``rfc5322_endings``. |
| |
| >>> from dkim_id import rfc5322_endings |
| |
| It works on bytes and produces bytes. |
| |
| We test properties of the DKIM-ID related functions not by formally |
| proving them, as there are no mainstream frameworks for formal |
| verification of Python (though Nagini may be worth trying), but |
| instead by fuzzing with hypothesis as a property checker. |
| |
| >>> from hypothesis import given |
| >>> from hypothesis.strategies import from_regex as regex, text |
| |
| The regex producer outputs str instances, and we use it because |
| hypothesis does not allow us to use patterns or other smart generation |
| with only bytes. Therefore we use the smart str generators and then |
| convert the output to bytes using cp1252 or utf-8 encoding as |
| necessary. |
| |
| >>> def cp1252(text: str) -> bytes: |
| ... return bytes(text, "cp1252") |
>>> def utf8(text: str) -> bytes:
| ... return bytes(text, "utf-8") |
| |
| We'll also use our own decorator to make tests run automatically. |
| |
| >>> def thesis(hypo, *args): |
| ... def decorator(func): |
| ... func = hypo(*args)(func) |
| ... func() |
| ... return func |
| ... return decorator |
| |
| Since ``rfc5322_endings`` only converts endings, sequences containing |
| neither CR nor LF are unaffected. |
| |
| >>> @thesis(given, regex(r"\A[^\r\n]*\Z")) |
| ... def non_cr_lf_unaffected(text: str) -> None: |
| ... data: bytes = utf8(text) |
| ... assert data == rfc5322_endings(data), repr(data) |
| |
The algorithm is that any LF not preceded by a CR has a CR inserted
before it, and any CR not followed by an LF has an LF inserted after
it. Therefore we expect the result to always contain the same number
of CRs and LFs.
| |
| >>> @thesis(given, text(alphabet="\r\n.")) |
| ... def cr_lf_same_cardinality(text: str) -> None: |
| ... data: bytes = rfc5322_endings(utf8(text)) |
| ... crs = data.count(b"\r") |
| ... lfs = data.count(b"\n") |
| ... assert crs == lfs, repr(data) |
| |
| That the number of CRs or LFs will never be reduced. |
| |
| >>> @thesis(given, text(alphabet="\r\n.")) |
| ... def cr_lf_no_reduce(text: str) -> None: |
| ... a: bytes = utf8(text) |
| ... b: bytes = rfc5322_endings(a) |
...     assert b.count(b"\r") >= a.count(b"\r"), repr(a)
...     assert b.count(b"\n") >= a.count(b"\n"), repr(a)
| |
| That if we delete all CRLF subsequences, there will be no CR or LFs |
| remaining in the sequence. |
| |
| >>> @thesis(given, text(alphabet="\r\n.")) |
| ... def only_crlf_subsequences(text: str) -> None: |
| ... data: bytes = rfc5322_endings(utf8(text)) |
| ... data = data.replace(b"\r\n", b".") |
| ... assert data.count(b"\r") == 0, repr(data) |
| ... assert data.count(b"\n") == 0, repr(data) |
| |
| That if we split on CR or LF sequences, the input and output will be |
| the same. |
| |
| >>> @thesis(given, text(alphabet="\r\nabc. ")) |
| ... def non_crlf_subsequences(text: str) -> None: |
| ... def split(data: bytes): |
| ... data = data.replace(b"\r", b"\n") |
| ... while b"\n\n" in data: |
| ... data = data.replace(b"\n\n", b"\n") |
| ... return data.strip(b"\n").split(b"\n") |
| ... data: bytes = utf8(text) |
| ... expected = split(data) |
| ... normed: bytes = rfc5322_endings(data) |
| ... assert split(normed) == expected, repr(data) |
| |
| And that all of this is equivalent to saying that every CR is now |
| followed by LF and every LF is preceded by CR. |
| |
| >>> @thesis(given, text(alphabet="\r\n.")) |
| ... def cr_and_lf_pairs(text: str) -> None: |
| ... data: bytes = rfc5322_endings(utf8(text)) |
| ... if b"\r" in data: |
| ... datum: bytes |
| ... for datum in data.split(b"\r")[1:]: |
| ... assert datum.startswith(b"\n"), repr(data) |
| ... if b"\n" in data: |
| ... datum: bytes |
| ... for datum in data.split(b"\n")[:-1]: |
| ... assert datum.endswith(b"\r"), repr(data) |
| |
| Most importantly, the number of CRLFs in the output must be equal to |
| the number of CRLFs in the input, plus the number of individual CRs |
| and LFs once the CRLFs have been removed. |
| |
| >>> @thesis(given, text(alphabet="\r\n.")) |
| ... def crlf_count(text: str) -> None: |
| ... nocrlf = text.replace("\r\n", "") |
| ... expected = text.count("\r\n") |
| ... expected += nocrlf.count("\r") |
| ... expected += nocrlf.count("\n") |
| ... data: bytes = rfc5322_endings(utf8(text)) |
| ... assert data.count(b"\r\n") == expected, repr(text) |
| |
| We'll now give a few examples. First, with no CR or LF. |
| |
| >>> rfc5322_endings(b"") |
| b'' |
| >>> rfc5322_endings(b"abc") |
| b'abc' |
| |
| All of the following are equivalent to CRLF. |
| |
| >>> rfc5322_endings(b"\r") |
| b'\r\n' |
| >>> rfc5322_endings(b"\n") |
| b'\r\n' |
| >>> rfc5322_endings(b"\r\n") |
| b'\r\n' |
| |
| And the following are equivalent to CRLF CRLF. |
| |
| >>> rfc5322_endings(b"\r\r") |
| b'\r\n\r\n' |
| >>> rfc5322_endings(b"\n\n") |
| b'\r\n\r\n' |
| >>> rfc5322_endings(b"\n\r") |
| b'\r\n\r\n' |
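
All of the properties and examples above are consistent with treating
``rfc5322_endings`` as equivalent to the following regular expression
sketch (ours, not the implementation), which rewrites every CRLF,
bare CR, and bare LF to a CRLF::

    import re

    def endings_sketch(data: bytes) -> bytes:
        # The leftmost alternative wins, so an existing CRLF is
        # matched as a pair rather than as a bare CR then bare LF.
        return re.sub(b"\r\n|\r|\n", b"\r\n", data)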
| |
| |
| DKIM relaxed head canonicalisation |
| ---------------------------------- |
| |
| The next important component of DKIM-ID generation is DKIM head |
| canonicalisation using the relaxed canonicalisation algorithm. The |
| algorithm is not trivial, consisting of five separate steps: |
| |
| * Convert all header field names (not the header field values) to |
| lowercase. For example, convert "SUBJect: AbC" to "subject: AbC". |
| |
| * Unfold all header field continuation lines as described in |
| [RFC5322]; in particular, lines with terminators embedded in |
| continued header field values (that is, CRLF sequences followed by |
| WSP) MUST be interpreted without the CRLF. Implementations MUST |
| NOT remove the CRLF at the end of the header field value. |
| |
| * Convert all sequences of one or more WSP characters to a single SP |
| character. WSP characters here include those before and after a |
| line folding boundary. |
| |
| * Delete all WSP characters at the end of each unfolded header field |
| value. |
| |
| * Delete any WSP characters remaining before and after the colon |
| separating the header field name from the header field value. The |
| colon separator MUST be retained. |
| |
| https://tools.ietf.org/html/rfc6376#section-3.4.2 |
| |
| We'll use hypothesis to check each of these properties in turn. The |
| canonicalisation function is called ``rfc6376_relaxed_head``. |
| |
| >>> from dkim_id import rfc6376_relaxed_head |
| |
| And to test it, we'll need the lists producer from hypothesis. |
| |
| >>> from hypothesis.strategies import lists |
| >>> chars = text(alphabet="\x00\t\r\n\f .ABCabc\xc0").map(cp1252) |
| >>> headers = lists(lists(chars, min_size=2, max_size=2)) |
| |
| |
| Step one |
| ~~~~~~~~ |
| |
Step one is to convert only the header field names to lowercase.
Since other normalisation steps will also occur, to test this step in
isolation we compare only the alphabetical octets.
| |
| >>> def alphabetical(data: bytes) -> bytes: |
| ... from typing import Set |
| ... upper: bytes = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| ... alpha: Set[int] = set(upper + upper.lower()) |
| ... return bytes([b for b in data if b in alpha]) |
| |
| Then we can make a direct comparison. |
| |
| >>> @thesis(given, headers) |
| ... def step_1_field_names_lower(headers) -> None: |
| ... ks = [alphabetical(kv[0]) for kv in headers] |
| ... for i, (k, v) in enumerate(rfc6376_relaxed_head(headers)): |
| ... assert ks[i].lower() == alphabetical(k), repr(headers) |
| |
| Including that values use the same case. |
| |
| >>> @thesis(given, headers) |
| ... def step_1_field_values_case(headers) -> None: |
| ... vs = [kv[1] for kv in headers] |
| ... alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| ... cases = set(alpha + alpha.lower()) |
| ... for i, (k, v) in enumerate(rfc6376_relaxed_head(headers)): |
| ... assert (set(vs[i]) & cases) == (set(v) & cases), repr(headers) |
| |
| |
| Step two |
| ~~~~~~~~ |
| |
| Step two is to unfold continuations by removing CRLF except at the |
| end. This would only produce consistent results if the value is in |
| ``rfc5322_endings`` normal form, so we extend the step to remove all |
| CR or LF, except for a trailing CRLF in the header field value. |
| |
| >>> rfc6376_relaxed_head([[b"", b"\r"]]) |
| [[b'', b'']] |
| >>> rfc6376_relaxed_head([[b"", b"\n"]]) |
| [[b'', b'']] |
| >>> rfc6376_relaxed_head([[b"", b"\r\n"]]) |
| [[b'', b'\r\n']] |
| >>> rfc6376_relaxed_head([[b"", b"...\r"]]) |
| [[b'', b'...']] |
| >>> rfc6376_relaxed_head([[b"", b"...\n"]]) |
| [[b'', b'...']] |
| >>> rfc6376_relaxed_head([[b"", b"...\r\n"]]) |
| [[b'', b'...\r\n']] |
| >>> rfc6376_relaxed_head([[b"", b"a\rb\r\n"]]) |
| [[b'', b'ab\r\n']] |
| >>> rfc6376_relaxed_head([[b"", b"a\nb\r\n"]]) |
| [[b'', b'ab\r\n']] |
| >>> rfc6376_relaxed_head([[b"", b"a\r\nb\r\n"]]) |
| [[b'', b'ab\r\n']] |
| |
| We do this even though, for example, ``b"a\r\nb\r\n"`` is not a |
| possible header field value because the first CRLF is not followed by |
| a space or a tab, meaning that it is not a continuation. |
| |
| We apply the CR and LF removal to header field names too, following |
| libopendkim, although ``rfc6376_relaxed_head`` should never encounter |
| CR or LF in a header field name during DKIM-ID generation. The removal |
| of CR and LF in header names includes CRLF at the end of a header |
| field name, unlike in a header field value where trailing CRLF is |
| retained. |
| |
| >>> rfc6376_relaxed_head([[b"...\r\n", b""]]) |
| [[b'...', b'']] |
| |
| >>> header_text = (text(alphabet="\x00\t\r\n\f .ABCabc\xc0") |
| ... .map(cp1252) |
| ... .map(rfc5322_endings)) |
| >>> wild_headers = lists(lists(header_text, min_size=2, max_size=2)) |
| |
| The ``wild_headers`` producer gives us headers which have not been |
| normalised, and can therefore be used to test the extended step, |
| e.g. for CR and LF deletion. |
| |
| >>> @thesis(given, wild_headers) |
| ... def step_2_cr_lf_deletion(headers) -> None: |
| ... for (k, v) in rfc6376_relaxed_head(headers): |
| ... assert b"\r" not in k, repr(headers) |
| ... assert b"\n" not in k, repr(headers) |
| ... if v.endswith(b"\r\n"): |
| ... v = v[:-2] |
| ... assert b"\r" not in v, repr(headers) |
| ... assert b"\n" not in v, repr(headers) |
| |
| We can also test that any trailing CRLF in a header field value is |
| retained. |
| |
| >>> @thesis(given, wild_headers) |
| ... def step_2_field_values_trailing_crlf(headers) -> None: |
| ... vs = [kv[1] for kv in headers] |
| ... for i, (k, v) in enumerate(rfc6376_relaxed_head(headers)): |
| ... a = vs[i].endswith(b"\r\n") |
| ... b = v.endswith(b"\r\n") |
| ... assert a == b, repr(headers) |
| |
| |
| Step three |
| ~~~~~~~~~~ |
| |
| Step three is to reduce all sequences of spaces or tabs to a single |
| space, i.e. all sequences that match ``[ \t]+`` must be replaced with |
| ``" "``. The RFC sounds like it's saying that step three should be |
| applied to both names and values, but may regard the issue as moot |
| since WSP is not allowed in header names according to RFC 5322: |
| |
| [...] A field name MUST be composed of printable US-ASCII characters |
| (i.e., characters that have values between 33 and 126, inclusive), |
| except colon. |
| |
| https://tools.ietf.org/html/rfc5322#section-2.2 |
| |
| Since RFC 6376 says to convert to RFC 5322 normal form first, that |
| implies removing all characters outside of the range 33 to 126. It is |
| not clear that ignoring characters out of this range, e.g. converting |
| "T\\x00o" to "To", has no detrimental security properties. Neither RFC |
4409 section 8 nor RFC 6376 sections 3.8 and 8 discuss this issue. The
| latter simply says that "Signers and Verifiers SHOULD take reasonable |
| steps to ensure that the messages they are processing are valid". |
| |
| In any case, libopendkim also doesn't delete all characters outside |
| the range 33 to 126 in header field names. Instead, it deletes only |
| tab, CR, LF, and space. But RFC 6376 also says in step five to delete |
| "any WSP characters remaining before and after the colon", with |
| "remaining" being the operative word here. This suggests that it did |
| consider the earlier step three to apply to headers too, otherwise the |
| WSP characters would not be "remaining" ones. But if it considered the |
| earlier step three to apply to header field names, then it must also |
| consider that there may be spaces and tabs inside header field names |
| even after RFC 5322 normalisation. Hence, we consider that RFC 6376 is |
| primarily suggesting to apply RFC 5322 *line ending* normalisation, |
| which notably it introduces by saying "in particular" in section |
| 5.3. We also consider that it suggests reducing spaces and tabs to a |
| single space in step three, answering the question of what to do with |
| "T o" (it remains "T o") and "T\\x00o" (it remains "T\\x00o"). |
| |
| In summary, we follow RFC 6376 as literally as possible, contrary to |
| libopendkim in this case, and apply step three to header field names. |
| |
| >>> rfc6376_relaxed_head([[b"Spaced \t \t\tKey", b"Value\r\n"]]) |
| [[b'spaced key', b'Value\r\n']] |
| |
With this, ``rfc6376_relaxed_head`` accepts arbitrary bytes for names
and values, and deals with them in a consistent and considered way,
including tab, space, and other values outside the range 33 to 126.
This also includes retaining colon and semicolon, even though they
are problematic in DKIM signing.
| |
| >>> rfc6376_relaxed_head([[b":", b"Value\r\n"]]) |
| [[b':', b'Value\r\n']] |
| >>> rfc6376_relaxed_head([[b";", b"Value\r\n"]]) |
| [[b';', b'Value\r\n']] |
| |
In the component of the DKIM-ID generator which uses header
canonicalisation, a header field name can never contain a colon, but
it can contain a semicolon. Such a header could not be signed using
DKIM, as DKIM uses semicolon as the separator in the list of headers
which have been signed, but it will be ignored in DKIM-ID generation
as long as ``";"`` is not manually specified as a subset header to
keep, which it is not by default. Another problematic header which is
possible is the empty header. The case of a header name starting with
WSP also doesn't arise, because such lines are continuation lines.
| |
| Overall, there should never be a tab in canonicalised header field |
| names and values, and there should never be a double space in |
| canonicalised header field names and values. |
| |
| >>> @thesis(given, wild_headers) |
| ... def step_3_field_values(headers) -> None: |
| ... for (k, v) in rfc6376_relaxed_head(headers): |
| ... assert b"\t" not in k, repr(headers) |
| ... assert b"\t" not in v, repr(headers) |
...         assert b"  " not in k, repr(headers)
...         assert b"  " not in v, repr(headers)
| |
| Internally, the function that performs this step is called |
| ``rfc6376_shrink_head``. |
| |
| >>> from dkim_id import rfc6376_shrink_head |
| |
And it should work like a more efficient version of iteratively
removing double spaces, except that it also strips leading and
trailing whitespace, which belongs to steps four and five.
| |
| >>> @thesis(given, wild_headers) |
| ... def step_3_reduce_iterative(headers) -> None: |
| ... for (k, v) in headers: |
| ... kk = k.replace(b"\t", b" ") |
| ... vv = v.replace(b"\t", b" ") |
...         while b"  " in kk:
...             kk = kk.replace(b"  ", b" ")
...         kk = kk.strip(b" ")
...         while b"  " in vv:
...             vv = vv.replace(b"  ", b" ")
...         vv = vv.strip(b" ")
| ... assert rfc6376_shrink_head(k) == kk, repr(k) |
| ... assert rfc6376_shrink_head(v) == vv, repr(v) |
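
Equivalently, the property above says that ``rfc6376_shrink_head``
behaves like the following regular expression sketch (ours, not the
implementation)::

    import re

    def shrink_sketch(data: bytes) -> bytes:
        # Collapse every run of spaces and tabs into one space, then
        # trim single spaces left at either end.
        return re.sub(rb"[ \t]+", b" ", data).strip(b" ")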
| |
| This also means that leading whitespace is removed from the beginnings |
| of header names. Again this is not a case which could occur during |
| DKIM-ID generation, in this case because such a name would have been |
| regarded as a continuation, even at the beginning of a message where |
| it is regarded as the continuation of the empty name. |
| |
| >>> rfc6376_relaxed_head([[b" Key", b"Value\r\n"]]) |
| [[b'key', b'Value\r\n']] |
| |
| |
| Step four |
| ~~~~~~~~~ |
| |
| Step four says that spaces and tabs at the end of a header field value |
| are removed. |
| |
| It is possible to give a header field value without a trailing CRLF to |
| ``rfc6376_relaxed_head``, and so any trailing tabs or spaces there |
| must be removed. |
| |
| >>> rfc6376_relaxed_head([[b"", b"Value\t "]]) |
| [[b'', b'Value']] |
| |
| But the RFC 5322 message grammar states that all headers shall end |
| with CRLF. An overly literal reading of RFC 6376 therefore implies |
| that spaces and tabs are never removed from the end of a field value, |
| because the value must always end with CRLF according to RFC 5322. But |
| if they were never removed then there would be no need for the step, |
| so the implication is that the "end" for the purposes of this step is |
| before the trailing CRLF. |
| |
| A reading of ``dkim_canon_header_string`` in libopendkim suggests that |
| it could leave a header ending with space CRLF, but this hasn't been |
| tested. We remove the space correctly. |
| |
| >>> rfc6376_relaxed_head([[b"Key", b"Value \r\n"]]) |
| [[b'key', b'Value\r\n']] |
| |
| Indeed, a header field value must never end with space or tab. |
| |
| >>> @thesis(given, wild_headers) |
| ... def step_4_field_values_ends(headers) -> None: |
| ... for (k, v) in rfc6376_relaxed_head(headers): |
| ... assert not v.endswith(b" "), repr(headers) |
| ... assert not v.endswith(b"\t"), repr(headers) |
| |
| And must never end with space CRLF or tab CRLF. |
| |
| >>> @thesis(given, wild_headers) |
| ... def step_4_field_values_ends_2(headers) -> None: |
| ... for (k, v) in rfc6376_relaxed_head(headers): |
| ... assert not v.endswith(b" \r\n"), repr(headers) |
| ... assert not v.endswith(b"\t\r\n"), repr(headers) |
| |
Indeed, it should never be possible for a value to contain, let
alone end with, a tab after step three, since that step replaces all
sequences of spaces and tabs with a single space, leaving no tabs at
all in the output before it reaches step four.
| |
| |
| Step five |
| ~~~~~~~~~ |
| |
| Step five is to remove spaces and tabs from the end of header names, |
| and from the start of header values. Again, all tabs should have been |
| removed anyway in step three, so this step could have specified only |
| removing spaces. |
| |
| >>> @thesis(given, wild_headers) |
| ... def step_5_wsp_around_colon(headers) -> None: |
| ... for (k, v) in rfc6376_relaxed_head(headers): |
| ... assert not k.endswith(b" "), repr(headers) |
| ... assert not k.endswith(b"\t"), repr(headers) |
| ... assert not v.startswith(b" "), repr(headers) |
| ... assert not v.startswith(b"\t"), repr(headers) |
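
Putting the five steps together with the extensions described above,
the treatment of a single name and value pair can be sketched as
follows; this is an illustration consistent with the examples in this
suite, not the ``rfc6376_relaxed_head`` implementation::

    import re
    from typing import List

    def relaxed_pair_sketch(name: bytes, value: bytes) -> List[bytes]:
        # Step one: lowercase the field name only.
        name = name.lower()
        # Step two, extended: delete CR and LF, keeping any trailing
        # CRLF on the value but not on the name.
        trailing: bool = value.endswith(b"\r\n")
        name = name.replace(b"\r", b"").replace(b"\n", b"")
        value = value.replace(b"\r", b"").replace(b"\n", b"")
        # Step three: reduce runs of space and tab to a single
        # space, in the name as well as in the value.
        name = re.sub(rb"[ \t]+", b" ", name)
        value = re.sub(rb"[ \t]+", b" ", value)
        # Steps four and five: strip WSP at the value end and on
        # either side of the colon.
        name = name.strip(b" ")
        value = value.strip(b" ")
        return [name, value + (b"\r\n" if trailing else b"")]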
| |
| |
| General properties |
| ~~~~~~~~~~~~~~~~~~ |
| |
We can join headers back together in order to check their combined size.
| |
| >>> from dkim_id import rfc6376_join |
| |
| This can be used to test one of the general properties of |
| ``rfc6376_relaxed_head``, that it never enlarges the data given to it. |
| |
| >>> @thesis(given, wild_headers) |
| ... def head_never_enlarged(headers) -> None: |
| ... a: bytes = rfc6376_join(headers) |
| ... h: List[List[bytes]] = rfc6376_relaxed_head(headers) |
| ... b: bytes = rfc6376_join(h) |
| ... assert len(a) >= len(b), repr(headers) |
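
Judging from how it is used here and in the rest of this suite,
``rfc6376_join`` concatenates each ``name:value`` pair and appends
the boundary CRLF and the body only when a body is present; the
following is an inferred sketch, not the implementation::

    from typing import List, Optional

    def join_sketch(headers: List[List[bytes]],
            body: Optional[bytes] = None) -> bytes:
        # Each header contributes name, colon, value; the value
        # already carries its own trailing CRLF when it has one.
        head: bytes = b"".join(k + b":" + v for k, v in headers)
        # Only a present body adds the header and body boundary.
        return head if body is None else head + b"\r\n" + body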
| |
Perhaps the most important general property of canonicalisation is
that once canonicalised, attempting to canonicalise again produces
the same data. In other words canonicalisation is idempotent, and
data cannot be canonicalised further.
| |
| >>> @thesis(given, wild_headers) |
| ... def recanonicalisation_is_identity(headers) -> None: |
| ... a = rfc6376_relaxed_head(headers) |
| ... b = rfc6376_relaxed_head(a) |
| ... assert a == b, repr(headers) |
| |
| |
| Simple body canonicalisation |
| ---------------------------- |
| |
| The body canonicalisation function is called ``rfc6376_simple_body``. |
| |
| >>> from dkim_id import rfc6376_simple_body |
| |
It maps an empty body to CRLF, and then ensures that there is at
most one CRLF at the end of the body. A consequence is that the
output is never empty.
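
Its documented behaviour is consistent with the following sketch
(ours, not the implementation)::

    def simple_body_sketch(body: bytes) -> bytes:
        # Reduce any run of CRLFs at the end of the body to one.
        while body.endswith(b"\r\n\r\n"):
            body = body[:-2]
        # The empty body becomes a single CRLF.
        return body or b"\r\n"

The properties below check these behaviours directly.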
| |
| >>> @thesis(given, chars) |
| ... def body_not_empty(body) -> None: |
| ... body_c = rfc6376_simple_body(body) |
| ... assert len(body_c) > 0, repr(body) |
| |
| And that the output never ends CRLF CRLF. |
| |
| >>> @thesis(given, chars) |
| ... def body_no_trailing_crlfcrlf(body) -> None: |
| ... body_c = rfc6376_simple_body(body) |
...     assert not body_c.endswith(b"\r\n\r\n"), repr(body)
| |
But it could end with LF CRLF where the LF is not preceded by a CR,
or with CR CRLF, if the input were not RFC 5322 ending normalised.
| |
| >>> rfc6376_simple_body(b"Non-CR\n\r\n") |
| b'Non-CR\n\r\n' |
| >>> rfc6376_simple_body(b"CR\r\r\n") |
| b'CR\r\r\n' |
| |
| The function enlarges data only when its input is empty. |
| |
| >>> @thesis(given, chars.filter(lambda b: b != b"")) |
| ... def body_enlarging_edge(body) -> None: |
| ... body_c = rfc6376_simple_body(body) |
| ... assert len(body_c) <= len(body), repr(body) |
| |
The prefix of the output, up to any trailing CRLF, is shared with
the input.
| |
| >>> @thesis(given, chars) |
| ... def body_same_prefix(body) -> None: |
| ... body_c = rfc6376_simple_body(body) |
| ... size_c = len(body_c) |
| ... if body_c.endswith(b"\r\n"): |
| ... size_c -= 2 |
| ... assert body[:size_c] == body_c[:size_c], repr(body) |
| |
| And any remainder must consist solely of CRLFs in both input and output. |
| |
| >>> @thesis(given, chars) |
| ... def body_suffix_crlfs(body) -> None: |
| ... body_c = rfc6376_simple_body(body) |
| ... size_c = len(body_c) |
| ... if body_c.endswith(b"\r\n"): |
| ... size_c -= 2 |
| ... assert not body[size_c:].replace(b"\r\n", b""), repr(body) |
| ... assert not body_c[size_c:].replace(b"\r\n", b""), repr(body) |
| |
| |
| Splitting |
| --------- |
| |
| The main parser is called ``rfc6376_split``. |
| |
| >>> from dkim_id import rfc6376_split |
| |
| It does not perform canonicalisation. If there is no CRLF header and |
| body boundary separator, then it returns None for the body. |
| |
| Each header field is defined by RFC 5322 as ending with CRLF which is |
| inclusive to that header field. Any CRLF following that indicates the |
| start of a body, which may be empty. Therefore, in the case of the |
| empty document there are no headers and no body. |
| |
| >>> rfc6376_split(b"") |
| ([], None) |
| |
| In the case of just CRLF there are no headers, since they must contain |
| at least one character before their CRLF. RFC 5322 section 2.2 says |
| that header fields "are lines beginning with a field name, followed by |
| a colon", which implies at least the presence of a colon, and section |
| 3.6.8 says "field-name = 1*ftext" which means the name must include at |
| least one printable character. As there is nothing after the CRLF in |
| the case of just a CRLF, there is an empty body. |
| |
| >>> rfc6376_split(b"\r\n") |
| ([], b'') |
| |
| In the case of CRLF CRLF there are no headers, and there is a body |
| which is CRLF. |
| |
| >>> rfc6376_split(b"\r\n\r\n") |
| ([], b'\r\n') |
| |
| And then this pattern repeats. |
| |
| >>> rfc6376_split(b"\r\n\r\n\r\n") |
| ([], b'\r\n\r\n') |
| >>> rfc6376_split(b"\r\n\r\n\r\n\r\n") |
| ([], b'\r\n\r\n\r\n') |
| |
| When we have a header, a single trailing CRLF is regarded as part of |
| that header. This means that there is no body. |
| |
| >>> rfc6376_split(b"Key:Value\r\n") |
| ([[b'Key', b'Value\r\n']], None) |
| |
| But appending another CRLF to that gives an empty body. |
| |
| >>> rfc6376_split(b"Key:Value\r\n\r\n") |
| ([[b'Key', b'Value\r\n']], b'') |
| |
| As ``rfc6376_split`` does not perform canonicalisation, we have the |
| edge cases of isolated CRs and LFs. There should never be isolated CRs |
| and LFs in DKIM-ID generation because RFC 5322 ending normalisation is |
| applied before splitting, but in such cases where the function is |
| called with isolated CRs and LFs they are considered as header field |
| name or header field value data. |
| |
| >>> rfc6376_split(b"\r") |
| ([[b'\r', b'']], None) |
| >>> rfc6376_split(b"\n") |
| ([[b'\n', b'']], None) |
| >>> rfc6376_split(b"\n\r\n") |
| ([[b'\n', b'\r\n']], None) |
| >>> rfc6376_split(b"\r\r\n") |
| ([[b'\r', b'\r\n']], None) |
| >>> rfc6376_split(b"\r...\r\n") |
| ([[b'\r...', b'\r\n']], None) |
| >>> rfc6376_split(b"\n...\r\n") |
| ([[b'\n...', b'\r\n']], None) |
| >>> rfc6376_split(b"\n:\n\r\n") |
| ([[b'\n', b'\n\r\n']], None) |
| >>> rfc6376_split(b"\n...:\n...\r\n") |
| ([[b'\n...', b'\n...\r\n']], None) |
| |
| A header field name without any header field value is just regarded as |
| being the same as one with an empty value. |
| |
| >>> rfc6376_split(b"Key\r\n\r\n") |
| ([[b'Key', b'\r\n']], b'') |
| >>> rfc6376_split(b"Key:\r\n\r\n") |
| ([[b'Key', b'\r\n']], b'') |
| |
| For greater consistency with how bodies are handled, the former could |
| have been interpreted as ``[b'Key', None]``, but this would increase |
| the complexity of the code, and lead to the question of where the |
| trailing CRLF ought to be stored. |
| |
| In some cases, one of the mbox formats may accidentally be passed to |
| ``rfc6376_split``, containing a line like this in its headers, usually |
| at the start but potentially later in the headers too: |
| |
| "From MAILER-DAEMON Fri Jul 8 12:08:34 2011" |
| |
| Which would be interpreted as a header field whose name is: |
| |
| "From MAILER-DAEMON Fri Jul 8 12" |
| |
| And which could also collect any following continuation line. |
| |
| >>> rfc6376_split(b"To:You\r\nFrom Me\r\n More\r\n") |
| ([[b'To', b'You\r\n'], [b'From Me', b'\r\n More\r\n']], None) |
| |
| This is safe because even after canonicalisation it is not possible to |
| confuse a ``"From "`` line with a ``"From:"`` header field, unless no |
| text follows the ``"From "`` and it is followed by a continuation. If |
| no text follows the ``"From "`` then it is not in one of the mbox |
| formats anyway. And if it is followed by a continuation, then |
| interpreting it as a From header field is reasonable. |
| |
| Similarly to a name without a value, a continuation value without a |
| preceding line is treated as though the header field name is empty. |
| |
| >>> rfc6376_split(b" More\r\n") |
| ([[b'', b' More\r\n']], None) |
| |
| An alternative to this would be to treat the line itself as a header |
| field name, but then that creates the issue of whether to remove the |
| leading whitespace, and whether to parse a colon in it. It would also |
| make it inconsistent with all other field names, which must not start |
| with a space. |
| |
The type of the body, the second element of the tuple returned from
``rfc6376_split``, directly correlates to whether the input starts
with CRLF or whether CRLF CRLF occurs in the input. If either holds,
then we say that the input message contains a header and body
boundary.
| |
| >>> def contains_boundary(data: bytes) -> bool: |
| ... return data.startswith(b"\r\n") or (b"\r\n\r\n" in data) |
| |
| We use a simple subset of all possible inputs to check this |
| correlation. |
| |
| >>> text_message = (text(alphabet="\x00\t\r\n\f .:ABCabc\xc0") |
| ... .map(cp1252)) |
| |
| Although ``rfc6376_split`` should always take input in RFC 5322 ending |
| normal form, we test without that normal form. |
| |
| >>> @thesis(given, text_message) |
| ... def body_type_correlation(data) -> None: |
| ... headers, body = rfc6376_split(data) |
| ... body_not_none = (body is not None) |
| ... assert contains_boundary(data) is body_not_none, repr(data) |
| |
| If the input is not RFC 5322 normalised, then CR and LF can appear in |
| header field names, as already demonstrated. Colon, however, should |
| never appear in a header field name. |
| |
| >>> @thesis(given, text_message) |
| ... def no_split_colon(data) -> None: |
| ... headers, body = rfc6376_split(data) |
| ... for (k, v) in headers: |
| ... assert b":" not in k, repr(data) |
| |
| And if the input is RFC 5322 normalised, then colon, CR, and LF should |
| never appear in header field names. |
| |
| >>> @thesis(given, text_message) |
| ... def no_normal_split_chars(data) -> None: |
| ... data = rfc5322_endings(data) |
| ... headers, body = rfc6376_split(data) |
| ... for (k, v) in headers: |
| ... assert b":" not in k, repr(data) |
| ... assert b"\r" not in k, repr(data) |
| ... assert b"\n" not in k, repr(data) |
| |
| |
| Canonicalised splitting |
| ----------------------- |
| |
| The version of the main parser which performs canonicalisation is |
| called ``rfc6376_split_canon``. |
| |
| >>> from dkim_id import rfc6376_split_canon |
| |
It takes ``head_subset``, ``head_canon``, and ``body_canon``
arguments. The first is a set of lower case header field names, as
bytes, to keep when parsing the headers. If ``head_subset`` is None,
all headers are retained, which is useful for testing. The second is
a boolean of whether to apply ``rfc6376_relaxed_head``, and the third
is a boolean of whether to apply ``rfc6376_simple_body``, potentially
modifying the headers too for consistency.
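
As a usage sketch, a call retaining only the To header might look
like the following; the expected results are inferred from the
properties and examples in this suite, and the ``X-Junk`` header is a
hypothetical example of ours::

    from dkim_id import rfc6376_split_canon

    headers, body = rfc6376_split_canon(
        b"To: You\r\nX-Junk: Discard\r\n\r\nBody",
        head_subset={b"to"}, head_canon=True, body_canon=True)
    # headers should then be [[b'to', b'You\r\n']], the X-Junk
    # header having been discarded, and body should be b'Body'.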
| |
| If there was no body, i.e. no header body boundary CRLF in the |
| message, then the returned body should be ``None`` rather than |
| ``b""``. |
| |
| >>> @thesis(given, text_message) |
| ... def body_none(message) -> None: |
| ... boundary = contains_boundary(rfc5322_endings(message)) |
| ... headers, body = rfc6376_split_canon(message) |
| ... assert boundary is (body is not None), repr(message) |
| |
We can perform the canonicalisation steps ourselves. We need to
import ``rfc6376_simple_holistic``, which augments the headers with a
CRLF where necessary when there is no body, or an empty body, but
body canonicalisation synthesizes one.
| |
| >>> from dkim_id import rfc6376_simple_holistic |
| |
| And then DKIM relaxed/simple can be applied consistently. |
| |
| >>> @thesis(given, text_message) |
| ... def manual_canon(message) -> None: |
| ... # uc = uncanonicalised, ec = expected canon, ac = actual canon |
| ... headers_uc, body_uc = rfc6376_split_canon(message) |
| ... headers_ec, body_ec = rfc6376_split_canon(message, |
| ... head_canon=True, body_canon=True) |
| ... headers_ac = rfc6376_relaxed_head(headers_uc) |
| ... headers_ac, body_ac = rfc6376_simple_holistic(headers_ac, body_uc) |
| ... assert headers_ac == headers_ec, repr(message) |
| ... assert body_ac == body_ec, repr(message) |
| |
The header and body canonicalisation steps are optional. Even when
retaining all headers (which is the default) and performing neither
kind of canonicalisation (which is also the default), the input
message is not necessarily the same as the output message, whether or
not RFC 5322 normalisation was performed. This is because, for
example, broken headers, i.e. those without colons, are given a colon
in the process.
| |
| >>> rfc6376_split_canon(b"Key") |
| ([[b'Key', b'']], None) |
| >>> rfc6376_join(*rfc6376_split_canon(b"Key")) |
| b'Key:' |
| |
| |
| Reformation |
| ----------- |
| |
| We call the process of splitting and then joining "reforming". There |
| is a function called ``rfc6376_reformed`` that performs this. |
| |
| >>> from dkim_id import rfc6376_reformed |
| |
| Then ``rfc6376_reformed`` should be exactly equivalent to using |
| ``rfc6376_split`` and then ``rfc6376_join``. |
| |
| >>> @thesis(given, text_message) |
| ... def normal(message) -> None: |
| ... a = rfc6376_join(*rfc6376_split(message)) |
| ... b = rfc6376_reformed(message) |
| ... assert a == b, repr(message) |
| |
| |
| Canonicalised reformation |
| ------------------------- |
| |
| We can use ``rfc6376_reformed_canon`` to canonicalise a message whilst |
| reforming it. This function accepts an additional ``lid`` parameter to |
| specify a list ID, in the RFC 2919 sense, and returns a list ID and |
| the canonicalised message. The output list ID will be an empty bytes |
| object if the input list ID was in any ``List-Id`` header in the input |
| message. |
| |
| >>> from dkim_id import rfc6376_reformed_canon |
| |
| Then if we make our own headers, canonicalise them, and then join |
| them, we should always get a canonicalised message. |
| |
| >>> @thesis(given, headers) |
| ... def more_manual_canon(headers) -> None: |
| ... headers_c = rfc6376_relaxed_head(headers) |
| ... message_c = rfc6376_join(headers_c) |
| ... assert message_c == rfc6376_reformed_canon(message_c, |
| ... head_canon=True, body_canon=False)[1], repr(message_c) |
| |
| |
| Rascals |
| ------- |
| |
| DKIM-ID generation uses the standard ``rfc6376_reformed_canon`` call |
| with ``rfc4871_subset`` headers and both head and body |
| canonicalised. We refer to this combination as *reformed and |
| relaxed/simple canonicalisation*, or just "rascal" for short. The |
| function that performs this is called ``rfc6376_rascal``. Like |
| ``rfc6376_reformed_canon``, this function accepts an additional |
| ``lid`` parameter to specify a list ID, in the RFC 2919 sense, and |
| returns a list ID and the canonicalised message. |
| |
| >>> from dkim_id import rfc6376_rascal |
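
In other words, the rascal of a message should amount to something
like the following sketch, assuming that ``rfc6376_reformed_canon``
accepts the same ``head_subset`` keyword that ``rfc6376_split_canon``
does::

    from dkim_id import rfc4871_subset, rfc6376_reformed_canon

    def rascal_sketch(data: bytes, lid=None):
        # Reform with the RFC 4871 header subset and apply both
        # relaxed head and simple body canonicalisation; the
        # head_subset keyword here is an assumption on our part.
        return rfc6376_reformed_canon(
            data, lid=lid, head_subset=rfc4871_subset,
            head_canon=True, body_canon=True)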
| |
A missing or empty body is encoded, per RFC 6376 simple body
canonicalisation, as CRLF. We always perform body canonicalisation if
``body_canon`` is ``True``, even when there is no body, i.e. when
there was no header and body boundary in the original. The
canonicalised body is therefore always non-empty, and is always
appended by ``rfc6376_join`` after the header and body separator
CRLF. This means that there will always be a header and body boundary
in the rascal output.
| |
| >>> @thesis(given, text_message) |
| ... def rascal_contains_boundary(data) -> None: |
| ... rascal = rfc6376_rascal(data)[1] |
| ... assert contains_boundary(rascal), repr(data) |
| |
| In particular, it means that the empty input document will become CRLF |
| CRLF, which is the header and body separator CRLF followed by the |
| canonicalised empty body CRLF. Two CRLFs, but with completely |
| different roles. |
| |
| >>> rfc6376_rascal(b"") |
| (b'', b'\r\n\r\n') |
| |
And, because trailing CRs or LFs are RFC 5322 ending normalised and
then canonicalised to a single CRLF, any sequence of CRs or LFs will
be rascaled to CRLF CRLF too.
| |
| >>> @thesis(given, text(alphabet="\r\n").map(utf8)) |
| ... def normal_crlfs_to_crlf2(data) -> None: |
| ... rascal = rfc6376_rascal(data)[1] |
| ... assert rascal == b"\r\n\r\n", repr(data) |
| |
| Since the input is considered to be a message, arbitrary text without |
| metacharacters will usually be regarded as a discardable header field. |
| |
| >>> rfc6376_rascal(b"Text") |
| (b'', b'\r\n\r\n') |
| |
| This is true even when colon is included, as long as the prefix is not |
| one of the standard header field names in ``rfc4871_subset``. |
| |
| >>> rfc6376_rascal(b"Discarded: Value") |
| (b'', b'\r\n\r\n') |
| |
| But if the header is in the subset, it will indeed be retained. In |
| this case, holistic canonicalisation ensures that CRLF is appended to |
| the header too. |
| |
| >>> rfc6376_rascal(b"To: Recipient") |
| (b'', b'to:Recipient\r\n\r\n\r\n') |
| |
| In other words this is a header field ``b'to:Recipient\r\n'``, |
| followed by a CRLF header and body boundary, followed by the CRLF of |
| the canonicalised missing body. |
| |
| If there is no header value for a subset header, then it is treated as |
| if the header value were empty. |
| |
| >>> rfc6376_rascal(b"To") |
| (b'', b'to:\r\n\r\n\r\n') |
| >>> rfc6376_rascal(b"To:") |
| (b'', b'to:\r\n\r\n\r\n') |
| |
| RFC 6376 says that canonicalisation should, obviously, come before |
| signing. |
| |
| Canonicalization simply prepares the email for presentation to the |
| signing or verification algorithm. |
| |
| https://tools.ietf.org/html/rfc6376#section-3.4 |
| |
| But a more subtle consequence of this is that subsetting headers also |
| comes after canonicalisation, because subsetting is not part of |
| canonicalisation - it's part of signing. |
| |
This is important in our expansion of the RFC 6376 algorithm to
cover all inputs because, for example, it means that header field
names with trailing whitespace are treated the same as those without
that whitespace.
| |
| >>> rfc6376_rascal(b"To \n") |
| (b'', b'to:\r\n\r\n\r\n') |
| |
| But a header name with whitespace inside it is not, unlike in the |
| libopendkim algorithm, treated the same as one without whitespace |
| inside it, for reasons already discussed in the documentation of RFC |
| 6376 header canonicalisation step three. |
| |
| >>> rfc6376_rascal(b"T o\n") |
| (b'', b'\r\n\r\n') |
| |
| |
| Header subsetting |
| ----------------- |
| |
We use a subset of headers specified in RFC 4871. We use RFC 4871
even though it was obsoleted by RFC 6376 because the earlier RFC has
a more extensive list of headers, and the later RFC says in any case
that which headers to include depends on the signing environment.
Since DKIM-ID generation does not even include signing, our
requirements are somewhat different anyway.
| |
| >>> from dkim_id import rfc4871_subset |
| |
| Whenever the ``rfc4871_subset`` headers are specified as the subset to |
| be retained, they should indeed be retained in the output of |
| ``rfc6376_rascal``. |
| |
| >>> for k in rfc4871_subset: |
| ... minimal = k + b":\r\n\r\n\r\n" |
| ... assert minimal == rfc6376_rascal(minimal)[1], repr(minimal) |
| |
| Though the subset is loosely called the "RFC 4871 subset", there is |
| one header in ``rfc4871_subset`` which RFC 4871 doesn't recommend: |
| DKIM-Signature itself. |
| |
| >>> b"dkim-signature" in rfc4871_subset |
| True |
| |
| We include the DKIM-Signature header field in the subset of retained |
| headers because then if the sender has signed their message it ought |
| to be reflected in the identifier for that message. It would not have |
made sense for RFC 4871 to recommend that header field for signing
input, because it is itself the signing output! But if, for example,
there were a widely implemented RFC specifying a precursor to DKIM
which was later superseded by DKIM, it is reasonable to assume that
RFC 4871 would have recommended including the output of the precursor
| in the headers to sign, combining the two approaches. Similarly, since |
| DKIM is a precursor to DKIM-ID, DKIM-ID is able to include its output |
| as an input. |
| |
| |
| Custom base32 encoding |
| ---------------------- |
| |
| When we have a canonicalised message with subsetted headers, we take |
| the SHA-256 HMAC digest of that message and then encode a truncated |
| version of it using pibble32, which is base32 with the alphabet ``0-9 |
| b-d f-h j-t v-z``, and remove the padding. |
| |
| >>> from dkim_id import pibble32 |
| |
| The alphabet used means that the pibble32 output is always lowercase, |
| and never contains the letters a, e, i, or u. |
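
One way to picture the encoding is as standard base32 followed by a
character for character translation into the custom alphabet; the
following sketch (ours, not the implementation) agrees with the
examples later in this section::

    from base64 import b32encode

    B32: str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"
    PIBBLE: str = "0123456789bcdfghjklmnopqrstvwxyz"

    def pibble32_sketch(data: bytes) -> str:
        # Encode with the standard alphabet, then map each character
        # onto the custom one; any padding "=" is left untouched.
        table = str.maketrans(B32, PIBBLE)
        return b32encode(data).decode("ascii").translate(table)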
| |
| We need the binary producer from hypothesis. |
| |
| >>> from hypothesis.strategies import binary |
| |
| And then we can test these general properties. |
| |
| >>> @thesis(given, binary()) |
| ... def pibble32_general(data) -> None: |
| ... encoded = pibble32(data) |
| ... assert encoded == encoded.lower(), repr(data) |
| ... encoded_set = set(encoded) |
| ... assert not (encoded_set & {"a", "e", "i", "u"}), repr(data) |
| |
| There may be padding, but only when the data length is not divisible |
| by five. |
| |
| >>> @thesis(given, binary()) |
| ... def pibble32_padding(data) -> None: |
| ... encoded = pibble32(data) |
| ... no_padding = not encoded.endswith("=") |
| ... divisible_by_five = not (len(data) % 5) |
| ... assert no_padding is divisible_by_five, repr(data) |
| |
We strip any padding from the DKIM-ID. Since the digest is truncated
to a fixed width of 160 bits, which is a whole number of five byte
groups, the pibble32 output is in fact exactly 32 characters with no
padding, so the encoding remains unambiguous.
| |
| The length of the pibble32 output will always be the same as when |
| base32 encoding it. |
| |
| >>> @thesis(given, binary()) |
| ... def pibble32_length(data) -> None: |
| ... from base64 import b32encode |
| ... assert len(pibble32(data)) == len(b32encode(data)), repr(data) |
| |
Here are some specific examples:
| |
| >>> pibble32(b"") |
| '' |
| >>> pibble32(b"\x00") |
| '00======' |
| >>> pibble32(b"\x01") |
| '04======' |
| >>> pibble32(b"\x02") |
| '08======' |
| >>> pibble32(b"\xff") |
| 'zw======' |
| >>> pibble32(b"\x00\x00\x00\x00\x00") |
| '00000000' |
| >>> pibble32(b"\x00\x00\x01\x00\x00") |
| '00002000' |
| >>> pibble32(b"\x00\x00\x02\x00\x00") |
| '00004000' |
| >>> pibble32(b"\x00\x00\xff\x00\x00") |
| '000hy000' |
| >>> pibble32(b"\x00\x00\xff\xff\x00") |
| '000hzzr0' |
| >>> pibble32(b"\xff\xff\xff\xff\xff") |
| 'zzzzzzzz' |
| |
| When the input length is divisible by five, the output length is |
| always 8 / 5 of that length. |
| |
| >>> @thesis(given, binary()) |
| ... def pibble32_eight_fifths(data) -> None: |
| ... size = len(data) |
| ... resized = size - (size % 5) |
| ... fives = data[:resized] |
| ... assert len(pibble32(fives)) == (resized * 8 / 5), repr(data) |
| |
And when it's not divisible by five, the input length is effectively
rounded up to the next multiple of five before applying the 8 / 5
ratio.
| |
| This means that 160 bits of input is multiplied by 8 / 5, which gives |
| 256 bits, or 32 bytes, of output. |
| |
| >>> 160 * 8 // 5 |
| 256 |
| >>> 256 // 8 |
| 32 |
| |
| |
| DKIM-ID generation |
| ------------------ |
| |
Once the rascaled version of the message is obtained, it is hashed and
| then pibble32 encoded to form the DKIM-ID. We want to check that the |
| output is pibble32 encoded, at least in that its length is correct and |
| its alphabet is a subset of what is expected. |
| |
| >>> digit = "0123456789" |
| >>> lower = "abcdefghijklmnopqrstuvwxyz" |
| >>> pibble32_alphabet = (set(digit) | set(lower)) - {"a", "e", "i", "u"} |
| |
| We guard against typos in the alphabet by testing expected properties, |
| first by checking the digits. |
| |
| >>> assert len(digit) == 10 |
| >>> assert len(set(digit)) == 10 |
| >>> assert list(digit) == sorted(list(digit)) |
| >>> assert digit.isdigit() |
| |
| Then the lowercase letters. |
| |
| >>> assert len(lower) == 26 |
| >>> assert len(set(lower)) == 26 |
| >>> assert list(lower) == sorted(list(lower)) |
| >>> assert lower.isalpha() |
| |
| And then the whole alphabet. |
| |
| >>> assert len(pibble32_alphabet) == 32 |
| |
| Now we can test the DKIM-ID output, from function ``dkim_id``. |
| |
| >>> from dkim_id import dkim_id |
| |
| By checking that its output is consistent with the pibble32 encoding. |
| |
| >>> @thesis(given, text_message) |
| ... def consistent_output(data) -> None: |
| ... dkimid: str = dkim_id(data) |
| ... assert len(dkimid) == 32, repr(data) |
| ... assert not (set(dkimid) - pibble32_alphabet), repr(data) |
| |
We can also check that the unpibbled output is the same as the
truncated SHA-256 HMAC of the rascal.
| |
| >>> from dkim_id import unpibble32 |
| >>> from hmac import digest as hmac_digest |
| >>> @thesis(given, text_message) |
| ... def check_hash_digest(data) -> None: |
| ... rascal: bytes = rfc6376_rascal(data)[1] |
| ... digest_e: bytes = hmac_digest(b"", rascal, "sha256")[:160 // 8] |
| ... dkimid: str = dkim_id(data) |
| ... digest_a: bytes = unpibble32(dkimid) |
| ... assert digest_a == digest_e, repr(data) |
| |
| And here are some example outputs for some simple messages. |
| |
| >>> dkim_id(b"") |
| '8fgp2do75oqo6qd08vs4p7dpp1gj4vjn' |
| >>> dkim_id(b"To: You") |
| 'wowc4vvd0ftwm0q24106mldg67komfl0' |
| >>> dkim_id(b"To: You\r\n") |
| 'wowc4vvd0ftwm0q24106mldg67komfl0' |
| >>> dkim_id(b"To: You\r\nFrom: Me") |
| 'kf7f6zxt7w7k1h1lhxmg9mxngkl5vbcm' |
| >>> dkim_id(b"To: You\r\nFrom: Me\r\n\r\nBody") |
| 'xx5nf02ptvv92tt73kg7n7o9o5t4ngvd' |
| >>> dkim_id(b"To: You\r\nFrom: Me\r\n\r\nBody\r\n") |
| 'b752nf3njqs9r5qwmrkh3n2s24y7y33g' |
| """ |
| |
| from typing import Dict, List, Optional, Set, Tuple |
| |
| import dkim_id |
| |
| pools: Dict[Tuple[str, bytes], Set[bytes]] = { |
| ("f1s1pkloj6y0pm596wt60w7gm89324f0", b"to:Value\r\n\r\n."): { |
| b"To: Value\r\n\n.", |
| b"To: Value\r\n\r.", |
| b"To: Value\r\n\r\n.", |
| }, |
| ("fd9ycrwzno0xdvdltob6jn5n1544h6jc", b"to:Value\r\nto:Value\r\n\r\n\r\n"): { |
| b"To: Value\r\nTo: Value", |
| b"To: Value\rTo: Value", |
| b"To: Value\nTo: Value", |
| }, |
| ( |
| "kf0vt2oj7n7cbzjvozfos33rxt38mfnm", |
| b"to:Value\r\nfrom:Value (1)\r\n\r\n\r\n", |
| ): { |
| b"To: Value\rFrom: Value (1)", |
| b"To: Value\nFrom: Value (1)", |
| b"To: Value\r\nFrom: Value (1)", |
| }, |
| ("zzkpgj5mho9twf3717k4nbcg9tfjwk1o", b"to:Value\r\n\r\nFrom: Value (2)"): { |
| b"To: Value\n\rFrom: Value (2)", |
| b"To: Value\n\nFrom: Value (2)", |
| b"To: Value\r\rFrom: Value (2)", |
| }, |
| ("hmwmj7zl2g3ry5rd08vygydfg8qnlzm7", b"to:Value\r\n\r\nTo: Value"): { |
| b"To: Value\r\n\nTo: Value", |
| b"To: Value\r\n\rTo: Value", |
| b"To: Value\r\n\r\nTo: Value", |
| }, |
| ("87yos9kwbf43mvthd0gmpo9cf8vsol41", b"to:Value\r\n\r\n.\r\n"): { |
| b"To: Value\r\n\n.\n", |
| b"To: Value\r\n\r.\r", |
| b"To: Value\r\n\n.\r", |
| b"To: Value\r\n\r.\n", |
| b"To: Value\r\n\r\n.\r", |
| b"To: Value\r\n\r\n.\n", |
| }, |
| ("xgxqnwnd3sp7kjshmkojnz2wblq4r4z4", b"to:Value\r\n\r\n\r\n"): { |
| b"To: Value\r\n\r\n\r\n", |
| b"To: Value\n", |
| b"To: Value\r\n\r\n\r\r", |
| b"To: Value\n.", |
| b"To: Value\r\n.\r", |
| b"To: Value\r\n\r\n\r", |
| b"To: Value\r\n.\n", |
| b"To: Value\r\n\n", |
| b"To: Value\r.\n", |
| b"To: Value\r\n\r\n\n", |
| b"To: Value\r\n\r\n\n\n", |
| b"To: Value\r\n", |
| b"To: Value", |
| b"To: Value\n.\n", |
| b"To: Value\r\n\r", |
| b"To: Value\r\n.", |
| b"To: Value\n.\r", |
| b"To: Value\r.\r", |
| b"To: Value\r\n\r\n\n\r", |
| b"To: Value\r", |
| b"To: Value\r.", |
| }, |
| } |
| |
| Parsed = Tuple[List[List[bytes]], bytes] |
| parses: Dict[str, Tuple[Parsed, Parsed]] = { |
| " starts with continuation\nTo: Value\n\nBody": ( |
| ([[b"To", b" Value\r\n"]], b"Body"), |
| ([[b"to", b"Value\r\n"]], b"Body"), |
| ), |
| "To: Value\n Continuation\n\nBody": ( |
| ([[b"To", b" Value\r\n Continuation\r\n"]], b"Body"), |
| ([[b"to", b"Value Continuation\r\n"]], b"Body"), |
| ), |
| "To: Value\n\tTab Continuation\n\nBody": ( |
| ([[b"To", b" Value\r\n\tTab Continuation\r\n"]], b"Body"), |
| ([[b"to", b"Value Tab Continuation\r\n"]], b"Body"), |
| ), |
| "To: Value\n\fNon-Continuation\n\nBody": ( |
| ( |
| [[b"To", b" Value\r\n"], [b"\x0cNon-Continuation\r\n", b""]], |
| b"Body", |
| ), |
| ([[b"to", b"Value\r\n"], [b"\x0cnon-continuation", b""]], b"Body"), |
| ), |
| "To: Value\n\nBody Three LF\n\n\n": ( |
| ([[b"To", b" Value\r\n"]], b"Body Three LF\r\n\r\n\r\n"), |
| ([[b"to", b"Value\r\n"]], b"Body Three LF\r\n"), |
| ), |
| } |
| |
| |
| def doctests() -> None: |
| from doctest import ELLIPSIS, testmod |
| |
| testmod(dkim_id, optionflags=ELLIPSIS) |
| testmod(__import__("generators"), optionflags=ELLIPSIS) |
| testmod(optionflags=ELLIPSIS) |
| |
| |
| def main() -> None: |
| from os import environ |
| |
| from hypothesis import given |
| from hypothesis.strategies import text |
| |
| if "HTML" in environ: |
| import sys |
| |
| import docutils.core |
| |
| data = docutils.core.publish_string(__doc__, writer_name="html") |
| sys.stdout.buffer.write(data) |
| return |
| |
| def libopendkim_normal(text: str) -> str: |
| i: int = 0 |
| j: Optional[int] = None |
| end: int = len(text) - 1 |
| result: List[str] = [] |
| while i <= end: |
| prev: str |
| if j is None: |
                prev = "\x00"
| else: |
| prev = text[j] |
| if (text[i] == "\n") and (prev != "\r"): |
| result.append("\r\n") |
| elif (prev == "\r") and (text[i] != "\n"): |
| result.append("\n") |
| result.append(text[i]) |
| else: |
| result.append(text[i]) |
| i += 1 |
| j = i - 1 |
| if j is not None: |
| if text[j] == "\r": |
| result.append("\n") |
| return "".join(result) |
| |
| # We use text because its alphabet can be constrained |
| @given(text(alphabet="\t\r\n\f:.")) |
| def test_normal_equality(text: str) -> None: |
| a: bytes = bytes(libopendkim_normal(text), "ascii") |
| b: bytes = dkim_id.rfc5322_endings(bytes(text, "ascii")) |
| assert a == b, repr(text) |
| |
| test_normal_equality() |
| |
| pid: str |
| msg: bytes |
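    # Flip this to True to regenerate and print the pools table above.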
| if False: |
| pooled: Dict[Tuple[str, bytes], Set[bytes]] = {} |
| for pid in pools: |
| for msg in pools[pid]: |
| got: str = dkim_id.dkim_id(msg) |
| ras: bytes = dkim_id.rfc6376_rascal(msg)[1] |
| if (got, ras) not in pooled: |
| pooled[(got, ras)] = set() |
| pooled[(got, ras)].add(msg) |
| print("{") |
| for key in sorted(pooled): |
| print(repr(key) + ":", pooled[key], end=",\n") |
| print("}") |
| return |
| for (pid, rascal) in pools: |
| for msg in pools[(pid, rascal)]: |
| pid_: str = dkim_id.dkim_id(msg) |
| rascal_: bytes = dkim_id.rfc6376_rascal(msg)[1] |
| assert pid == pid_, f"{pid} != {pid_}" |
| assert rascal == rascal_, f"{rascal} != {rascal_}" |
| |
| doctests() |
| print("ok") |
| |
| |
| if __name__ == "__main__": |
| main() |