| <?xml version="1.0" encoding="utf-8" ?> |
| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> |
| <head> |
| <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> |
| <meta name="generator" content="Docutils 0.16: http://docutils.sourceforge.net/" /> |
| <title>DKIM-IDs</title> |
| <style type="text/css"> |
| |
| /* |
| :Author: David Goodger (goodger@python.org) |
| :Id: $Id: html4css1.css 7952 2016-07-26 18:15:59Z milde $ |
| :Copyright: This stylesheet has been placed in the public domain. |
| |
| Default cascading style sheet for the HTML output of Docutils. |
| |
| See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to |
| customize this style sheet. |
| */ |
| |
| /* used to remove borders from tables and images */ |
| .borderless, table.borderless td, table.borderless th { |
| border: 0 } |
| |
| table.borderless td, table.borderless th { |
| /* Override padding for "table.docutils td" with "! important". |
| The right padding separates the table cells. */ |
| padding: 0 0.5em 0 0 ! important } |
| |
| .first { |
| /* Override more specific margin styles with "! important". */ |
| margin-top: 0 ! important } |
| |
| .last, .with-subtitle { |
| margin-bottom: 0 ! important } |
| |
| .hidden { |
| display: none } |
| |
| .subscript { |
| vertical-align: sub; |
| font-size: smaller } |
| |
| .superscript { |
| vertical-align: super; |
| font-size: smaller } |
| |
| a.toc-backref { |
| text-decoration: none ; |
| color: black } |
| |
| blockquote.epigraph { |
| margin: 2em 5em ; } |
| |
| dl.docutils dd { |
| margin-bottom: 0.5em } |
| |
| object[type="image/svg+xml"], object[type="application/x-shockwave-flash"] { |
| overflow: hidden; |
| } |
| |
| /* Uncomment (and remove this text!) to get bold-faced definition list terms |
| dl.docutils dt { |
| font-weight: bold } |
| */ |
| |
| div.abstract { |
| margin: 2em 5em } |
| |
| div.abstract p.topic-title { |
| font-weight: bold ; |
| text-align: center } |
| |
| div.admonition, div.attention, div.caution, div.danger, div.error, |
| div.hint, div.important, div.note, div.tip, div.warning { |
| margin: 2em ; |
| border: medium outset ; |
| padding: 1em } |
| |
| div.admonition p.admonition-title, div.hint p.admonition-title, |
| div.important p.admonition-title, div.note p.admonition-title, |
| div.tip p.admonition-title { |
| font-weight: bold ; |
| font-family: sans-serif } |
| |
| div.attention p.admonition-title, div.caution p.admonition-title, |
| div.danger p.admonition-title, div.error p.admonition-title, |
| div.warning p.admonition-title, .code .error { |
| color: red ; |
| font-weight: bold ; |
| font-family: sans-serif } |
| |
| /* Uncomment (and remove this text!) to get reduced vertical space in |
| compound paragraphs. |
| div.compound .compound-first, div.compound .compound-middle { |
| margin-bottom: 0.5em } |
| |
| div.compound .compound-last, div.compound .compound-middle { |
| margin-top: 0.5em } |
| */ |
| |
| div.dedication { |
| margin: 2em 5em ; |
| text-align: center ; |
| font-style: italic } |
| |
| div.dedication p.topic-title { |
| font-weight: bold ; |
| font-style: normal } |
| |
| div.figure { |
| margin-left: 2em ; |
| margin-right: 2em } |
| |
| div.footer, div.header { |
| clear: both; |
| font-size: smaller } |
| |
| div.line-block { |
| display: block ; |
| margin-top: 1em ; |
| margin-bottom: 1em } |
| |
| div.line-block div.line-block { |
| margin-top: 0 ; |
| margin-bottom: 0 ; |
| margin-left: 1.5em } |
| |
| div.sidebar { |
| margin: 0 0 0.5em 1em ; |
| border: medium outset ; |
| padding: 1em ; |
| background-color: #ffffee ; |
| width: 40% ; |
| float: right ; |
| clear: right } |
| |
| div.sidebar p.rubric { |
| font-family: sans-serif ; |
| font-size: medium } |
| |
| div.system-messages { |
| margin: 5em } |
| |
| div.system-messages h1 { |
| color: red } |
| |
| div.system-message { |
| border: medium outset ; |
| padding: 1em } |
| |
| div.system-message p.system-message-title { |
| color: red ; |
| font-weight: bold } |
| |
| div.topic { |
| margin: 2em } |
| |
| h1.section-subtitle, h2.section-subtitle, h3.section-subtitle, |
| h4.section-subtitle, h5.section-subtitle, h6.section-subtitle { |
| margin-top: 0.4em } |
| |
| h1.title { |
| text-align: center } |
| |
| h2.subtitle { |
| text-align: center } |
| |
| hr.docutils { |
| width: 75% } |
| |
| img.align-left, .figure.align-left, object.align-left, table.align-left { |
| clear: left ; |
| float: left ; |
| margin-right: 1em } |
| |
| img.align-right, .figure.align-right, object.align-right, table.align-right { |
| clear: right ; |
| float: right ; |
| margin-left: 1em } |
| |
| img.align-center, .figure.align-center, object.align-center { |
| display: block; |
| margin-left: auto; |
| margin-right: auto; |
| } |
| |
| table.align-center { |
| margin-left: auto; |
| margin-right: auto; |
| } |
| |
| .align-left { |
| text-align: left } |
| |
| .align-center { |
| clear: both ; |
| text-align: center } |
| |
| .align-right { |
| text-align: right } |
| |
| /* reset inner alignment in figures */ |
| div.align-right { |
| text-align: inherit } |
| |
| /* div.align-center * { */ |
| /* text-align: left } */ |
| |
| .align-top { |
| vertical-align: top } |
| |
| .align-middle { |
| vertical-align: middle } |
| |
| .align-bottom { |
| vertical-align: bottom } |
| |
| ol.simple, ul.simple { |
| margin-bottom: 1em } |
| |
| ol.arabic { |
| list-style: decimal } |
| |
| ol.loweralpha { |
| list-style: lower-alpha } |
| |
| ol.upperalpha { |
| list-style: upper-alpha } |
| |
| ol.lowerroman { |
| list-style: lower-roman } |
| |
| ol.upperroman { |
| list-style: upper-roman } |
| |
| p.attribution { |
| text-align: right ; |
| margin-left: 50% } |
| |
| p.caption { |
| font-style: italic } |
| |
| p.credits { |
| font-style: italic ; |
| font-size: smaller } |
| |
| p.label { |
| white-space: nowrap } |
| |
| p.rubric { |
| font-weight: bold ; |
| font-size: larger ; |
| color: maroon ; |
| text-align: center } |
| |
| p.sidebar-title { |
| font-family: sans-serif ; |
| font-weight: bold ; |
| font-size: larger } |
| |
| p.sidebar-subtitle { |
| font-family: sans-serif ; |
| font-weight: bold } |
| |
| p.topic-title { |
| font-weight: bold } |
| |
| pre.address { |
| margin-bottom: 0 ; |
| margin-top: 0 ; |
| font: inherit } |
| |
| pre.literal-block, pre.doctest-block, pre.math, pre.code { |
| margin-left: 2em ; |
| margin-right: 2em } |
| |
| pre.code .ln { color: grey; } /* line numbers */ |
| pre.code, code { background-color: #eeeeee } |
| pre.code .comment, code .comment { color: #5C6576 } |
| pre.code .keyword, code .keyword { color: #3B0D06; font-weight: bold } |
| pre.code .literal.string, code .literal.string { color: #0C5404 } |
| pre.code .name.builtin, code .name.builtin { color: #352B84 } |
| pre.code .deleted, code .deleted { background-color: #DEB0A1} |
| pre.code .inserted, code .inserted { background-color: #A3D289} |
| |
| span.classifier { |
| font-family: sans-serif ; |
| font-style: oblique } |
| |
| span.classifier-delimiter { |
| font-family: sans-serif ; |
| font-weight: bold } |
| |
| span.interpreted { |
| font-family: sans-serif } |
| |
| span.option { |
| white-space: nowrap } |
| |
| span.pre { |
| white-space: pre } |
| |
| span.problematic { |
| color: red } |
| |
| span.section-subtitle { |
| /* font-size relative to parent (h1..h6 element) */ |
| font-size: 80% } |
| |
| table.citation { |
| border-left: solid 1px gray; |
| margin-left: 1px } |
| |
| table.docinfo { |
| margin: 2em 4em } |
| |
| table.docutils { |
| margin-top: 0.5em ; |
| margin-bottom: 0.5em } |
| |
| table.footnote { |
| border-left: solid 1px black; |
| margin-left: 1px } |
| |
| table.docutils td, table.docutils th, |
| table.docinfo td, table.docinfo th { |
| padding-left: 0.5em ; |
| padding-right: 0.5em ; |
| vertical-align: top } |
| |
| table.docutils th.field-name, table.docinfo th.docinfo-name { |
| font-weight: bold ; |
| text-align: left ; |
| white-space: nowrap ; |
| padding-left: 0 } |
| |
| /* "booktabs" style (no vertical lines) */ |
| table.docutils.booktabs { |
| border: 0px; |
| border-top: 2px solid; |
| border-bottom: 2px solid; |
| border-collapse: collapse; |
| } |
| table.docutils.booktabs * { |
| border: 0px; |
| } |
| table.docutils.booktabs th { |
| border-bottom: thin solid; |
| text-align: left; |
| } |
| |
| h1 tt.docutils, h2 tt.docutils, h3 tt.docutils, |
| h4 tt.docutils, h5 tt.docutils, h6 tt.docutils { |
| font-size: 100% } |
| |
| ul.auto-toc { |
| list-style-type: none } |
| |
| </style> |
| </head> |
| <body> |
| <div class="document" id="dkim-ids"> |
| <h1 class="title">DKIM-IDs</h1> |
| |
| <p>The recommended Ponymail ID generator is the DKIM-ID generator. It |
| simplifies a message using an algorithm based on DKIM relaxed/simple |
| canonicalisation, hashes it with an SHA-256 HMAC, and then encodes the |
| truncated digest using base32 with the custom alphabet <tt class="docutils literal"><span class="pre">0-9</span> <span class="pre">b-d</span> <span class="pre">f-h</span> |
| <span class="pre">j-t</span> <span class="pre">v-z</span></tt> and the padding stripped.</p> |
| <div class="section" id="dkim-ids-test-suite"> |
| <h1>DKIM-IDs test suite</h1> |
| <p>As well as plain Python doctests, we also use the hypothesis package |
| to check properties of the DKIM-ID generator algorithm. This has the |
| advantage of providing a kind of partial specification as well as |
| testing the code. The suite can be run using:</p> |
| <pre class="literal-block"> |
| PYTHONPATH=../tools python3 dkim_id_test.py |
| </pre> |
| <p>And exported to HTML using docutils and the command:</p> |
| <pre class="literal-block"> |
| HTML=1 PYTHONPATH=../tools \ |
| python3 dkim_id_test.py > dkim_id_test.html |
| </pre> |
| <div class="section" id="rfc5322-line-ending-normalisation"> |
| <h2>RFC5322 line ending normalisation</h2> |
| <p>The first step of generating a DKIM-ID is to convert all line endings |
| of the input to CRLF by upgrading bare CR and LF characters.</p> |
| <blockquote> |
| <p>If the message is submitted to the Signer with any local encoding |
| that will be modified before transmission, that modification to |
| canonical [RFC5322] form MUST be done before signing. In particular, |
| bare CR or LF characters (used by some systems as a local line |
| separator convention) MUST be converted to the SMTP-standard CRLF |
| sequence before the message is signed.</p> |
| <p><a class="reference external" href="https://tools.ietf.org/html/rfc6376#section-5.3">https://tools.ietf.org/html/rfc6376#section-5.3</a></p> |
| </blockquote> |
| <p>We follow the algorithm used in dkim_header in dkim.c in version 2.10 |
| of libopendkim, the implementation of which is this, reformatted for |
| brevity:</p> |
| <pre class="literal-block"> |
| for (p = hdr; p < q && *p != '\0'; p++) { |
| if (*p == '\n' && prev != '\r') { /* bare LF */ |
| dkim_dstring_catn(tmphdr, CRLF, 2); |
| } else if (prev == '\r' && *p != '\n') { /* bare CR */ |
| dkim_dstring_cat1(tmphdr, '\n'); |
| dkim_dstring_cat1(tmphdr, *p); |
| } else { /* other */ |
| dkim_dstring_cat1(tmphdr, *p); |
| } |
| prev = *p; |
| } |
| if (prev == '\r') { /* end CR */ |
| dkim_dstring_cat1(tmphdr, '\n'); |
| } |
| </pre> |
| <p>Our version of this algorithm is called <tt class="docutils literal">rfc5322_endings</tt>.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc5322_endings |
| </pre> |
| <p>It works on bytes and produces bytes.</p> |
| <p>We test properties of the DKIM-ID related functions not by formally |
| proving them, as there are no mainstream frameworks for formal |
| verification of Python (though Nagini may be worth trying), but |
| instead by fuzzing with hypothesis as a property checker.</p> |
| <pre class="doctest-block"> |
| >>> from hypothesis import given |
| >>> from hypothesis.strategies import from_regex as regex, text |
| </pre> |
| <p>The regex producer outputs str instances, and we use it because |
| hypothesis does not allow us to use patterns or other smart generation |
| with only bytes. Therefore we use the smart str generators and then |
| convert the output to bytes using cp1252 or utf-8 encoding as |
| necessary.</p> |
| <pre class="doctest-block"> |
| >>> def cp1252(text: str) -> bytes: |
| ... return bytes(text, "cp1252") |
| >>> def utf8(text: str): |
| ... return bytes(text, "utf-8") |
| </pre> |
| <p>We'll also use our own decorator to make tests run automatically.</p> |
| <pre class="doctest-block"> |
| >>> def thesis(hypo, *args): |
| ... def decorator(func): |
| ... func = hypo(*args)(func) |
| ... func() |
| ... return func |
| ... return decorator |
| </pre> |
| <p>Since <tt class="docutils literal">rfc5322_endings</tt> only converts endings, sequences containing |
| neither CR nor LF are unaffected.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, regex(r"\A[^\r\n]*\Z")) |
| ... def non_cr_lf_unaffected(text: str) -> None: |
| ... data: bytes = utf8(text) |
| ... assert data == rfc5322_endings(data), repr(data) |
| </pre> |
| <p>The algorithm is that any LF not preceded by CR will have a CR |
| inserted before it, and any CR not followed by LF will have an LF |
| inserted after it. Therefore we expect the result to always have the |
| same number of CRs and LFs.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text(alphabet="\r\n.")) |
| ... def cr_lf_same_cardinality(text: str) -> None: |
| ... data: bytes = rfc5322_endings(utf8(text)) |
| ... crs = data.count(b"\r") |
| ... lfs = data.count(b"\n") |
| ... assert crs == lfs, repr(data) |
| </pre> |
| <p>That the number of CRs or LFs will never be reduced.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text(alphabet="\r\n.")) |
| ... def cr_lf_no_reduce(text: str) -> None: |
| ... a: bytes = utf8(text) |
| ... b: bytes = rfc5322_endings(a) |
| ... assert b.count(b"\r") >= a.count(b"\r"), repr(a) |
| ... assert b.count(b"\n") >= a.count(b"\n"), repr(a) |
| </pre> |
| <p>That if we delete all CRLF subsequences, there will be no CR or LFs |
| remaining in the sequence.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text(alphabet="\r\n.")) |
| ... def only_crlf_subsequences(text: str) -> None: |
| ... data: bytes = rfc5322_endings(utf8(text)) |
| ... data = data.replace(b"\r\n", b".") |
| ... assert data.count(b"\r") == 0, repr(data) |
| ... assert data.count(b"\n") == 0, repr(data) |
| </pre> |
| <p>That if we split on CR or LF sequences, the input and output will be |
| the same.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text(alphabet="\r\nabc. ")) |
| ... def non_crlf_subsequences(text: str) -> None: |
| ... def split(data: bytes): |
| ... data = data.replace(b"\r", b"\n") |
| ... while b"\n\n" in data: |
| ... data = data.replace(b"\n\n", b"\n") |
| ... return data.strip(b"\n").split(b"\n") |
| ... data: bytes = utf8(text) |
| ... expected = split(data) |
| ... normed: bytes = rfc5322_endings(data) |
| ... assert split(normed) == expected, repr(data) |
| </pre> |
| <p>And that all of this is equivalent to saying that every CR is now |
| followed by LF and every LF is preceded by CR.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text(alphabet="\r\n.")) |
| ... def cr_and_lf_pairs(text: str) -> None: |
| ... data: bytes = rfc5322_endings(utf8(text)) |
| ... if b"\r" in data: |
| ... datum: bytes |
| ... for datum in data.split(b"\r")[1:]: |
| ... assert datum.startswith(b"\n"), repr(data) |
| ... if b"\n" in data: |
| ... datum: bytes |
| ... for datum in data.split(b"\n")[:-1]: |
| ... assert datum.endswith(b"\r"), repr(data) |
| </pre> |
| <p>Most importantly, the number of CRLFs in the output must be equal to |
| the number of CRLFs in the input, plus the number of individual CRs |
| and LFs once the CRLFs have been removed.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text(alphabet="\r\n.")) |
| ... def crlf_count(text: str) -> None: |
| ... nocrlf = text.replace("\r\n", "") |
| ... expected = text.count("\r\n") |
| ... expected += nocrlf.count("\r") |
| ... expected += nocrlf.count("\n") |
| ... data: bytes = rfc5322_endings(utf8(text)) |
| ... assert data.count(b"\r\n") == expected, repr(text) |
| </pre> |
| <p>We'll now give a few examples. First, with no CR or LF.</p> |
| <pre class="doctest-block"> |
| >>> rfc5322_endings(b"") |
| b'' |
| >>> rfc5322_endings(b"abc") |
| b'abc' |
| </pre> |
| <p>All of the following are equivalent to CRLF.</p> |
| <pre class="doctest-block"> |
| >>> rfc5322_endings(b"\r") |
| b'\r\n' |
| >>> rfc5322_endings(b"\n") |
| b'\r\n' |
| >>> rfc5322_endings(b"\r\n") |
| b'\r\n' |
| </pre> |
| <p>And the following are equivalent to CRLF CRLF.</p> |
| <pre class="doctest-block"> |
| >>> rfc5322_endings(b"\r\r") |
| b'\r\n\r\n' |
| >>> rfc5322_endings(b"\n\n") |
| b'\r\n\r\n' |
| >>> rfc5322_endings(b"\n\r") |
| b'\r\n\r\n' |
| </pre> |
| </div> |
| <div class="section" id="dkim-relaxed-head-canonicalisation"> |
| <h2>DKIM relaxed head canonicalisation</h2> |
| <p>The next important component of DKIM-ID generation is DKIM head |
| canonicalisation using the relaxed canonicalisation algorithm. The |
| algorithm is not trivial, consisting of five separate steps:</p> |
| <blockquote> |
| <ul class="simple"> |
| <li>Convert all header field names (not the header field values) to |
| lowercase. For example, convert "SUBJect: AbC" to "subject: AbC".</li> |
| <li>Unfold all header field continuation lines as described in |
| [RFC5322]; in particular, lines with terminators embedded in |
| continued header field values (that is, CRLF sequences followed by |
| WSP) MUST be interpreted without the CRLF. Implementations MUST |
| NOT remove the CRLF at the end of the header field value.</li> |
| <li>Convert all sequences of one or more WSP characters to a single SP |
| character. WSP characters here include those before and after a |
| line folding boundary.</li> |
| <li>Delete all WSP characters at the end of each unfolded header field |
| value.</li> |
| <li>Delete any WSP characters remaining before and after the colon |
| separating the header field name from the header field value. The |
| colon separator MUST be retained.</li> |
| </ul> |
| <p><a class="reference external" href="https://tools.ietf.org/html/rfc6376#section-3.4.2">https://tools.ietf.org/html/rfc6376#section-3.4.2</a></p> |
| </blockquote> |
| <p>We'll use hypothesis to check each of these properties in turn. The |
| canonicalisation function is called <tt class="docutils literal">rfc6376_relaxed_head</tt>.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc6376_relaxed_head |
| </pre> |
| <p>And to test it, we'll need the lists producer from hypothesis.</p> |
| <pre class="doctest-block"> |
| >>> from hypothesis.strategies import lists |
| >>> chars = text(alphabet="\x00\t\r\n\f .ABCabc\xc0").map(cp1252) |
| >>> headers = lists(lists(chars, min_size=2, max_size=2)) |
| </pre> |
| <div class="section" id="step-one"> |
| <h3>Step one</h3> |
| <p>Step one is to convert header field names only to lowercase. Since |
| other normalisation steps will occur, to test it we need to take only |
| the alphabetical octets.</p> |
| <pre class="doctest-block"> |
| >>> def alphabetical(data: bytes) -> bytes: |
| ... from typing import Set |
| ... upper: bytes = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| ... alpha: Set[int] = set(upper + upper.lower()) |
| ... return bytes([b for b in data if b in alpha]) |
| </pre> |
| <p>Then we can make a direct comparison.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, headers) |
| ... def step_1_field_names_lower(headers) -> None: |
| ... ks = [alphabetical(kv[0]) for kv in headers] |
| ... for i, (k, v) in enumerate(rfc6376_relaxed_head(headers)): |
| ... assert ks[i].lower() == alphabetical(k), repr(headers) |
| </pre> |
| <p>Including that values use the same case.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, headers) |
| ... def step_1_field_values_case(headers) -> None: |
| ... vs = [kv[1] for kv in headers] |
| ... alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| ... cases = set(alpha + alpha.lower()) |
| ... for i, (k, v) in enumerate(rfc6376_relaxed_head(headers)): |
| ... assert (set(vs[i]) & cases) == (set(v) & cases), repr(headers) |
| </pre> |
| </div> |
| <div class="section" id="step-two"> |
| <h3>Step two</h3> |
| <p>Step two is to unfold continuations by removing CRLF except at the |
| end. This would only produce consistent results if the value is in |
| <tt class="docutils literal">rfc5322_endings</tt> normal form, so we extend the step to remove all |
| CR or LF, except for a trailing CRLF in the header field value.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_relaxed_head([[b"", b"\r"]]) |
| [[b'', b'']] |
| >>> rfc6376_relaxed_head([[b"", b"\n"]]) |
| [[b'', b'']] |
| >>> rfc6376_relaxed_head([[b"", b"\r\n"]]) |
| [[b'', b'\r\n']] |
| >>> rfc6376_relaxed_head([[b"", b"...\r"]]) |
| [[b'', b'...']] |
| >>> rfc6376_relaxed_head([[b"", b"...\n"]]) |
| [[b'', b'...']] |
| >>> rfc6376_relaxed_head([[b"", b"...\r\n"]]) |
| [[b'', b'...\r\n']] |
| >>> rfc6376_relaxed_head([[b"", b"a\rb\r\n"]]) |
| [[b'', b'ab\r\n']] |
| >>> rfc6376_relaxed_head([[b"", b"a\nb\r\n"]]) |
| [[b'', b'ab\r\n']] |
| >>> rfc6376_relaxed_head([[b"", b"a\r\nb\r\n"]]) |
| [[b'', b'ab\r\n']] |
| </pre> |
| <p>We do this even though, for example, <tt class="docutils literal">b"a\r\nb\r\n"</tt> is not a |
| possible header field value because the first CRLF is not followed by |
| a space or a tab, meaning that it is not a continuation.</p> |
| <p>We apply the CR and LF removal to header field names too, following |
| libopendkim, although <tt class="docutils literal">rfc6376_relaxed_head</tt> should never encounter |
| CR or LF in a header field name during DKIM-ID generation. The removal |
| of CR and LF in header names includes CRLF at the end of a header |
| field name, unlike in a header field value where trailing CRLF is |
| retained.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_relaxed_head([[b"...\r\n", b""]]) |
| [[b'...', b'']] |
| </pre> |
| <pre class="doctest-block"> |
| >>> header_text = (text(alphabet="\x00\t\r\n\f .ABCabc\xc0") |
| ... .map(cp1252) |
| ... .map(rfc5322_endings)) |
| >>> wild_headers = lists(lists(header_text, min_size=2, max_size=2)) |
| </pre> |
| <p>The <tt class="docutils literal">wild_headers</tt> producer gives us headers which have not been |
| normalised, and can therefore be used to test the extended step, |
| e.g. for CR and LF deletion.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, wild_headers) |
| ... def step_2_cr_lf_deletion(headers) -> None: |
| ... for (k, v) in rfc6376_relaxed_head(headers): |
| ... assert b"\r" not in k, repr(headers) |
| ... assert b"\n" not in k, repr(headers) |
| ... if v.endswith(b"\r\n"): |
| ... v = v[:-2] |
| ... assert b"\r" not in v, repr(headers) |
| ... assert b"\n" not in v, repr(headers) |
| </pre> |
| <p>We can also test that any trailing CRLF in a header field value is |
| retained.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, wild_headers) |
| ... def step_2_field_values_trailing_crlf(headers) -> None: |
| ... vs = [kv[1] for kv in headers] |
| ... for i, (k, v) in enumerate(rfc6376_relaxed_head(headers)): |
| ... a = vs[i].endswith(b"\r\n") |
| ... b = v.endswith(b"\r\n") |
| ... assert a == b, repr(headers) |
| </pre> |
| </div> |
| <div class="section" id="step-three"> |
| <h3>Step three</h3> |
| <p>Step three is to reduce all sequences of spaces or tabs to a single |
| space, i.e. all sequences that match <tt class="docutils literal">[ \t]+</tt> must be replaced with |
| <tt class="docutils literal">" "</tt>. The RFC sounds like it's saying that step three should be |
| applied to both names and values, but may regard the issue as moot |
| since WSP is not allowed in header names according to RFC 5322:</p> |
| <blockquote> |
| <p>[...] A field name MUST be composed of printable US-ASCII characters |
| (i.e., characters that have values between 33 and 126, inclusive), |
| except colon.</p> |
| <p><a class="reference external" href="https://tools.ietf.org/html/rfc5322#section-2.2">https://tools.ietf.org/html/rfc5322#section-2.2</a></p> |
| </blockquote> |
| <p>Since RFC 6376 says to convert to RFC 5322 normal form first, that |
| implies removing all characters outside of the range 33 to 126. It is |
| not clear that ignoring characters out of this range, e.g. converting |
| "T\x00o" to "To", has no detrimental security properties. Neither RFC |
| 4409 section 8 nor RFC 6376 section 3.8 and 8 discuss this issue. The |
| latter simply says that "Signers and Verifiers SHOULD take reasonable |
| steps to ensure that the messages they are processing are valid".</p> |
| <p>In any case, libopendkim also doesn't delete all characters outside |
| the range 33 to 126 in header field names. Instead, it deletes only |
| tab, CR, LF, and space. But RFC 6376 also says in step five to delete |
| "any WSP characters remaining before and after the colon", with |
| "remaining" being the operative word here. This suggests that it did |
| consider the earlier step three to apply to headers too, otherwise the |
| WSP characters would not be "remaining" ones. But if it considered the |
| earlier step three to apply to header field names, then it must also |
| consider that there may be spaces and tabs inside header field names |
| even after RFC 5322 normalisation. Hence, we consider that RFC 6376 is |
| primarily suggesting to apply RFC 5322 <em>line ending</em> normalisation, |
| which notably it introduces by saying "in particular" in section |
| 5.3. We also consider that it suggests reducing spaces and tabs to a |
| single space in step three, answering the question of what to do with |
| "T o" (it remains "T o") and "T\x00o" (it remains "T\x00o").</p> |
| <p>In summary, we follow RFC 6376 as literally as possible, contrary to |
| libopendkim in this case, and apply step three to header field names.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_relaxed_head([[b"Spaced \t \t\tKey", b"Value\r\n"]]) |
| [[b'spaced key', b'Value\r\n']] |
| </pre> |
| <p>With this, <tt class="docutils literal">rfc6376_relaxed_head</tt> accepts arbitrary bytes for names |
| and values, and deals with them in a consistent and considered way, |
| including tab, space, and other values outside 33 to 126. This also |
| includes retaining colon and semicolon, even though they are |
| problematic in DKIM signing.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_relaxed_head([[b":", b"Value\r\n"]]) |
| [[b':', b'Value\r\n']] |
| >>> rfc6376_relaxed_head([[b";", b"Value\r\n"]]) |
| [[b';', b'Value\r\n']] |
| </pre> |
| <p>In the component of the DKIM-ID generator which uses header |
| canonicalisation it's impossible for it to have colon in the header |
| name, but it is possible for it to have semicolon. Such a header could |
| not be signed using DKIM as it uses semicolon as the separator in the |
| list of headers which have been signed, but it will be ignored in |
| DKIM-ID generation as long as the defaults are followed or <tt class="docutils literal">";"</tt> is |
| not manually specified as a subset header to keep. Another problematic |
| header which is possible is the empty header. The case of a header |
| name starting with WSP also doesn't arise, because such lines are |
| continuation lines.</p> |
| <p>Overall, there should never be a tab in canonicalised header field |
| names and values, and there should never be a double space in |
| canonicalised header field names and values.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, wild_headers) |
| ... def step_3_field_values(headers) -> None: |
| ... for (k, v) in rfc6376_relaxed_head(headers): |
| ... assert b"\t" not in k, repr(headers) |
| ... assert b"\t" not in v, repr(headers) |
| ... assert b"  " not in k, repr(headers) |
| ... assert b"  " not in v, repr(headers) |
| </pre> |
| <p>Internally, the function that performs this step is called |
| <tt class="docutils literal">rfc6376_shrink_head</tt>.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc6376_shrink_head |
| </pre> |
| <p>And it should work like a more efficient version of iteratively |
| removing double spaces, except that it also strips leading and |
| trailing whitespace, which is for steps four and five.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, wild_headers) |
| ... def step_3_reduce_iterative(headers) -> None: |
| ... for (k, v) in headers: |
| ... kk = k.replace(b"\t", b" ") |
| ... vv = v.replace(b"\t", b" ") |
| ... while b"  " in kk: |
| ... kk = kk.replace(b"  ", b" ") |
| ... kk = kk.strip(b" ") |
| ... while b"  " in vv: |
| ... vv = vv.replace(b"  ", b" ") |
| ... vv = vv.strip(b" ") |
| ... assert rfc6376_shrink_head(k) == kk, repr(k) |
| ... assert rfc6376_shrink_head(v) == vv, repr(v) |
| </pre> |
| <p>This also means that leading whitespace is removed from the beginnings |
| of header names. Again this is not a case which could occur during |
| DKIM-ID generation, in this case because such a name would have been |
| regarded as a continuation, even at the beginning of a message where |
| it is regarded as the continuation of the empty name.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_relaxed_head([[b" Key", b"Value\r\n"]]) |
| [[b'key', b'Value\r\n']] |
| </pre> |
| </div> |
| <div class="section" id="step-four"> |
| <h3>Step four</h3> |
| <p>Step four says that spaces and tabs at the end of a header field value |
| are removed.</p> |
| <p>It is possible to give a header field value without a trailing CRLF to |
| <tt class="docutils literal">rfc6376_relaxed_head</tt>, and so any trailing tabs or spaces there |
| must be removed.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_relaxed_head([[b"", b"Value\t "]]) |
| [[b'', b'Value']] |
| </pre> |
| <p>But the RFC 5322 message grammar states that all headers shall end |
| with CRLF. An overly literal reading of RFC 6376 therefore implies |
| that spaces and tabs are never removed from the end of a field value, |
| because the value must always end with CRLF according to RFC 5322. But |
| if they were never removed then there would be no need for the step, |
| so the implication is that the "end" for the purposes of this step is |
| before the trailing CRLF.</p> |
| <p>A reading of <tt class="docutils literal">dkim_canon_header_string</tt> in libopendkim suggests that |
| it could leave a header ending with space CRLF, but this hasn't been |
| tested. We remove the space correctly.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_relaxed_head([[b"Key", b"Value \r\n"]]) |
| [[b'key', b'Value\r\n']] |
| </pre> |
| <p>Indeed, a header field value must never end with space or tab.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, wild_headers) |
| ... def step_4_field_values_ends(headers) -> None: |
| ... for (k, v) in rfc6376_relaxed_head(headers): |
| ... assert not v.endswith(b" "), repr(headers) |
| ... assert not v.endswith(b"\t"), repr(headers) |
| </pre> |
| <p>And must never end with space CRLF or tab CRLF.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, wild_headers) |
| ... def step_4_field_values_ends_2(headers) -> None: |
| ... for (k, v) in rfc6376_relaxed_head(headers): |
| ... assert not v.endswith(b" \r\n"), repr(headers) |
| ... assert not v.endswith(b"\t\r\n"), repr(headers) |
| </pre> |
| <p>Indeed, it should never be possible for a value to contain, let alone |
| end with, a tab after step three, since that step replaces all sequences |
| of spaces and tabs with a single space, leaving no tabs at all in the |
| output before it reaches step four.</p> |
| </div> |
| <div class="section" id="step-five"> |
| <h3>Step five</h3> |
| <p>Step five is to remove spaces and tabs from the end of header names, |
| and from the start of header values. Again, all tabs should have been |
| removed anyway in step three, so this step could have specified only |
| removing spaces.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, wild_headers) |
| ... def step_5_wsp_around_colon(headers) -> None: |
| ... for (k, v) in rfc6376_relaxed_head(headers): |
| ... assert not k.endswith(b" "), repr(headers) |
| ... assert not k.endswith(b"\t"), repr(headers) |
| ... assert not v.startswith(b" "), repr(headers) |
| ... assert not v.startswith(b"\t"), repr(headers) |
| </pre> |
| </div> |
| <div class="section" id="general-properties"> |
| <h3>General properties</h3> |
| <p>We can combine headers in order to check their size.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc6376_join |
| </pre> |
| <p>This can be used to test one of the general properties of |
| <tt class="docutils literal">rfc6376_relaxed_head</tt>, that it never enlarges the data given to it.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, wild_headers) |
| ... def head_never_enlarged(headers) -> None: |
| ... a: bytes = rfc6376_join(headers) |
| ... h: List[List[bytes]] = rfc6376_relaxed_head(headers) |
| ... b: bytes = rfc6376_join(h) |
| ... assert len(a) >= len(b), repr(headers) |
| </pre> |
| <p>Perhaps the most important general property of canonicalisation is |
| that once canonicalised, attempting to canonicalise again produces the |
| same data. In other words canonicalisation is absolute, and data |
| cannot be canonicalised further.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, wild_headers) |
| ... def recanonicalisation_is_identity(headers) -> None: |
| ... a = rfc6376_relaxed_head(headers) |
| ... b = rfc6376_relaxed_head(a) |
| ... assert a == b, repr(headers) |
| </pre> |
| </div> |
| </div> |
| <div class="section" id="simple-body-canonicalisation"> |
| <h2>Simple body canonicalisation</h2> |
| <p>The body canonicalisation function is called <tt class="docutils literal">rfc6376_simple_body</tt>.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc6376_simple_body |
| </pre> |
| <p>It maps an empty body to CRLF, and then ensures that there is at most |
| one CRLF at the end of the body. Therefore, a consequence is that it |
| ensures that the output is never empty.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, chars) |
| ... def body_not_empty(body) -> None: |
| ... body_c = rfc6376_simple_body(body) |
| ... assert len(body_c) > 0, repr(body) |
| </pre> |
| <p>And that the output never ends CRLF CRLF.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, chars) |
| ... def body_no_trailing_crlfcrlf(body) -> None: |
| ... body_c = rfc6376_simple_body(body) |
| ... assert not body_c.endswith(b"\r\n\r\n") > 0, repr(body) |
| </pre> |
| <p>But it could end non-CR LF CRLF, or CR CRLF if the input were not RFC |
| 5322 ending normalised.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_simple_body(b"Non-CR\n\r\n") |
| b'Non-CR\n\r\n' |
| >>> rfc6376_simple_body(b"CR\r\r\n") |
| b'CR\r\r\n' |
| </pre> |
| <p>The function enlarges data only when its input is empty.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, chars.filter(lambda b: b != b"")) |
| ... def body_enlarging_edge(body) -> None: |
| ... body_c = rfc6376_simple_body(body) |
| ... assert len(body_c) <= len(body), repr(body) |
| </pre> |
| <p>The prefix of the output up to any trailing CRLF is shared by the input.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, chars) |
| ... def body_same_prefix(body) -> None: |
| ... body_c = rfc6376_simple_body(body) |
| ... size_c = len(body_c) |
| ... if body_c.endswith(b"\r\n"): |
| ... size_c -= 2 |
| ... assert body[:size_c] == body_c[:size_c], repr(body) |
| </pre> |
| <p>And any remainder must consist solely of CRLFs in both input and output.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, chars) |
| ... def body_suffix_crlfs(body) -> None: |
| ... body_c = rfc6376_simple_body(body) |
| ... size_c = len(body_c) |
| ... if body_c.endswith(b"\r\n"): |
| ... size_c -= 2 |
| ... assert not body[size_c:].replace(b"\r\n", b""), repr(body) |
| ... assert not body_c[size_c:].replace(b"\r\n", b""), repr(body) |
| </pre> |
| </div> |
| <div class="section" id="splitting"> |
| <h2>Splitting</h2> |
| <p>The main parser is called <tt class="docutils literal">rfc6376_split</tt>.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc6376_split |
| </pre> |
| <p>It does not perform canonicalisation. If there is no CRLF header and |
| body boundary separator, then it returns None for the body.</p> |
| <p>Each header field is defined by RFC 5322 as ending with a CRLF, which |
| is part of that header field. Any CRLF following that indicates the |
| start of a body, which may be empty. Therefore, in the case of the |
| empty document there are no headers and no body.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split(b"") |
| ([], None) |
| </pre> |
| <p>In the case of just CRLF there are no headers, since they must contain |
| at least one character before their CRLF. RFC 5322 section 2.2 says |
| that header fields "are lines beginning with a field name, followed by |
| a colon", which implies at least the presence of a colon, and section |
| 3.6.8 says "field-name = 1*ftext" which means the name must include at |
| least one printable character. As there is nothing after the CRLF in |
| the case of just a CRLF, there is an empty body.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split(b"\r\n") |
| ([], b'') |
| </pre> |
| <p>In the case of CRLF CRLF there are no headers, and there is a body |
| which is CRLF.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split(b"\r\n\r\n") |
| ([], b'\r\n') |
| </pre> |
| <p>And then this pattern repeats.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split(b"\r\n\r\n\r\n") |
| ([], b'\r\n\r\n') |
| >>> rfc6376_split(b"\r\n\r\n\r\n\r\n") |
| ([], b'\r\n\r\n\r\n') |
| </pre> |
| <p>When we have a header, a single trailing CRLF is regarded as part of |
| that header. This means that there is no body.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split(b"Key:Value\r\n") |
| ([[b'Key', b'Value\r\n']], None) |
| </pre> |
| <p>But appending another CRLF to that gives an empty body.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split(b"Key:Value\r\n\r\n") |
| ([[b'Key', b'Value\r\n']], b'') |
| </pre> |
| <p>As <tt class="docutils literal">rfc6376_split</tt> does not perform canonicalisation, we have the |
| edge cases of isolated CRs and LFs. There should never be isolated CRs |
| and LFs in DKIM-ID generation because RFC 5322 ending normalisation is |
| applied before splitting, but in such cases where the function is |
| called with isolated CRs and LFs they are considered as header field |
| name or header field value data.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split(b"\r") |
| ([[b'\r', b'']], None) |
| >>> rfc6376_split(b"\n") |
| ([[b'\n', b'']], None) |
| >>> rfc6376_split(b"\n\r\n") |
| ([[b'\n', b'\r\n']], None) |
| >>> rfc6376_split(b"\r\r\n") |
| ([[b'\r', b'\r\n']], None) |
| >>> rfc6376_split(b"\r...\r\n") |
| ([[b'\r...', b'\r\n']], None) |
| >>> rfc6376_split(b"\n...\r\n") |
| ([[b'\n...', b'\r\n']], None) |
| >>> rfc6376_split(b"\n:\n\r\n") |
| ([[b'\n', b'\n\r\n']], None) |
| >>> rfc6376_split(b"\n...:\n...\r\n") |
| ([[b'\n...', b'\n...\r\n']], None) |
| </pre> |
| <p>A header field name without any header field value is just regarded as |
| being the same as one with an empty value.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split(b"Key\r\n\r\n") |
| ([[b'Key', b'\r\n']], b'') |
| >>> rfc6376_split(b"Key:\r\n\r\n") |
| ([[b'Key', b'\r\n']], b'') |
| </pre> |
| <p>For greater consistency with how bodies are handled, the former could |
| have been interpreted as <tt class="docutils literal">[b'Key', None]</tt>, but this would increase |
| the complexity of the code, and lead to the question of where the |
| trailing CRLF ought to be stored.</p> |
| <p>In some cases, one of the mbox formats may accidentally be passed to |
| <tt class="docutils literal">rfc6376_split</tt>, containing a line like this in its headers, usually |
| at the start but potentially later in the headers too:</p> |
| <blockquote> |
| "From MAILER-DAEMON Fri Jul 8 12:08:34 2011"</blockquote> |
| <p>Which would be interpreted as a header field whose name is:</p> |
| <blockquote> |
| "From MAILER-DAEMON Fri Jul 8 12"</blockquote> |
| <p>And which could also collect any following continuation line.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split(b"To:You\r\nFrom Me\r\n More\r\n") |
| ([[b'To', b'You\r\n'], [b'From Me', b'\r\n More\r\n']], None) |
| </pre> |
| <p>This is safe because even after canonicalisation it is not possible to |
| confuse a <tt class="docutils literal">"From "</tt> line with a <tt class="docutils literal">"From:"</tt> header field, unless no |
| text follows the <tt class="docutils literal">"From "</tt> and it is followed by a continuation. If |
| no text follows the <tt class="docutils literal">"From "</tt> then it is not in one of the mbox |
| formats anyway. And if it is followed by a continuation, then |
| interpreting it as a From header field is reasonable.</p> |
| <p>Similarly to a name without a value, a continuation value without a |
| preceding line is treated as though the header field name is empty.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split(b" More\r\n") |
| ([[b'', b' More\r\n']], None) |
| </pre> |
| <p>An alternative to this would be to treat the line itself as a header |
| field name, but then that creates the issue of whether to remove the |
| leading whitespace, and whether to parse a colon in it. It would also |
| make it inconsistent with all other field names, which must not start |
| with a space.</p> |
| <p>The type of the body, the second element of the tuple returned from |
| <tt class="docutils literal">rfc6376_split</tt>, directly correlates to whether the input starts |
| with CRLF or whether CRLF CRLF occurs in the input. If it does so, |
| then we say that the input message contains a header and body |
| boundary.</p> |
| <pre class="doctest-block"> |
| >>> def contains_boundary(data: bytes) -> bool: |
| ... return data.startswith(b"\r\n") or (b"\r\n\r\n" in data) |
| </pre> |
| <p>We use a simple subset of all possible inputs to check this |
| correlation.</p> |
| <pre class="doctest-block"> |
| >>> text_message = (text(alphabet="\x00\t\r\n\f .:ABCabc\xc0") |
| ... .map(cp1252)) |
| </pre> |
| <p>Although <tt class="docutils literal">rfc6376_split</tt> should always take input in RFC 5322 ending |
| normal form, we test without that normal form.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text_message) |
| ... def body_type_correlation(data) -> None: |
| ... headers, body = rfc6376_split(data) |
| ... body_not_none = (body is not None) |
| ... assert contains_boundary(data) is body_not_none, repr(data) |
| </pre> |
| <p>If the input is not RFC 5322 normalised, then CR and LF can appear in |
| header field names, as already demonstrated. Colon, however, should |
| never appear in a header field name.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text_message) |
| ... def no_split_colon(data) -> None: |
| ... headers, body = rfc6376_split(data) |
| ... for (k, v) in headers: |
| ... assert b":" not in k, repr(data) |
| </pre> |
| <p>And if the input is RFC 5322 normalised, then colon, CR, and LF should |
| never appear in header field names.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text_message) |
| ... def no_normal_split_chars(data) -> None: |
| ... data = rfc5322_endings(data) |
| ... headers, body = rfc6376_split(data) |
| ... for (k, v) in headers: |
| ... assert b":" not in k, repr(data) |
| ... assert b"\r" not in k, repr(data) |
| ... assert b"\n" not in k, repr(data) |
| </pre> |
| </div> |
| <div class="section" id="canonicalised-splitting"> |
| <h2>Canonicalised splitting</h2> |
| <p>The version of the main parser which performs canonicalisation is |
| called <tt class="docutils literal">rfc6376_split_canon</tt>.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc6376_split_canon |
| </pre> |
| <p>It takes <tt class="docutils literal">head_subset</tt>, <tt class="docutils literal">head_canon</tt>, and <tt class="docutils literal">body_canon</tt> |
| arguments. The first is a set of bytes, lower case header field names |
| to keep when parsing the headers. If <tt class="docutils literal">head_subset</tt> is None, all |
| headers are retained, which is useful for testing. The second is a |
| boolean of whether to apply <tt class="docutils literal">rfc6376_relaxed_head</tt>, and the third is |
| a boolean of whether to apply <tt class="docutils literal">rfc6376_simple_body</tt> and potentially |
| modify the headers too for consistency.</p> |
| <p>If there was no body, i.e. no header body boundary CRLF in the |
| message, then the returned body should be <tt class="docutils literal">None</tt> rather than |
| <tt class="docutils literal">b""</tt>.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text_message) |
| ... def body_none(message) -> None: |
| ... boundary = contains_boundary(rfc5322_endings(message)) |
| ... headers, body = rfc6376_split_canon(message) |
| ... assert boundary is (body is not None), repr(message) |
| </pre> |
| <p>We can perform the canonicalisation steps ourselves. We need to import |
| <tt class="docutils literal">rfc6376_simple_holistic</tt>, which ensures that headers are augmented |
| with CRLF if necessary when there is either no body or an empty body |
| but body canonicalisation synthesizes one.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc6376_simple_holistic |
| </pre> |
| <p>And then DKIM relaxed/simple can be applied consistently.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text_message) |
| ... def manual_canon(message) -> None: |
| ... # uc = uncanonicalised, ec = expected canon, ac = actual canon |
| ... headers_uc, body_uc = rfc6376_split_canon(message) |
| ... headers_ec, body_ec = rfc6376_split_canon(message, |
| ... head_canon=True, body_canon=True) |
| ... headers_ac = rfc6376_relaxed_head(headers_uc) |
| ... headers_ac, body_ac = rfc6376_simple_holistic(headers_ac, body_uc) |
| ... assert headers_ac == headers_ec, repr(message) |
| ... assert body_ac == body_ec, repr(message) |
| </pre> |
| <p>The header and body canonicalisation steps are optional. Even when |
| retaining all headers (which is the default) and performing neither |
| kind of canonicalisation (which is also the default), the input |
| message is not necessarily the same as the output message, whether RFC |
| 5322 normalisation were performed or not. This is because, for |
| example, the construction of broken headers, i.e. those without |
| colons, is fixed in the process.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_split_canon(b"Key") |
| ([[b'Key', b'']], None) |
| >>> rfc6376_join(*rfc6376_split_canon(b"Key")) |
| b'Key:' |
| </pre> |
| </div> |
| <div class="section" id="reformation"> |
| <h2>Reformation</h2> |
| <p>We call the process of splitting and then joining "reforming". There |
| is a function called <tt class="docutils literal">rfc6376_reformed</tt> that performs this.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc6376_reformed |
| </pre> |
| <p>Then <tt class="docutils literal">rfc6376_reformed</tt> should be exactly equivalent to using |
| <tt class="docutils literal">rfc6376_split</tt> and then <tt class="docutils literal">rfc6376_join</tt>.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text_message) |
| ... def normal(message) -> None: |
| ... a = rfc6376_join(*rfc6376_split(message)) |
| ... b = rfc6376_reformed(message) |
| ... assert a == b, repr(message) |
| </pre> |
| </div> |
| <div class="section" id="canonicalised-reformation"> |
| <h2>Canonicalised reformation</h2> |
| <p>We can use <tt class="docutils literal">rfc6376_reformed_canon</tt> to canonicalise a message whilst |
| reforming it. This function accepts an additional <tt class="docutils literal">lid</tt> parameter to |
| specify a list ID, in the RFC 2919 sense, and returns a list ID and |
| the canonicalised message. The output list ID will be an empty bytes |
| object if the input list ID was in any <tt class="docutils literal"><span class="pre">List-Id</span></tt> header in the input |
| message.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc6376_reformed_canon |
| </pre> |
| <p>Then if we make our own headers, canonicalise them, and then join |
| them, we should always get a canonicalised message.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, headers) |
| ... def more_manual_canon(headers) -> None: |
| ... headers_c = rfc6376_relaxed_head(headers) |
| ... message_c = rfc6376_join(headers_c) |
| ... assert message_c == rfc6376_reformed_canon(message_c, |
| ... head_canon=True, body_canon=False)[1], repr(message_c) |
| </pre> |
| </div> |
| <div class="section" id="rascals"> |
| <h2>Rascals</h2> |
| <p>DKIM-ID generation uses the standard <tt class="docutils literal">rfc6376_reformed_canon</tt> call |
| with <tt class="docutils literal">rfc4871_subset</tt> headers and both head and body |
| canonicalised. We refer to this combination as <em>reformed and |
| relaxed/simple canonicalisation</em>, or just "rascal" for short. The |
| function that performs this is called <tt class="docutils literal">rfc6376_rascal</tt>. Like |
| <tt class="docutils literal">rfc6376_reformed_canon</tt>, this function accepts an additional |
| <tt class="docutils literal">lid</tt> parameter to specify a list ID, in the RFC 2919 sense, and |
| returns a list ID and the canonicalised message.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc6376_rascal |
| </pre> |
| <p>A missing or empty body is encoded, per RFC 6376 simple body |
| canonicalisation, as CRLF. We always perform body canonicalisation if |
| <tt class="docutils literal">body_canon</tt> is <tt class="docutils literal">True</tt>, which means that even if there is no body |
| (i.e. there was no header and body boundary in the original) there |
| will always be body canonicalisation, which means that the body will |
| always be non-empty, and will always be appended by <tt class="docutils literal">rfc6376_join</tt> |
| after the header and body separator CRLF. This means that there will |
| always be a header and body boundary in the rascal output.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text_message) |
| ... def rascal_contains_boundary(data) -> None: |
| ... rascal = rfc6376_rascal(data)[1] |
| ... assert contains_boundary(rascal), repr(data) |
| </pre> |
| <p>In particular, it means that the empty input document will become CRLF |
| CRLF, which is the header and body separator CRLF followed by the |
| canonicalised empty body CRLF. Two CRLFs, but with completely |
| different roles.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_rascal(b"") |
| (b'', b'\r\n\r\n') |
| </pre> |
| <p>And, because trailing CRs or LFs are RFC 5322 ending normalised and |
| then canonicalised to a single CRLF, it means that any sequence of CRs |
| or LFs will be rascaled to CRLF CRLF too.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text(alphabet="\r\n").map(utf8)) |
| ... def normal_crlfs_to_crlf2(data) -> None: |
| ... rascal = rfc6376_rascal(data)[1] |
| ... assert rascal == b"\r\n\r\n", repr(data) |
| </pre> |
| <p>Since the input is considered to be a message, arbitrary text without |
| metacharacters will usually be regarded as a discardable header field.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_rascal(b"Text") |
| (b'', b'\r\n\r\n') |
| </pre> |
| <p>This is true even when colon is included, as long as the prefix is not |
| one of the standard header field names in <tt class="docutils literal">rfc4871_subset</tt>.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_rascal(b"Discarded: Value") |
| (b'', b'\r\n\r\n') |
| </pre> |
| <p>But if the header is in the subset, it will indeed be retained. In |
| this case, holistic canonicalisation ensures that CRLF is appended to |
| the header too.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_rascal(b"To: Recipient") |
| (b'', b'to:Recipient\r\n\r\n\r\n') |
| </pre> |
| <p>In other words this is a header field <tt class="docutils literal">b'to:Recipient\r\n'</tt>, |
| followed by a CRLF header and body boundary, followed by the CRLF of |
| the canonicalised missing body.</p> |
| <p>If there is no header value for a subset header, then it is treated as |
| if the header value were empty.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_rascal(b"To") |
| (b'', b'to:\r\n\r\n\r\n') |
| >>> rfc6376_rascal(b"To:") |
| (b'', b'to:\r\n\r\n\r\n') |
| </pre> |
| <p>RFC 6376 says that canonicalisation should, obviously, come before |
| signing.</p> |
| <blockquote> |
| <p>Canonicalization simply prepares the email for presentation to the |
| signing or verification algorithm.</p> |
| <p><a class="reference external" href="https://tools.ietf.org/html/rfc6376#section-3.4">https://tools.ietf.org/html/rfc6376#section-3.4</a></p> |
| </blockquote> |
| <p>But a more subtle consequence of this is that subsetting headers also |
| comes after canonicalisation, because subsetting is not part of |
| canonicalisation - it's part of signing.</p> |
| <p>This is important in our expansion of the RFC 6376 algorithm to cover |
| all inputs because e.g. it means that header field names with trailing |
| whitespace are treated the same as without that whitespace.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_rascal(b"To \n") |
| (b'', b'to:\r\n\r\n\r\n') |
| </pre> |
| <p>But a header name with whitespace inside it is not, unlike in the |
| libopendkim algorithm, treated the same as one without whitespace |
| inside it, for reasons already discussed in the documentation of RFC |
| 6376 header canonicalisation step three.</p> |
| <pre class="doctest-block"> |
| >>> rfc6376_rascal(b"T o\n") |
| (b'', b'\r\n\r\n') |
| </pre> |
| </div> |
| <div class="section" id="header-subsetting"> |
| <h2>Header subsetting</h2> |
| <p>We use a subset of headers specified in RFC 4871. We use RFC 4871 even |
| though it was obsoleted by RFC 6376 because the earlier RFC has a more |
| extensive list of headers, and the later RFC says anyway that the |
| choice of which headers to include is a matter of choice dependent on |
| the signing environment. Since DKIM-ID generation does not even |
| include signing, our requirements are somewhat different anyway.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import rfc4871_subset |
| </pre> |
| <p>Whenever the <tt class="docutils literal">rfc4871_subset</tt> headers are specified as the subset to |
| be retained, they should indeed be retained in the output of |
| <tt class="docutils literal">rfc6376_rascal</tt>.</p> |
| <pre class="doctest-block"> |
| >>> for k in rfc4871_subset: |
| ... minimal = k + b":\r\n\r\n\r\n" |
| ... assert minimal == rfc6376_rascal(minimal)[1], repr(minimal) |
| </pre> |
| <p>Though the subset is loosely called the "RFC 4871 subset", there is |
| one header in <tt class="docutils literal">rfc4871_subset</tt> which RFC 4871 doesn't recommend: |
| DKIM-Signature itself.</p> |
| <pre class="doctest-block"> |
| >>> b"dkim-signature" in rfc4871_subset |
| True |
| </pre> |
| <p>We include the DKIM-Signature header field in the subset of retained |
| headers because then if the sender has signed their message it ought |
| to be reflected in the identifier for that message. It would not have |
| made sense for RFC 4871 to recommend that header field for signing |
| input, because it is itself the signing output! But if, for example, |
| there were a widely implemented RFC specifying a precursor to DKIM |
| which was later superseded by DKIM, it is reasonable to assume that |
| RFC 4871 would have recommended including the output of the precursor |
| in the headers to sign, combining the two approaches. Similarly, since |
| DKIM is a precursor to DKIM-ID, DKIM-ID is able to include its output |
| as an input.</p> |
| </div> |
| <div class="section" id="custom-base32-encoding"> |
| <h2>Custom base32 encoding</h2> |
| <p>When we have a canonicalised message with subsetted headers, we take |
| the SHA-256 HMAC digest of that message and then encode a truncated |
| version of it using pibble32, which is base32 with the alphabet <tt class="docutils literal"><span class="pre">0-9</span> |
| <span class="pre">b-d</span> <span class="pre">f-h</span> <span class="pre">j-t</span> <span class="pre">v-z</span></tt>, and remove the padding.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import pibble32 |
| </pre> |
| <p>The alphabet used means that the pibble32 output is always lowercase, |
| and never contains the letters a, e, i, or u.</p> |
| <p>We need the binary producer from hypothesis.</p> |
| <pre class="doctest-block"> |
| >>> from hypothesis.strategies import binary |
| </pre> |
| <p>And then we can test these general properties.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, binary()) |
| ... def pibble32_general(data) -> None: |
| ... encoded = pibble32(data) |
| ... assert encoded == encoded.lower(), repr(data) |
| ... encoded_set = set(encoded) |
| ... assert not (encoded_set & {"a", "e", "i", "u"}), repr(data) |
| </pre> |
| <p>There may be padding, but only when the data length is not divisible |
| by five.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, binary()) |
| ... def pibble32_padding(data) -> None: |
| ... encoded = pibble32(data) |
| ... no_padding = not encoded.endswith("=") |
| ... divisible_by_five = not (len(data) % 5) |
| ... assert no_padding is divisible_by_five, repr(data) |
| </pre> |
| <p>We strip the padding on the DKIM-ID since it is fixed at a width of |
| 128 bits, and the pibble32 output is byte aligned anyway, i.e. the |
| decoder accepts no other padding than "======".</p> |
| <p>The length of the pibble32 output will always be the same as when |
| base32 encoding it.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, binary()) |
| ... def pibble32_length(data) -> None: |
| ... from base64 import b32encode |
| ... assert len(pibble32(data)) == len(b32encode(data)), repr(data) |
| </pre> |
| <p>Here are some specific examples:</p> |
| <pre class="doctest-block"> |
| >>> pibble32(b"") |
| '' |
| >>> pibble32(b"\x00") |
| '00======' |
| >>> pibble32(b"\x01") |
| '04======' |
| >>> pibble32(b"\x02") |
| '08======' |
| >>> pibble32(b"\xff") |
| 'zw======' |
| >>> pibble32(b"\x00\x00\x00\x00\x00") |
| '00000000' |
| >>> pibble32(b"\x00\x00\x01\x00\x00") |
| '00002000' |
| >>> pibble32(b"\x00\x00\x02\x00\x00") |
| '00004000' |
| >>> pibble32(b"\x00\x00\xff\x00\x00") |
| '000hy000' |
| >>> pibble32(b"\x00\x00\xff\xff\x00") |
| '000hzzr0' |
| >>> pibble32(b"\xff\xff\xff\xff\xff") |
| 'zzzzzzzz' |
| </pre> |
| <p>When the input length is divisible by five, the output length is |
| always 8 / 5 of that length.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, binary()) |
| ... def pibble32_eight_fifths(data) -> None: |
| ... size = len(data) |
| ... resized = size - (size % 5) |
| ... fives = data[:resized] |
| ... assert len(pibble32(fives)) == (resized * 8 / 5), repr(data) |
| </pre> |
| <p>And when it's not divisible by five, the input length is first |
| rounded up to the next number divisible by five.</p> |
| <p>This means that 160 bits of input is multiplied by 8 / 5, which gives |
| 256 bits, or 32 bytes, of output.</p> |
| <pre class="doctest-block"> |
| >>> 160 * 8 // 5 |
| 256 |
| >>> 256 // 8 |
| 32 |
| </pre> |
| </div> |
| <div class="section" id="dkim-id-generation"> |
| <h2>DKIM-ID generation</h2> |
| <p>Once the rascaled version of the message is obtained, it is hashed and |
| then pibble32 encoded to form the DKIM-ID. We want to check that the |
| output is pibble32 encoded, at least in that its length is correct and |
| its alphabet is a subset of what is expected.</p> |
| <pre class="doctest-block"> |
| >>> digit = "0123456789" |
| >>> lower = "abcdefghijklmnopqrstuvwxyz" |
| >>> pibble32_alphabet = (set(digit) | set(lower)) - {"a", "e", "i", "u"} |
| </pre> |
| <p>We guard against typos in the alphabet by testing expected properties, |
| first by checking the digits.</p> |
| <pre class="doctest-block"> |
| >>> assert len(digit) == 10 |
| >>> assert len(set(digit)) == 10 |
| >>> assert list(digit) == sorted(list(digit)) |
| >>> assert digit.isdigit() |
| </pre> |
| <p>Then the lowercase letters.</p> |
| <pre class="doctest-block"> |
| >>> assert len(lower) == 26 |
| >>> assert len(set(lower)) == 26 |
| >>> assert list(lower) == sorted(list(lower)) |
| >>> assert lower.isalpha() |
| </pre> |
| <p>And then the whole alphabet.</p> |
| <pre class="doctest-block"> |
| >>> assert len(pibble32_alphabet) == 32 |
| </pre> |
| <p>Now we can test the DKIM-ID output, from function <tt class="docutils literal">dkim_id</tt>.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import dkim_id |
| </pre> |
| <p>By checking that its output is consistent with the pibble32 encoding.</p> |
| <pre class="doctest-block"> |
| >>> @thesis(given, text_message) |
| ... def consistent_output(data) -> None: |
| ... dkimid: str = dkim_id(data) |
| ... assert len(dkimid) == 32, repr(data) |
| ... assert not (set(dkimid) - pibble32_alphabet), repr(data) |
| </pre> |
| <p>We can also check that the unpibbled output is the same as the |
| SHA-256 HMAC of the rascal.</p> |
| <pre class="doctest-block"> |
| >>> from dkim_id import unpibble32 |
| >>> from hmac import digest as hmac_digest |
| >>> @thesis(given, text_message) |
| ... def check_hash_digest(data) -> None: |
| ... rascal: bytes = rfc6376_rascal(data)[1] |
| ... digest_e: bytes = hmac_digest(b"", rascal, "sha256")[:160 // 8] |
| ... dkimid: str = dkim_id(data) |
| ... digest_a: bytes = unpibble32(dkimid) |
| ... assert digest_a == digest_e, repr(data) |
| </pre> |
| <p>And here are some example outputs for some simple messages.</p> |
| <pre class="doctest-block"> |
| >>> dkim_id(b"") |
| '8fgp2do75oqo6qd08vs4p7dpp1gj4vjn' |
| >>> dkim_id(b"To: You") |
| 'wowc4vvd0ftwm0q24106mldg67komfl0' |
| >>> dkim_id(b"To: You\r\n") |
| 'wowc4vvd0ftwm0q24106mldg67komfl0' |
| >>> dkim_id(b"To: You\r\nFrom: Me") |
| 'kf7f6zxt7w7k1h1lhxmg9mxngkl5vbcm' |
| >>> dkim_id(b"To: You\r\nFrom: Me\r\n\r\nBody") |
| 'xx5nf02ptvv92tt73kg7n7o9o5t4ngvd' |
| >>> dkim_id(b"To: You\r\nFrom: Me\r\n\r\nBody\r\n") |
| 'b752nf3njqs9r5qwmrkh3n2s24y7y33g' |
| </pre> |
| </div> |
| </div> |
| </div> |
| </body> |
| </html> |