<?xml version="1.0" encoding="UTF-8"?>
<?oxygen RNGSchema="../../common/schema/DHQauthor-TEI.rng" type="xml"?>
<?oxygen SCHSchema="../../common/schema/dhqTEI-ready.sch"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:cc="http://web.resource.org/cc/"
   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
   xmlns:dhq="http://www.digitalhumanities.org/ns/dhq"
   xmlns:mml="http://www.w3.org/1998/Math/MathML">
   <teiHeader>
      <fileDesc>
         <titleStmt>
            <!--Author should supply the title and personal information-->
            <title type="article" xml:lang="en">Reassessing the locus of normalization in
               machine-assisted collation</title>
            <!--Add a <title> with appropriate @xml:lang for articles in languages other than English-->
            <dhq:authorInfo>
               <!--Include a separate <dhq:authorInfo> element for each author-->
               <dhq:author_name>David J. <dhq:family>Birnbaum</dhq:family>
               </dhq:author_name>
               <dhq:affiliation>University of Pittsburgh</dhq:affiliation>
               <email>djbpitt@gmail.com</email>
               <dhq:bio>
                  <p>David J. Birnbaum is Professor of Slavic Languages and Literatures at the
                     University of Pittsburgh. He has been involved in the study of electronic text
                     technology since the mid-1980s, participates actively in electronic text
                     technology conferences, and has served on the board of the Association for
                     Computers and the Humanities, the editorial board of <title rend="italic"
                        >Markup languages: theory and practice</title>, and the Text Encoding
                     Initiative Technical Council. Much of his electronic text work intersects with
                     his research in medieval Slavic manuscript studies and Russian poetry, but he
                     also often writes about issues in the philosophy of markup.</p>
               </dhq:bio>
            </dhq:authorInfo>
            <dhq:authorInfo>
               <!--Include a separate <dhq:authorInfo> element for each author-->
               <dhq:author_name>Elena <dhq:family>Spadini</dhq:family>
               </dhq:author_name>
               <dhq:affiliation>Université de Lausanne</dhq:affiliation>
               <email>elena.spadini@unil.ch</email>
               <dhq:bio>
                   <p>Elena Spadini is a post-doctoral researcher at the University of Lausanne, where
                     she is developing a digital edition of Gustave Roud’s complete works. Elena
                     holds a PhD in Romance Philology from Sapienza Università di Roma and
                     specialized in Digital Humanities at the École nationale des chartes. From 2014
                     to 2017 she was a Marie Curie fellow in the ITN DiXiT (Digital Scholarly
                     Editions Initial Training) program. Elena has also co-edited a volume on
                     digital scholarly editing and published in international journals about digital
                     editing, Romance philology, textual variation, and manuscripts.</p>
               </dhq:bio>
            </dhq:authorInfo>
         </titleStmt>
         <publicationStmt>
            <publisher>Alliance of Digital Humanities Organizations</publisher>
             <publisher>Association for Computers and the Humanities</publisher>
            <!--This information will be completed at publication-->
            <idno type="DHQarticle-id">000489</idno>
            <idno type="volume">014</idno>
            <idno type="issue">3</idno>
             <date when="2020-09-25">25 September 2020</date>
            <dhq:articleType>article</dhq:articleType>
            <availability>
               <cc:License rdf:about="http://creativecommons.org/licenses/by-nd/2.5/"/>
            </availability>
         </publicationStmt>
         <sourceDesc>
            <p>This is the source</p>
         </sourceDesc>
      </fileDesc>
      <encodingDesc>
         <classDecl>
            <taxonomy xml:id="dhq_keywords">
               <bibl>DHQ classification scheme; full list available at <ref
                     target="http://www.digitalhumanities.org/dhq/taxonomy.xml"
                     >http://www.digitalhumanities.org/dhq/taxonomy.xml</ref>
               </bibl>
            </taxonomy>
            <taxonomy xml:id="authorial_keywords">
               <bibl>Keywords supplied by author; no controlled vocabulary</bibl>
            </taxonomy>
         </classDecl>
      </encodingDesc>
      <profileDesc>
         <langUsage>
            <language ident="en" extent="original"/>
            <!--add <language> with appropriate @ident for any additional languages-->
         </langUsage>
         <textClass>
            <keywords scheme="#dhq_keywords">
               <!--Authors may suggest one or more keywords from the DHQ keyword list, visible at http://www.digitalhumanities.org/dhq/taxonomy.xml; these may be supplemented or modified by DHQ editors-->
               <list type="simple">
                  <item/>
               </list>
            </keywords>
            <keywords scheme="#authorial_keywords">
               <!--Authors may include one or more keywords of their choice-->
               <list type="simple">
                  <item/>
               </list>
            </keywords>
         </textClass>
      </profileDesc>
      <revisionDesc>
         <change when="2020-08-13" who="jmurel">Finished encoding</change>
         <change when="2020-08-11" who="jmurel">Created file</change>
      </revisionDesc>
   </teiHeader>
   <text xml:lang="en" type="original">
      <front>
         <dhq:abstract>
            <!--Include a brief abstract of the article-->
            <p>In this essay we explore the process of textual normalization in the context of
               machine-assisted collation, which is a common operation in digital textual
               scholarship. The Gothenburg modular architecture for computer-assisted collation
               situates normalization as the second stage within a five-stage computational
               pipeline, where it contributes to improving the eventual alignment. In this essay we
               argue that normalization, in fact, contributes not only to the alignment, but also to
               the interpretation of the texts. Furthermore, it occurs at several moments between
               transcription and rendering, and should not be regarded as happening all at once and
               only in one location.</p>
         </dhq:abstract>
         <dhq:teaser>
            <!--Include a brief teaser, no more than a phrase or a single sentence-->
             <p>Examines how the Gothenburg model of machine-assisted collation contributes to the
               interpretation of text data</p>
         </dhq:teaser>
      </front>
      <body>
         <div>
            <head>Introduction</head>
            <p>In this essay we explore the process of textual normalization in the context of
               machine-assisted collation, which is a common operation in digital textual
               scholarship. Collation, that is, the alignment of versions of (traditionally called
                  <q>witnesses to</q>) the same work, is used by scholars for studying the textual
               transmission or the genetic process of a work, often as a step in the preparation of
               a scholarly edition of that work. Machine-assisted collation refers to the use of
               computers for performing, in full or in part, this alignment. As such, it has a long
               history, as long as that of the Digital Humanities <ptr target="#nury2020"/>: over
               the past sixty years, not only have the machines at our disposal changed, but so have
               scholars’ understanding of the collation process, and the most refined computational
               model of collation available today is the one devised in 2009 in Gothenburg by the
               developers of the collation software Juxta and CollateX and by the textual scholars
               of the COST Action Open Scholarly Communities on the Web and of the program
               Interedition <ptr target="#dekker2011"/>
               <ptr target="#dekker2015"/>
               <ptr target="#bleeker2017"/>
               <ptr target="#interedition"/>. The Gothenburg modular architecture for
               computer-assisted collation is a five-stage computational pipeline, in which <hi
                  rend="bold">Normalization</hi> constitutes the optional second stage, where it
                contributes to improving the alignment.<note>We capitalize and embolden <hi rend="bold">Normalization</hi>
                  when we refer to the second step in the Gothenburg model. When we refer to the
                  general concept of normalization (that is, of removing what the researcher
                  considers insignificant variation during the editorial process), independently of
                  the Gothenburg model, we render the word without special capitalization or other
                  formatting.</note>
            </p>
            <p>In this essay we argue that normalization contributes not only to the alignment, but
               also to the interpretation of the texts. Furthermore, it occurs at several moments of
               the collation process, between transcription and rendering, and should not be
               regarded as happening all at once and only in one location. Our point is not to
               question the modeling of collation as a computational pipeline, which has led to
               impressive, productive, and influential results, but to explore the complexity that
               is elided in the simpler description of <hi rend="bold">Normalization</hi> in the
               Gothenburg model, as well as the consequences of that complexity for conceptualizing
               and implementing a collation project. In fact, for reasons that we explore in detail
               below, an awareness of how pervasive normalization might be in machine-assisted
               collation is fundamental to devising a computational workflow that corresponds
               appropriately to the scholarly requirements of each project. The relationship between
               our understanding of normalization and our workflow is valid not only for researchers
               relying on (software that is organized according to) the Gothenburg model, but, more
               generally, as something to be taken into account by those dealing with
               machine-assisted collation. That is, the Gothenburg model offers a foundation for
               discussion, but our argument about the importance and ubiquity of normalization is
               not dependent on it. Furthermore, as often happens with the computational counterpart
               of a scholarly practice, machine-assisted collation tends to make explicit the
               assumptions implicit in manual collation: the study proposed here is largely
               applicable to collation in general, whether some operations are performed mentally or
               computationally. For this reason, the essay is addressed not only to textual
               scholars, but also to developers, who can find here a thorough analysis of the
               complex procedures implied in machine-assisted collation, with a particular focus on
               the different kinds of normalization that happen throughout the process.</p>
            <p>The essay first lays out preliminary remarks about what normalization is, its
               manifold forms and purposes, and provides an overview of the Gothenburg model. That
               model supplies the architecture for what follows: for each stage of the model, the
               potential role of normalization is discussed and illustrated with examples. In the
               conclusions, we summarize how this study contributes to a reconsideration of both the
               role and significance of normalization in collation and digital textual scholarship
               and, more narrowly, of the Gothenburg model of machine-assisted collation.</p>
            <div>
               <head>Normalization</head>
               <p>In the context of machine-assisted collation as formalized by the Gothenburg
                  model, described below, <hi rend="bold">Normalization</hi> refers to the creation
                  and use of shadow copies of word tokens when performing alignment. A common and
                  simple type of normalization is case-folding. For example, although <q>cat</q> and
                     <q>Cat</q> are not string-equal, collation processing might create a
                  lower-cased shadow copy of each word in the text before performing the comparisons
                  needed for alignment. As a result, although the output might retain the original
                  casing, the alignment engine would use the shadow copies for comparison, and would
                  therefore recognize that the difference between <q>cat</q> and <q>Cat</q> is not
                  significant for alignment purposes.</p>
                <p>In this essay we use <hi rend="bold">Normalization</hi> (capitalized and emboldened)
                  to refer to the second, optional, stage of the Gothenburg model, in which the
                  differences in witness tokens that should be ignored for alignment purposes are
                  neutralized, or <q>normalized,</q> prior to alignment. We use normalization
                  (without special capitalization or other formatting) to refer to normalization as
                  a general concept, present in many forms distributed over several stages of the
                  pipeline, and affecting not only Stage 2 (<hi rend="bold">Normalization</hi>), but
                  also, in particular, Stage 0 (<hi rend="bold">Transcription</hi>), Stage 1 (<hi
                     rend="bold">Tokenization</hi>), and Stage 4 (<hi rend="bold">Analysis</hi>).
                  The distinction is instrumental for the main argument of this essay, anticipated
                  above, which is that normalization does not occur only at the <hi rend="bold"
                     >Normalization</hi> stage, and it is, instead, pervasive in machine-assisted
                  collation.</p>
               <p>Normalization can be performed at different textual levels and it affects many
                  types of textual variation. For example, in addition to the case-folding described
                  above, researchers might wish to neutralize the <hi rend="bold">graphemic</hi>
                  distinction between Latin alphabet regular <q>s</q> (U+0073 LATIN SMALL LETTER S)
                  and long <q>ſ</q> (U+017F LATIN SMALL LETTER LONG S). <hi rend="bold"
                     >Orthographic</hi> variation may also transcend the graphemic level, as in the
                  orthographic distinction between US English <q>color</q> and British
                     <q>colour,</q> or the logographic replacement, rooted in homonymy, of the
                  English preposition <q>to</q> by the digit <q>2</q> in a telegraphic communicative
                  style popular on social networks. <hi rend="bold">Morphological</hi> variation
                  includes differences in categories like tense, gender, number, and others; for
                  example, a collation operation might want to recognize that different inflected
                  forms of the same lexeme (such as English <q>is</q> and <q>are,</q> which have no
                  characters in common) may be textual variants, and are therefore potential
                  candidates for alignment. <hi rend="bold">Lexical</hi> variants might be
                  identified by semantic features; for example, if one manuscript witness reads
                     <q>books and magazines</q> and another reads <q>journals,</q> we might want to
                  align <q>journals</q> with <q>magazines,</q> rather than with <q>books,</q>
                  because <q>journals</q> is semantically closer to <q>magazines</q> than it is to
                     <q>books.</q> For multilingual alignment of witnesses in languages with greatly
                  different orthographies (such as Greek [Greek script] and Old Church Slavonic
                  [Glagolitic or Cyrillic script]), normalization might take the form of recording
                  only the part of speech in the shadow copy, since in this research context, part
                  of speech is a better predictor of the alignment of variants than any modification
                   of the orthographic strings <ptr target="#birnbaum2018"/>.</p>
               <p>Normalization, in a nutshell, makes it possible to identify phenomena on multiple
                  orthographic and linguistic levels and use them to create surrogates for the
                  literal word tokens that then shape and interpret the results of collation,
                  tacitly neutralizing and ignoring other distinctions that are present in the
                  literal tokens. During the process of normalization, an original form is replaced
                  (or, rather, shadowed, since the original form is typically retained and is
                  available for use in the eventual rendered output) by a normalized form. The point
                  of this deliberate neutralization is that different original forms that are
                  normalized (or, in computational terms, that <term>hash</term>) to the same value
                  are deemed to be the same at a certain point of the process of machine-assisted
                  collation. </p>
            </div>
            <div>
               <head>Machine-assisted collation: the Gothenburg model</head>
               <p>Within a computational pipeline, the output of one process becomes the input to
                  the next, much as water may flow through plumbing that is constructed by
                  concatenating small pieces of piping <ptr target="#mcilroy1964"/>.<note>In the
                     most common case, the entire output of one process becomes the entire input
                     into the next, although Unix piping also supports tee connections, which send
                     the same output to two destinations, typically stdout and a specified file or
                     variable.</note> Under the Gothenburg model, the digitized text of each witness
                  is piped first through <hi rend="bold">Tokenization</hi> (Stage 1) and then
                  through <hi rend="bold">Normalization</hi> (Stage 2), at which point it is joined
                  with all other witnesses to constitute the complex input into <hi rend="bold"
                     >Alignment</hi> (Stage 3). The single output of <hi rend="bold">Alignment</hi>
                  is then piped through <hi rend="bold">Analysis</hi> (Stage 4) and <hi rend="bold"
                     >Visualization</hi> (Stage 5) to generate the eventual output of the collation
                  operation. This is illustrated in Figure 1, below, where three witnesses are
                  collated, and Stage 2 (<hi rend="bold">Normalization</hi>) is highlighted in
                     green.<note>The dotted line indicates that Stage 4 (<hi rend="bold"
                        >Analysis</hi>) may pipe its output into another Stage 3 (<hi rend="bold"
                        >Alignment</hi>) process, representing a situation where <hi rend="bold"
                        >Analysis</hi> may require a new <hi rend="bold">Alignment</hi>, which may
                     or may not be subjected to additional <hi rend="bold">Analysis</hi> (or
                     additional cycles of <hi rend="bold">Analysis</hi> and <hi rend="bold"
                        >Alignment</hi>) before visualization. As <ref target="#dekker2011"
                        >Haentjens Dekker and Middell (2011)</ref> write: <quote
                        source="#dekker2011" rend="block">an additional analytical step in which the
                        alignment results are augmented (and optionally fed back as preconditions
                        into the collator) appears essential to us in order to bridge the
                        methodological <q>impedance</q> between a plain computational approach and
                        the established hermeneutical <q>best-practice</q> approach to the
                        problem.</quote>
                  </note>
               </p>
               <figure>
                  <head>Gothenburg model.</head>
                  <graphic url="resources/images/figure01.png"/>
               </figure>
               <p>The key contribution of the Gothenburg model to our understanding of collation is
                  its modularization of the process in a way that facilitates customization<note>
                     The adoption of the design principle called <term>separation of concerns</term>
                     in the field of automatic collation dates back to the 1970s. See, e.g., <ref
                        target="#gilbert1973" loc="144">Gilbert (1973), 144</ref>: <quote
                        rend="inline">The program is modular in design, that is, it consists of
                        several steps each doing a simple task.</quote> The importance of this
                     modularity in tool design in general, and not only for collation, is
                     highlighted also in <ref target="#rockwell1999">Rockwell and Bradley
                        (1999)</ref>, section 1.1: <quote rend="block">The unforeseeable nature of
                        original research means that a research tool cannot be extended in a
                        predictable way to meet the demands of research. What is needed are research
                        tools whose every aspect can [be] modified and extended through easily added
                        modules. In other words, to be truly extensible, a tool should be capable of
                        having any component replaced.</quote></note> and is agreed upon by a
                  community of users.<note>The Gothenburg model originated in the context of
                     Interedition, a European-funded collaborative research project whose aim was
                        <quote rend="inline">to encourage the creators of tools for textual
                        scholarship to make their functionality available to others, and to promote
                        communication between scholars so that we can raise awareness of innovative
                        working methods</quote>
                      <ptr target="#interedition"/>.</note> Instead of a monolithic black box that
                  ingests witness input and extrudes a collation as a single process (that is, a
                  degenerate pipeline with a single step), a collation system built according to the
                  Gothenburg model, such as CollateX, can expose hooks that support developer
                  intervention at any stage of the process without a need to interact explicitly
                  with the other stages. For example, a user can replace the default CollateX <hi
                     rend="bold">Normalization</hi> with a customized alternative (see below)
                  without having to touch (or, for the most part, even know about) the
                  implementation of the other Gothenburg stages. As long as the input and output
                  format of each stage is documented, replacing a step in the pipeline is an
                  autonomous operation, that is, one that does not require accommodation elsewhere
                  in the system. In this report we rely for illustrative examples on CollateX, which
                  may be considered the reference implementation of the Gothenburg model.<note>Our
                     examples are drawn from the Python version of the software, which is available
                     at <ref target="https://pypi.python.org/pypi/collatex"
                        >https://pypi.python.org/pypi/collatex</ref>; the Java version is available
                     at <ref target="https://collatex.net/">https://collatex.net</ref>. See also the
                     discussion of the implementation of the Gothenburg model in Juxta at <ref
                        target="http://juxtacommons.org/tech_info"
                        >http://juxtacommons.org/tech_info</ref>. </note>
               </p>
            </div>
         </div>
         <div>
            <head>Discussion</head>
            <p>This essay challenges the characterization of normalization in the Gothenburg model
               as something that happens in a single location in the chain of processes (at Stage 2,
               called <hi rend="bold">Normalization</hi>). Instead, as we illustrate below,
               normalization may be performed in four different stages within the overall collation
               process, not only to improve the alignment, but also to annotate the sources with a
               layer of information that may be vital for subsequent analysis. In the following
               description of the five stages of the Gothenburg model (preceded by <hi rend="bold"
                  >Transcription</hi>, which we call Stage 0), the notation <q>(n)</q> after the
               section heading means that normalization plays a role in the stage.</p>
            <div>
               <head>Stage 0. Transcription for the purpose of collation (n)</head>
               <p>Except in the case of born-digital character-based data, witnesses to be input
                  into a collation engine must first be digitized, that is, transcribed, whether
                  manually or with machine assistance (e.g., OCR). In the case of handwritten
                  documents, this transcription entails the conversion of an analog original to a
                  digital character-stream surrogate. This transcription necessarily is not a
                  one-to-one mapping because the handwritten analog original is infinitely variable,
                  while the distinct character types in the digital result are limited by the
                  character inventory. In practice, the level of detail in the transcription (that
                  is, the extent to which it is <q>diplomatic</q>) depends on the goals and purposes
                  of the edition, and for that reason, <quote rend="inline">transcripts are best
                     judged on how useful they will be ..., rather than as an attempt to achieve a
                     definitive transcription</quote>
                  <ptr target="#robinson1993" loc="19"/>. Furthermore, Robinson and Solopova
                  continue, because the complete character representation of all features on a
                  handwritten original is both impossible and undesirable, transcriptions <quote
                     rend="inline">must be seen as fundamentally incomplete and fundamentally
                     interpretative</quote>
                  <ptr target="#robinson1993" loc="21"/>.</p>
               <p>The conflation of multiple physically different written signs into the same single
                  digital characters is an instance of normalization, that is, of determining,
                  sometimes subconsciously, that certain objective differences may (or, perhaps,
                  should) be ignored because they are not relevant informationally.<note>Especially
                     in the early days of computers, hardware and software constraints limited ways
                     of representing digital texts; see, e.g., <ref target="#froger1968" loc="230–2"
                        >Froger (1968), 230–32</ref>. Conventions were quickly adopted to overcome
                     these limitations, such as the use of the $ sign as a diacritic to indicate a
                     capital letter.</note> The risk of normalization during manual transcription,
                  though, is that information that is discarded during transcription is not
                  available at later stages of collation, or during subsequent use of the collation
                  output in research. Researchers are nonetheless comfortable with this type of
                  normalization not only because it is inevitable, for the reasons described above,
                  but also because they are confident that the data that they are excluding is not
                  informational. The alternative, that is, trying to transcribe with as little
                  normalization as possible, not only cannot be perfect, but also comes at a cost,
                  because the greater the number of graphic distinctions the researcher tries to
                  preserve during transcription, the greater the price in terms of both efficiency
                  (because the researcher must be alert to more details) and accuracy (because there
                  is more opportunity for distraction, error, and inconsistency) <ptr
                     target="#robinson1993" loc="25"/>. A sensible compromise, especially in
                  situations where photographic facsimiles are available, and therefore reduce the
                  documentary value of a <hi rend="italic">hyperdiplomatic</hi> transcription, is
                  that the digital transcription should preserve differences in the original
                  orthography that might be needed for adequate rendering (as determined by the
                  goals of the project) or serve as eventual input into computational analysis
                  (which might include alignment within the collation process, subsequent analysis
                  of patterns of variation, or orthographic or linguistic analysis that is not
                  connected explicitly to collation).<note>A manuscript is typically a multilayered
                     reality <ptr target="#sels2015"/>, where various systems of language and
                     meaning coexist <ptr target="#segre1976"/>. Useful tokenization and
                     normalization demands careful attention to distinguishing orthographic and
                     linguistic features of the text from those of the manuscript. </note>
               </p>
               <p>Normalization during transcription need not be entirely silent. For example, as a
                  way of accommodating both diplomatic and normalized representations during
                  transcription, the Text Encoding Initiative (TEI) makes it possible to couple the
                  two formally where that is sensible and practical, as in the following example
                  from the TEI P5 Guidelines:<note>
                     <ref target="http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-orig.html"
                        >http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-orig.html</ref>
                  </note><quote rend="block"><eg lang="xml"><![CDATA[<l>But this will be a <choice>
   <orig>meere</orig>
   <reg>mere</reg>
</choice> confusion</l>
<l>And hardly shallter we all be <choice>
   <orig>vnderstoode</orig>
   <reg>understood</reg>
</choice>
</l>]]></eg></quote>
This strategy makes it possible, outside a
                  collation context, to index or search for words according to their normalized
                  forms while rendering them according to the variant orthography that appears in
                  the source. It is nonetheless the case that even the <code>&lt;orig&gt;</code>
                  reading necessarily undergoes some degree of normalization as part of the
                  transcription process.</p>
            </div>
            <div>
               <head>Stage 1. Tokenization (n)</head>
               <p>The alignment of segments of text for collation presupposes the division of a
                  single continuous stream of characters into tokens, typically words (however we
                  define them) and punctuation marks, although nothing precludes other forms of
                     tokenization.<note>Tokenization and normalization are well-known operations in
                     computational and corpus linguistics, and their use in CollateX is similar to
                     the analogous linguistic operations. Cf. tokenization in computer science,
                     sometimes called <term>lexing</term> or <term>lexical analysis</term>, which
                     shares with linguistic tokenization the goal of dividing a continuous stream of
                     characters into meaningful substrings for subsequent processing. In computer
                     science, the subsequent processing is typically <term>parsing</term>, that is,
                     interpreting the tokens as expressions according to the syntax of a programming
                     language.</note> CollateX, for example, incorporates a default tokenization
                  function that splits the input into sequences of word characters (regex <code>\w+</code>) plus
                  any trailing whitespace (regex <code>\s+</code>), and further breaks off
                  punctuation marks together with any trailing whitespace into their own tokens
                  (regex <code>\W+</code>). In this way, for example, a word followed by a period at
                  the end of a sentence will be recognized as string-equal to the same word without
                  the period in a different witness. </p>
                <p>How to manage whitespace, which is one of the common issues that must be resolved
                   during tokenization, might also entail forms of normalization. Although
                  tokenization on whitespace in some other programming contexts (such as the XPath
                     <code>tokenize()</code> function or the Python <code>str.split()</code> method
                  with a null <term>separator</term> value) may regard whitespace as a separator
                  between tokens that should be discarded, default tokenization in CollateX, for
                  example, keeps the whitespace as the trailing part of the preceding token, so that
                  nothing is discarded during tokenization.<note> In Python regular expressions,
                        <code>\s</code>
                     <quote rend="inline">[m]atches any whitespace character; equivalent to <code>[
                           \t\n\r\f\v]</code> in bytes patterns or string patterns with the ASCII
                        flag. In string patterns without the ASCII flag, it will match the whole
                        range of Unicode whitespace characters</quote>
                     <ptr target="#secretlabs2001"/>. See <ref target="#unicode2020">Unicode
                        Consortium (2020)</ref> for an inventory and discussion of Unicode
                     whitespace characters.</note> This whitespace is subsequently removed in
                  CollateX by the default <hi rend="bold">Normalization</hi> stage unless that is
                  overridden (see the following section), thus preserving the strict Gothenburg
                  model distinction between <hi rend="bold">Tokenization</hi> and <hi rend="bold"
                     >Normalization</hi>. Yet regarding all whitespace characters as equivalent for
                  tokenizing the input, and regarding sequences of multiple whitespace characters as
                  equivalent to a single space, both of which are part of the default implementation
                  of <hi rend="bold">Tokenization</hi> in CollateX, are forms of normalization, that
                  is, situations where forms that are not string-equal are nonetheless deemed to be
                  equivalent for a particular purpose. </p>
               <p>Thanks to the modular architecture of the Gothenburg model adopted by CollateX,
                  which provides hooks into the <hi rend="bold">Tokenization</hi> stage, users can
                  replace the default <hi rend="bold">Tokenization</hi> with custom code without
                  needing to know about or otherwise touch the code that manages the other stages of
                     processing.<note>The default implementation of Tokenization in the Python
                     version of CollateX is performed by the <code>tokenize()</code> method of the
                     WordPunctuationTokenizer class inside <code>core_classes.py:
                        re.findall(r'\w+\s*|\W+', contents)</code>. This defines a token as 1)
                     either a sequence of word characters followed optionally by a sequence of
                     whitespace characters, or 2) a sequence of non-word characters, which is
                     typically punctuation, also followed optionally by a sequence of whitespace
                     characters. This means that a token may consist entirely of whitespace only
                     when it falls at the beginning of a text (where it will match <code>\W+</code>), since any
                     whitespace not at the beginning of the text will form the trailing part of the
                     token that begins with whatever word or punctuation characters precede
                     it.</note> For example, where punctuation is not intended to be used in
                  alignment but must nonetheless remain available in the final output, it is
                  possible to tokenize only on sequences of whitespace characters, regarding
                  punctuation not as a separate token, but as part of the word token that precedes
                  or follows it, where it can then be ignored (during Stage 2) for the alignment
                  (during Stage 3).</p>
            </div>
            <div>
               <head>Stage 2. Normalization (n)</head>
               <p>The Gothenburg model regards <hi rend="bold">Normalization</hi> primarily as a way
                  of improving Alignment by recognizing that tokens that are not string-equal should
                  nonetheless be deemed to be equivalent for the purpose of <hi rend="bold">Alignment</hi>: <cit><quote
                        rend="block">It might suffice to normalize the tokens’ textual content such
                        that an exact matching of the normalized content yields the desired
                        equivalence relation. For instance, in many cases all tokens of the text
                        versions are normalized to their lower-case equivalent before being
                        compared, thereby making their comparison case insensitive. Other examples
                        would be the removal of punctuation, the rule-based normalization of
                        orthographic differences or the stemming of words.</quote>
                     <ptr target="#collatex"/></cit> As implemented within CollateX, a token is a
                  complex object that contains at least two properties: a t (<q>text</q>) property,
                  which represents the string value of the token after <hi rend="bold"
                     >Tokenization</hi>, and an n (<q>normalized</q>) property, which represents the
                  result of applying <hi rend="bold">Normalization</hi> to the t value. The software
                  uses agreement among the n properties to recognize tokens that should be aligned.
                  The t and n values are created either by the built-in default <hi rend="bold"
                     >Tokenization</hi> and default <hi rend="bold">Normalization</hi>
                     operations,<note>Default tokenization in CollateX is described above. Default
                     normalization is limited to stripping trailing whitespace in the Python
                     version, and includes both that and case folding in the Java version. In the
                     Python version, the normalization is created for objects of the Witness class
                     (defined inside <code>core_classes.py</code>) with: <code>Token({'t':
                        token_string, 'n': re.sub(r'\s+$', '', token_string)})</code>. The regular
                     expression <code>\s+$</code> matches one or more whitespace characters at the
                     end of the token; the <code>re.sub()</code> function replaces them with the
                     empty string, that is, removes them from the token before writing the modified
                     form as the value of the n property.</note> or by custom operations implemented
                  by the researcher to replace the defaults. The fact that each token has the t and
                  n properties means that CollateX exposes the output of <hi rend="bold"
                     >Tokenization</hi> alongside the output of <hi rend="bold">Normalization</hi>,
                  providing hooks for customization that make the software useful for scholars with
                  a wide variety of editorial requirements.</p>
               <p><hi rend="bold">Normalization</hi> at Stage 2 in the Gothenburg process serves a
                  specific purpose: it is performed so that variation that the editor considers
                  unimportant for alignment will not influence Stage 3, when <hi rend="bold"
                     >Alignment</hi> is performed.<note>Neutralizing phenomena that are not
                     considered relevant for the purpose of alignment so that they would not add
                     noise to the output was a concern also in the early days of automatic
                     collation; see <ref target="#nury2020">Nury and Spadini (2020)</ref>, <ref
                        target="#silva1969">Silva and Love (1969), 93</ref>; <ref
                        target="#gilbert1979" loc="247">Gilbert (1979), 247</ref>; <ref
                        target="#robinson1989" loc="100–1">Robinson (1989), 100–01</ref>.</note>
                  This is different from the purpose of normalization during <hi rend="bold"
                     >Transcription</hi> (which we called Stage 0, above) or during <hi rend="bold"
                     >Analysis</hi> (Stage 4, below), and it has different consequences. If the
                  editor normalizes the text during <hi rend="bold">Transcription</hi> and does not record the
                  non-normalized forms, those forms become irretrievable. <hi rend="bold"
                     >Normalization</hi> at Stage 2 of the Gothenburg model, however, is
                  non-destructive, since the normalized form is created as a shadow copy of the
                  original, and not as a replacement for it. As Bleeker explains: <cit><quote
                        rend="block"><q>normalization</q> in the context of automated collation is
                        not equivalent to normalization that happens in transcription. For example,
                        editors can transcribe orthographic variation because they consider it
                        important to be preserved in both the transcription and the collation
                        output. However, in the collation process itself, they may want to normalize
                        orthographic variation because they do not want it to influence the
                        alignment. In that case they need to normalize their tokens before inputting
                        them in the collation software.</quote><ptr target="#bleeker2017" loc="94"
                     /></cit> Common types of normalization, some of which were mentioned in the
                  introduction, above, are discussed individually in the subsections below. Many of
                  these examples might have been aligned correctly even without normalization
                  because of forced matches, near-matching (see below), or — in situations where the
                  software can decide only arbitrarily among alternatives — by chance.<note>We use
                     the term <term>forced match</term> to designate a situation where a single
                     token is sandwiched between unambiguous matches of all witnesses on either
                     side, as in the case-folding example immediately below. The match is forced
                     because, with the neighbors fully aligned, the tokens between them, whether the
                     same or different, are forced into alignment.</note> But because of the
                  computational complexity of alignment, normalization that leads to the
                  identification of more exact matches (of the normalized shadows of the tokens) can
                  nonetheless improve both the accuracy and the efficiency of the overall
                  performance.</p>
               <div>
                  <head>Case-folding (orthographic)</head>
                  <p>In the following example from the <title rend="italic">Frankenstein
                        variorum</title> Project, the case distinction between <q>SAVILLE</q> and
                        <q>Saville</q> in the third token column is not regarded as significant for
                     <hi rend="bold">Alignment</hi> purposes:<note> See <title rend="italic">Frankenstein
                           variorum</title>. In the source files used in the project, this heading
                        also includes presentational markup, which could be recruited to override
                        case distinctions in the character data as part of a strategy for
                        controlling normalization.</note>
                  
                  <table>
                     <head>Case distinction in the <title rend="italic">Frankenstein
                           variorum</title> Project</head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>1818</cell>
                        <cell>To</cell>
                        <cell>Mrs.</cell>
                        <cell>SAVILLE,</cell>
                        <cell>England.</cell>
                     </row>
                     <row role="data">
                        <cell>1823</cell>
                        <cell>To</cell>
                        <cell>Mrs.</cell>
                        <cell>SAVILLE,</cell>
                        <cell>England.</cell>
                     </row>
                     <row role="data">
                        <cell>Thomas</cell>
                        <cell>To</cell>
                        <cell>Mrs.</cell>
                        <cell>SAVILLE,</cell>
                        <cell>England.</cell>
                     </row>
                     <row role="data">
                        <cell>1831</cell>
                        <cell>To</cell>
                        <cell>Mrs.</cell>
                        <cell>Saville,</cell>
                        <cell>England.</cell>
                     </row>
                  </table>
                  These tokens would have been aligned correctly in any case because they
                     constitute a forced match between <q>Mrs.</q> and <q>England.</q> But the
                     information stored and processed during the <hi rend="bold">Normalization</hi> stage is available
                     not only for <hi rend="bold">Alignment</hi>, but also, for example, for
                     distinguishing, during subsequent <hi rend="bold">Analysis</hi>, forced matches
                     that are not deemed equivalent from those that are.</p>
               </div>
               <div>
                  <head>Graphemic (orthographic)</head>
                   <p>In early Cyrillic writing, <q>з</q> (U+0437 CYRILLIC SMALL LETTER ZE) and
                        <q>ѕ</q> (U+0455 CYRILLIC SMALL LETTER DZE) had different pronunciations and
                     distribution in Old Church Slavonic, but came to be used largely
                     interchangeably in early East Slavic writing, including in the <title
                        rend="italic">Rus′ primary chronicle</title>.<note>See <ref target="#pvl"
                           >PVL</ref>. Cyrillic used letters of the alphabet as numerical digits,
                        and these two letters continued to be distinguished in early Cyrillic when
                        they represented numerical values even after any distinction between them
                        had largely ceased to be significant in words.</note> In the last column in
                     the following example from that work (4,8), the Xle witness uses one variant
                     and other witnesses agree on the other, and the editors neutralize the
                     distinction at the <hi rend="bold">Normalization</hi> stage, so that the forms
                     will be deemed equivalent for purposes of <hi rend="bold"
                        >Alignment</hi>:<note>Normalization in support of alignment in this edition
                        also implements case folding, neutralizes several other character
                        distinctions (including <q>ѧ</q> vs <q>я</q> in this word), and ignores line
                        breaks (represented by a vertical bar), punctuation, and superscription. The
                        examples in this article simplify the actual CollateX output by representing
                        characters tagged with <code>&lt;sup&gt;</code> tags as superscript
                        characters and removing the <code>&lt;sup&gt;</code> tags and all others
                        (principally <code>&lt;lb&gt;</code> and <code>&lt;pb&gt;</code>). This
                        markup is removed from the n properties when they are created during Stage 2
                           <hi rend="bold">Normalization</hi>, and therefore is not involved in
                        Alignment decisions.</note>
                  </p>
                  <table>
                     <head>Graphemic variation in the <title rend="italic">Rus′ primary
                           chronicle</title></head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>Lav </cell>
                        <cell>по</cell>
                        <cell>семуже</cell>
                        <cell>морю</cell>
                        <cell>сѣдѧть</cell>
                        <cell>варѧ|зи</cell>
                     </row>
                     <row role="data">
                        <cell>Tro </cell>
                        <cell>по</cell>
                        <cell>семуже</cell>
                        <cell>морю</cell>
                        <cell>сѣдять</cell>
                        <cell>варязи</cell>
                     </row>
                     <row role="data">
                        <cell>Rad</cell>
                        <cell>по |</cell>
                        <cell>семоуже</cell>
                        <cell>морю</cell>
                        <cell>приседѧть</cell>
                        <cell>варѧзи</cell>
                     </row>
                     <row role="data">
                        <cell>Aka</cell>
                        <cell>по</cell>
                        <cell>семѹже</cell>
                        <cell>морю</cell>
                        <cell>приседѧтъ</cell>
                        <cell>варѧзи.</cell>
                     </row>
                     <row role="data">
                        <cell>Ipa </cell>
                        <cell>по</cell>
                        <cell>семуже</cell>
                        <cell>морю</cell>
                        <cell>сѣдѧть</cell>
                        <cell>ва|рѧзи</cell>
                     </row>
                     <row role="data">
                        <cell>Xle</cell>
                        <cell>по</cell>
                        <cell>семоуже</cell>
                        <cell>морю</cell>
                        <cell>сѣдѧт</cell>
                        <cell>варѧѕи.</cell>
                     </row>
                     <row role="data">
                        <cell>Byč </cell>
                        <cell>по</cell>
                        <cell>семуже</cell>
                        <cell>морю</cell>
                        <cell>сѣдять</cell>
                        <cell>Варязи</cell>
                     </row>
                     <row role="data">
                        <cell>Šax </cell>
                        <cell>по</cell>
                        <cell>семуже</cell>
                        <cell>морю</cell>
                        <cell>сѣдять</cell>
                        <cell>Варязи</cell>
                     </row>
                     <row role="data">
                        <cell>Lix</cell>
                        <cell>По</cell>
                         <cell>сему&#xa0;же</cell>
                        <cell>морю</cell>
                        <cell>сѣдять</cell>
                        <cell>варязи</cell>
                     </row>
                     <row role="data">
                        <cell>Ost </cell>
                        <cell>По</cell>
                        <cell>семуже</cell>
                        <cell>морю</cell>
                        <cell>сѣдять</cell>
                        <cell>Варязи</cell>
                     </row>
                  </table>
               </div>
               <div>
                  <head>Spelling (orthographic)</head>
                  <p>The 1818 and 1823 editions of Mary Shelley’s <title rend="italic"
                        >Frankenstein</title> regularly use the US-associated spelling of the verbal
                     suffix <q>-ize,</q> while the 1831 edition regularly uses the
                     British-associated spelling <q>-ise.</q> In the <title rend="italic"
                        >Frankenstein variorum</title> Project, this distinction can be neutralized
                     before performing <hi rend="bold">Alignment</hi>, as in the case of
                        <q>tranquillize</q> and <q>tranquillise</q> in the seventh token column in
                     the following example. </p>
                  <table>
                     <head>Spelling variation in the <title rend="italic">Frankenstein
                           variorum</title> Project</head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>1818</cell>
                        <cell>for</cell>
                        <cell>nothing</cell>
                        <cell>contributes</cell>
                        <cell>so</cell>
                        <cell>much</cell>
                        <cell>to</cell>
                        <cell>tranquillize</cell>
                        <cell>the</cell>
                        <cell>mind</cell>
                     </row>
                     <row role="data">
                        <cell>1823</cell>
                        <cell>for</cell>
                        <cell>nothing</cell>
                        <cell>contributes</cell>
                        <cell>so</cell>
                        <cell>much</cell>
                        <cell>to</cell>
                        <cell>tranquillize</cell>
                        <cell>the</cell>
                        <cell>mind</cell>
                     </row>
                     <row role="data">
                        <cell>1831</cell>
                        <cell>for</cell>
                        <cell>nothing</cell>
                        <cell>contributes</cell>
                        <cell>so</cell>
                        <cell>much</cell>
                        <cell>to</cell>
                        <cell>tranquillise</cell>
                        <cell>the</cell>
                        <cell>mind</cell>
                     </row>
                  </table>
                  <p>The same phenomenon can be observed in the following example, from the <title
                        rend="italic">Lancelot</title> manuscript transmission <ptr
                        target="#spadini2016"/>, where Witness A preserves an archaic spelling and
                     Witnesses B and C reflect the modern one:</p>
                  <table>
                     <head>Spelling variation in <title rend="italic">Lancelot en
                        prose</title></head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>A</cell>
                        <cell>ge</cell>
                        <cell>-</cell>
                        <cell>te</cell>
                        <cell>conois</cell>
                     </row>
                     <row role="data">
                        <cell>B</cell>
                        <cell>artu</cell>
                        <cell>je</cell>
                        <cell>te</cell>
                        <cell>conois</cell>
                     </row>
                     <row role="data">
                        <cell>C</cell>
                        <cell>artus</cell>
                        <cell>je</cell>
                        <cell>te</cell>
                        <cell>conois</cell>
                     </row>
                  </table>
                  <p>When CollateX fails to find an exact string match during Alignment and there
                     are multiple options for placing a token (in this case, <q>ge</q> may be placed
                     in the first or second token column), the software arbitrarily defaults to the
                     leftmost position, which, in the example above, is philologically
                           incorrect.<note><hi rend="bold">Alignment</hi> in CollateX by default
                        distinguishes only exact matches and non-matches, and it has no
                        understanding of <term>near matching</term>, that is, of finding the closest
                        inexact match. In this case that means that the <hi rend="bold"
                           >Alignment</hi> stage alone cannot recognize that <q>ge</q> is more
                        similar to <q>je</q> than it is to <q>artu(s).</q></note> But by normalizing
                     the two different spellings of the pronoun, <q>ge</q> in Witness A is aligned
                     correctly:</p>
                  <table>
                     <head>Correct alignment of spelling variants in <title rend="italic">Lancelot
                           en prose</title></head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>A</cell>
                        <cell>-</cell>
                        <cell>ge</cell>
                        <cell>te</cell>
                        <cell>conois</cell>
                     </row>
                     <row role="data">
                        <cell>B</cell>
                        <cell>artu</cell>
                        <cell>je</cell>
                        <cell>te</cell>
                        <cell>conois</cell>
                     </row>
                     <row role="data">
                        <cell>C</cell>
                        <cell>artus</cell>
                        <cell>je</cell>
                        <cell>te</cell>
                        <cell>conois</cell>
                     </row>
                  </table>
                  <p>The <title rend="italic">Rus′ primary chronicle</title> manuscripts illustrated
                     above vary orthographically for reasons that are both properly orthographic
                     (that is, concerned with scribal conventions for writing correctly) and
                     underlyingly phonetic (insofar as the manuscripts were written at different
                     times, they do not all represent the same state of the language). Identifying
                     all individual neutralizations of letter differences for normalization is not
                     practical because of the complexity of the systems and their relationships to
                     one another. It is, however, possible to overcome this limitation with a
                     normalization scheme that retains only the parts of the tokens that have a high
                     information load with respect to grouping readings into variant sets. For this
                     project we implemented a normalization scheme based on a modification of
                     Soundex <ptr target="#soundex"/>, recognizing that, in these writing systems, the beginning of
                     the word has a higher information load than the end, consonants have a higher
                     information load than vowels, and some phonetic distinctive features have a
                     higher information load than others. For details and examples see <ref
                        target="#birnbaum2015">Birnbaum 2015</ref>.</p>
               </div>
               <div>
                  <head>Digits (orthographic)</head>
                  <p>The distinction between numbers spelled as digits and those spelled as words
                     may be neutralized for alignment purposes. The following example is from the
                        <title rend="italic">Rus′ primary chronicle</title> (1,2), where, in the
                     fourth token column, Ost uses an Arabic numeral; Rad, Aka, and Ipa use a
                     Cyrillic letter to represent a numerical value, followed by a grammatical
                     ending; and the other witnesses spell out the entire number as a word (a
                     different number in Lav than in the others).</p>
                  <table>
                      <head>Number rendering in the <title rend="italic">Rus′ primary
                            chronicle</title></head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>Lav </cell>
                        <cell>по</cell>
                        <cell>потопѣ.</cell>
                        <cell/>
                        <cell>первие</cell>
                        <cell>снве</cell>
                        <cell>ноеви.</cell>
                        <cell>раздѣлиша</cell>
                     </row>
                     <row role="data">
                        <cell>Tro </cell>
                        <cell>по</cell>
                        <cell>потопѣ</cell>
                        <cell/>
                        <cell>трие</cell>
                        <cell>сынове</cell>
                        <cell>ноеви</cell>
                        <cell>раздѣлиша</cell>
                     </row>
                     <row role="data">
                        <cell>Rad </cell>
                        <cell>по</cell>
                        <cell>потопѣ. </cell>
                        <cell/>
                        <cell>г҃е </cell>
                        <cell>сн҃ве</cell>
                        <cell>ноеви. </cell>
                        <cell>разделиша</cell>
                     </row>
                     <row role="data">
                        <cell>Aka </cell>
                        <cell>по</cell>
                        <cell>потопе</cell>
                        <cell/>
                        <cell>.г҃.е </cell>
                        <cell>сн҃ве</cell>
                        <cell>ноеви.</cell>
                        <cell>разделиша</cell>
                     </row>
                     <row role="data">
                        <cell>Ipa </cell>
                        <cell>по</cell>
                        <cell>потопѣ</cell>
                        <cell>бо</cell>
                        <cell>.г҃.е. </cell>
                        <cell>с҃нве</cell>
                        <cell>ноеви</cell>
                        <cell>роздѣлиша</cell>
                     </row>
                     <row role="data">
                        <cell>Xle </cell>
                        <cell>Пѡ</cell>
                        <cell>потопѣ</cell>
                        <cell>оубо</cell>
                        <cell>трїе</cell>
                        <cell>с҃нове</cell>
                        <cell>ноеви.</cell>
                        <cell>раздѣлиша</cell>
                     </row>
                     <row role="data">
                        <cell>Byč </cell>
                        <cell>По</cell>
                        <cell>потопѣ</cell>
                        <cell/>
                        <cell>трие</cell>
                        <cell>сынове</cell>
                        <cell>Ноеви</cell>
                        <cell>раздѣлиша</cell>
                     </row>
                     <row role="data">
                        <cell>Šax </cell>
                        <cell>По</cell>
                        <cell>потопѣ</cell>
                        <cell>убо</cell>
                        <cell>трие</cell>
                        <cell>сынове</cell>
                        <cell>Ноеви</cell>
                        <cell>раздѣлиша</cell>
                     </row>
                     <row role="data">
                        <cell>Lix</cell>
                        <cell>По</cell>
                        <cell>потопѣ</cell>
                        <cell/>
                        <cell>трие</cell>
                        <cell>сынове</cell>
                        <cell>Ноеви</cell>
                        <cell>раздѣлиша</cell>
                     </row>
                     <row role="data">
                        <cell>Ost </cell>
                        <cell>По</cell>
                        <cell>потопѣ</cell>
                        <cell/>
                        <cell>3-е</cell>
                        <cell>сынове</cell>
                        <cell>Ноеви</cell>
                        <cell>раздѣлиша</cell>
                     </row>
                  </table>
               </div>
               <div>
                  <head>Punctuation (orthographic)</head>
                  <p>If what is important to the researcher is to distinguish simply the presence or
                     absence of punctuation, but not the form it takes, different punctuation tokens
                     may be neutralized as a single, generic punctuation token. If punctuation is
                     treated during tokenization as part of the word that precedes it, the
                     punctuation marks may be compared literally, as they appear in the manuscripts,
                     in which case, for example, a word followed by a comma will not match the same
                     word followed by a dot. Alternatively, trailing punctuation can be normalized
                     so as to be ignored during alignment, as in the example below from the <title
                        rend="italic">Rusʹ primary chronicle</title>. Here in the fourth token
                     column the Rad and Xle readings differ not only in the second letter, but also
                     in the trailing punctuation, which is a dot in Rad and a comma in Xle:</p>
                  <table>
                     <head>Variation in punctuation in the <title rend="italic">Rusʹ primary
                           chronicle</title></head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>Lav</cell>
                        <cell>инд&lt;иꙗ&gt;|</cell>
                        <cell>по</cell>
                        <cell>ефратъ</cell>
                        <cell>рѣку.</cell>
                        <cell/>
                        <cell>вавилонъ.</cell>
                        <cell>кордуна.</cell>
                        <cell/>
                     </row>
                     <row role="data">
                        <cell>Tro</cell>
                        <cell>индия</cell>
                        <cell>по</cell>
                        <cell>ефратъ</cell>
                        <cell>реку</cell>
                        <cell/>
                        <cell>вавилонъ</cell>
                        <cell>кордуна</cell>
                        <cell/>
                     </row>
                     <row role="data">
                        <cell>Rad</cell>
                        <cell>мидиꙗ.|</cell>
                        <cell>и</cell>
                        <cell>ефрат</cell>
                        <cell>река.</cell>
                        <cell>и</cell>
                        <cell>вавилон.</cell>
                        <cell>кордоуна.</cell>
                        <cell/>
                     </row>
                     <row role="data">
                        <cell>Aka</cell>
                        <cell>мидиа.</cell>
                        <cell>и</cell>
                        <cell>ефратъ</cell>
                        <cell>река.|</cell>
                        <cell>и</cell>
                        <cell>вавилонъ.</cell>
                        <cell>кордѹна.</cell>
                        <cell/>
                     </row>
                     <row role="data">
                        <cell>Ipa</cell>
                        <cell>мидиа.</cell>
                        <cell>и | </cell>
                        <cell>ефратъ</cell>
                        <cell>рѣку.</cell>
                        <cell>и</cell>
                        <cell>вавилонъ.|</cell>
                        <cell>кордуна.</cell>
                        <cell/>
                     </row>
                     <row role="data">
                        <cell>Xle</cell>
                        <cell>медїа</cell>
                        <cell>и</cell>
                        <cell>ефратъ |</cell>
                        <cell>рѣка,</cell>
                        <cell>и</cell>
                        <cell>вавѵлѡн.</cell>
                        <cell>кордоуна.</cell>
                        <cell>и</cell>
                     </row>
                     <row role="data">
                        <cell>Byč</cell>
                        <cell>Мидия</cell>
                        <cell>по</cell>
                        <cell>Ефратъ</cell>
                        <cell>рѣку,</cell>
                        <cell/>
                        <cell>Вавилонъ,</cell>
                        <cell>Кордуна,</cell>
                        <cell/>
                     </row>
                     <row role="data">
                        <cell>Šax</cell>
                        <cell>Мидия</cell>
                        <cell>и</cell>
                        <cell>Ефратъ</cell>
                        <cell>рѣка</cell>
                        <cell>и</cell>
                        <cell>Вавилонъ,</cell>
                        <cell>Кордуна,</cell>
                        <cell/>
                     </row>
                     <row role="data">
                        <cell>Lix</cell>
                        <cell>Мидия</cell>
                        <cell>по</cell>
                        <cell>Ефратъ</cell>
                        <cell>рѣку,</cell>
                        <cell/>
                        <cell>Вавилонъ,</cell>
                        <cell>Кордуна,</cell>
                        <cell/>
                     </row>
                     <row role="data">
                        <cell>α</cell>
                        <cell>Мидия</cell>
                        <cell>и</cell>
                        <cell>Ефратъ</cell>
                        <cell>рѣку</cell>
                        <cell>и</cell>
                        <cell>Вавилонъ,</cell>
                        <cell>Кордуна,</cell>
                        <cell/>
                     </row>
                  </table>
               </div>
               <div>
                  <head>Morphological (linguistic)</head>
                  <p>Morphological variation (e.g., inflectional endings that express number,
                     gender, case, tense, and other linguistic categories) may be neutralized before
                        <hi rend="bold">Alignment</hi>, similarly to what in corpus linguistics is
                     called <term>lemmatization</term> or <term>stemming</term>. In the following
                     example the present (Witness A) and future (Witnesses B and C) tense of the
                     verb <q>chevaucher</q> (<q>to ride</q>) might be normalized in order to obtain
                     an optimal alignment.</p>
                  <table>
                     <head>Morphological variation in <title rend="italic">Lancelot en
                        prose</title></head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>A</cell>
                        <cell>que</cell>
                        <cell>ge</cell>
                        <cell>chevauchoie</cell>
                        <cell>-</cell>
                        <cell>a</cell>
                        <cell>tot</cell>
                        <cell>mon</cell>
                        <cell>poir</cell>
                     </row>
                     <row role="data">
                        <cell>B</cell>
                        <cell>que</cell>
                        <cell>je</cell>
                        <cell>chevalcheroie</cell>
                        <cell>-</cell>
                        <cell>a</cell>
                        <cell>tot</cell>
                        <cell>mon</cell>
                        <cell>pooir</cell>
                     </row>
                     <row role="data">
                        <cell>C</cell>
                        <cell>si</cell>
                        <cell>-</cell>
                        <cell>chevalcheroie</cell>
                        <cell>la</cell>
                        <cell>a</cell>
                        <cell>tot</cell>
                        <cell>mon</cell>
                        <cell>pooir</cell>
                     </row>
                  </table>
               </div>
               <div>
                  <head>Lexical (linguistic)</head>
                  <p>In Old French, <q>pas</q> and <q>mie</q> are negative particles that are used
                     together with the negation adverb <q>ne</q>. They have no textual characters in
                     common, but their meaning and syntactic pattern of use are the same, making them
                     candidates for alignment. In the following example, the alignment would be
                     sub-optimal without a normalization of the two forms, because <q>mie</q> would
                     be placed into the fourth token column, aligned with <q>on</q> in Witness B,
                     instead of in the fifth, aligned with <q>pas</q>.<note>This would happen
                        because, as noted above, in case of multiple alternatives regarded as
                        equivalent by the software, CollateX pushes the token to the left.</note> If
                     normalization is performed, the alignment is correct, as reflected in the table
                     below:</p>
                  <table>
                     <head>Lexical variation in <title rend="italic">Lancelot en
                        prose</title></head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>A</cell>
                        <cell>ne</cell>
                        <cell>-</cell>
                        <cell>doit</cell>
                        <cell>-</cell>
                        <cell>mie</cell>
                        <cell>atorner</cell>
                     </row>
                     <row role="data">
                        <cell>B</cell>
                        <cell>ne</cell>
                        <cell>li</cell>
                        <cell>doit</cell>
                        <cell>on</cell>
                        <cell>pas</cell>
                        <cell>atorner</cell>
                     </row>
                     <row role="data">
                        <cell>C</cell>
                        <cell>ne</cell>
                        <cell>il</cell>
                        <cell>doit</cell>
                        <cell>-</cell>
                        <cell>pas</cell>
                        <cell>atorner</cell>
                     </row>
                  </table>
               </div>
               <div>
                  <head>Syntactic role (linguistic)</head>
                  <p>Consider the following hypothetical alignment example from the CollateX
                     development test suite:</p>
                  <table>
                     <head>Example of texts to be aligned</head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Untokenized text </cell>
                     </row>
                     <row role="data">
                        <cell>A </cell>
                        <cell>I bought this glass because it matches those plates.</cell>
                     </row>
                     <row role="data">
                        <cell>B</cell>
                        <cell>I bought those glasses.</cell>
                     </row>
                  </table>

                  <p>Alignment according to exact orthographic matching would align the two
                     instances of <q>those,</q> producing:</p>
                  <table>
                     <head>Problematic alignment of the example in Table 10</head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>A</cell>
                        <cell>I</cell>
                        <cell>bought</cell>
                        <cell>this</cell>
                        <cell>glass</cell>
                        <cell>because</cell>
                        <cell>it</cell>
                        <cell>matches</cell>
                        <cell>those</cell>
                        <cell>plates.</cell>
                     </row>
                     <row role="data">
                        <cell>B</cell>
                        <cell>I</cell>
                        <cell>bought</cell>
                        <cell/>
                        <cell/>
                        <cell/>
                        <cell/>
                        <cell/>
                        <cell>those</cell>
                        <cell>glasses.</cell>
                     </row>
                  </table>

                  <p>Some editors, though, might prefer to align the direct objects, that is, to
                     assign greater weight to the syntactic role than to string matching. This would
                     produce:</p>
                  <table>
                     <head>Preferable alignment of the example in Table 10</head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>A</cell>
                        <cell>I</cell>
                        <cell>bought</cell>
                        <cell>this</cell>
                        <cell>glass</cell>
                        <cell>because</cell>
                        <cell>it</cell>
                        <cell>matches</cell>
                        <cell>those</cell>
                        <cell>plates.</cell>
                     </row>
                     <row role="data">
                        <cell>B</cell>
                        <cell>I</cell>
                        <cell>bought</cell>
                        <cell>those</cell>
                        <cell>glasses.</cell>
                        <cell/>
                        <cell/>
                        <cell/>
                        <cell/>
                        <cell/>
                     </row>
                  </table>

               </div>
               <div>
                  <head>Language (linguistic)</head>
                  <p>In this section a segment of text from the Old Church Slavonic (OCS) <title
                        rend="italic">Codex Suprasliensis</title> is aligned with a reconstructed
                     Greek parallel text. Because the two languages use different scripts, no type
                     of orthographic normalization would improve string matching, but when a part of
                     speech identifier is used as the shadow normalization property of the tokens,
                     it allows for quite accurate alignment. In the following example, the second
                     gap in the OCS is aligned with a Greek definite article because Greek has
                     articles and OCS does not. The first gap reflects the presence of a personal
                     pronoun in the Greek that happens not to be in the OCS. The tokens that are
                     aligned in the two witnesses belong, pairwise, to the same parts of speech,
                     which is how CollateX knew where to position the gaps <ptr
                        target="#birnbaum2018"/>. </p>
                  <table>
                     <head>Multilingual alignment in the <title rend="italic">Codex
                           Suprasliensis</title> project</head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>OCS</cell>
                        <cell>посълалъ</cell>
                        <cell/>
                        <cell>к</cell>
                        <cell>тебѣ</cell>
                        <cell>и҅скоусити</cell>
                        <cell/>
                        <cell>о̑усрь҆диѥ</cell>
                     </row>
                     <row role="data">
                        <cell>Greek</cell>
                        <cell>ἀπέσταλκέ</cell>
                        <cell>με</cell>
                        <cell>πρὸς</cell>
                        <cell>σέ</cell>
                        <cell>δοκιμάσαι</cell>
                        <cell>τὴν</cell>
                        <cell>πρόθεσίν</cell>
                     </row>
                  </table>

               </div>
            </div>
            <div>
               <head>Stage 3. Alignment</head>
               <p>No normalization is performed during the <hi rend="bold">Alignment</hi> stage of
                  the Gothenburg model, but the alignment engine has access to the results of the
                  normalization performed in the <hi rend="bold">Normalization</hi> stage, as well
                  as the less explicit normalization, described above, that happens during <hi
                     rend="bold">Transcription</hi> and <hi rend="bold">Tokenization</hi>. And, as
                  is explained below, because the output of <hi rend="bold">Analysis</hi> (Stage 4)
                  can be cycled back into another iteration of <hi rend="bold">Alignment</hi>, <hi
                     rend="bold">Alignment</hi> also may have access to normalization performed
                  during <hi rend="bold">Analysis</hi>.</p>
               <p>The input into the <hi rend="bold">Alignment</hi> stage of CollateX is a set of
                  token sequences, one sequence for each witness, where, as described above, each
                  token has a t property, which represents a transcription of its reading according
                  to the witness, and an n property, which represents a normalized version of the t
                     property.<note>The researcher may also attach other properties, in addition to
                     t and n, to tokens, and these can be returned as part of the output. Properties
                     other than n are not used by the default alignment engine inside CollateX,
                     but, in keeping with the modularity of the Gothenburg model, a researcher could
                     replace the default CollateX alignment routine with an alternative that
                     performs a more complex evaluation. For example, it is theoretically possible
                     to perform several types of normalization, assign their results to different
                     token properties, and perform the alignment in a way that assigns different
                     weights to different degrees of matching in different token properties.</note>
                   As described above, <hi rend="bold">Normalization</hi> is not limited to orthography; the researcher
                  may create, as the n property, any surrogate for the token that will facilitate
                  alignment. The alignment engine then uses only the n property to try to find the
                  best alignment of the tokens, that is, the alignment that corresponds most closely
                  to what a human philologist would produce.<note>Philologists may disagree about
                     which of two possible alignments to prefer, and some such decisions have no
                     clearly defined resolution. As a simplified example, given <q>very
                        interesting</q> and <q>very, very interesting</q>, there is no clear way to
                     decide whether to align the single instance of <q>very</q> in the first witness
                     with the first matching instance in the second witness, or with the second
                     matching instance.</note>
               </p>
               <p>Alignment is, to be sure, a more complicated process than simply aligning the
                  tokens with matching n values. On the one hand, in a text of any meaningful
                  length, word tokens are likely to repeat, which means that alignment must
                  associate the correct instances of each value, evaluating and making decisions
                  about many-to-one or many-to-many correspondences. For example, in the following
                  alignment table:</p>
               <table>
                  <head>Correct alignment of repeated word token <q>la</q></head>
                  <row role="label">
                     <cell>Witness </cell>
                     <cell>Tokens </cell>
                  </row>
                  <row role="data">
                     <cell>A</cell>
                     <cell>la</cell>
                     <cell>ou</cell>
                     <cell>il</cell>
                     <cell>conquist</cell>
                     <cell>par</cell>
                     <cell>sa</cell>
                     <cell>chevalerie</cell>
                  </row>
                  <row role="data">
                     <cell>B</cell>
                     <cell/>
                     <cell>ou</cell>
                     <cell>il</cell>
                     <cell>conquist</cell>
                     <cell>la</cell>
                     <cell>grant</cell>
                     <cell>hautesce</cell>
                  </row>
                  <row role="data">
                     <cell>C</cell>
                     <cell>la</cell>
                     <cell>ou</cell>
                     <cell>il</cell>
                     <cell>conquist</cell>
                     <cell>la</cell>
                     <cell>grant</cell>
                     <cell>hatece</cell>
                  </row>
               </table>
                <p><q>la</q> (token columns 1 and 5) occurs once each in witnesses A and B (in different
                  locations) and twice in witness C. The alignment engine matches these instances
                  correctly because, although it operates at a certain level with individual tokens,
                   it also knows about sequences of vertically aligned <term>blocks</term>. This
                  means that it has access to the context when deciding which instances of repeated
                  tokens in one witness to align with which instances of matching tokens in the
                  others.</p>
               <p>Repetition (that is, the occurrence of multiple instances of the same token) is a
                   well-known challenge for collation, and <ref target="#andrews2020">Andrews (2020)</ref> proposes an unusual
                  normalization strategy as a way of meeting that challenge. Andrews observes that
                  when a computational process aligns very common words because they are
                  string-equal, they often do not belong to corresponding textual units, which both
                   produces misalignments and increases the computational complexity of the collation
                     process:<note>Repetition, when the same token appears multiple times in the
                     witnesses, makes it difficult to determine which instances in one witness to
                     align with which instances in another. The greater the extent of the
                     repetition, the greater the risk of incorrect alignment.</note>
                  <cit><quote rend="block">Common readings keep being treated as equivalent, just
                        because they are identical! This often throws off the collation of more
                        substantive words, because the collator doesn’t know how to tell what is
                        substantive or not. In a long text with many witnesses, this can throw off a
                        collation [... and ...] the collation algorithm was taking longer and
                        longer.</quote><ptr target="#andrews2020"/></cit> To avoid the spurious
                  alignment of tokens that, although string-equal, are so common that their
                  correspondence is more likely to be accidental than philologically meaningful,
                  Andrews replaces these common tokens, as well as punctuation tokens, with random
                  strings during <hi rend="bold">Normalization</hi>. As a consequence of this random
                  replacement, the normalized surrogates do not agree with anything during <hi
                     rend="bold">Alignment</hi>, and therefore do not complicate or confuse that
                  process. This enables substantive agreements to govern the alignment, improving
                  both the quality of the output and the speed of the operation (Andrews reports
                  that it <quote rend="inline">made the process run 4–5 times faster</quote>).</p>
               <p>On the other hand, the alignment engine may also choose to align non-matching
                  tokens that fall in the same position, as in the fourth position in the example
                  below. </p>
               <table>
                  <head>Forced match in the alignment</head>
                  <row role="label">
                     <cell>Witness </cell>
                     <cell>Tokens </cell>
                  </row>
                  <row role="data">
                     <cell>A</cell>
                     <cell>lors</cell>
                     <cell>abat</cell>
                     <cell>la</cell>
                     <cell>destre</cell>
                     <cell>manicle</cell>
                     <cell>de</cell>
                     <cell>son</cell>
                     <cell>hauberc</cell>
                  </row>
                  <row role="data">
                     <cell>B</cell>
                     <cell>lors</cell>
                     <cell>abat</cell>
                     <cell>la</cell>
                     <cell>senestre</cell>
                     <cell>manicle</cell>
                     <cell>de</cell>
                     <cell>son</cell>
                     <cell>hauberc</cell>
                  </row>
                  <row role="data">
                     <cell>C</cell>
                     <cell>lors</cell>
                     <cell>abat</cell>
                     <cell>la</cell>
                     <cell>senestre</cell>
                     <cell>manicle</cell>
                     <cell>de</cell>
                     <cell>son</cell>
                     <cell>hauberc</cell>
                  </row>
               </table>
               <p>The token alignment in seven of the eight columns reflects exact string equality
                  across all witnesses, but <q>destre</q> (right) and <q>senestre</q> (left), in the
                  fourth token column, are not an exact match. This alignment is forced by the
                  context: given that the <q>la</q> tokens before and the <q>manicle</q> tokens
                  afterwards are aligned as exact matches in all witnesses, the tokens between them
                  fall into alignment by default, a situation we call a <term>forced
                     match</term>.<note>A forced match and an alignment based on shared n property
                     values are represented differently in the variant graph that CollateX produces.
                     That difference is not translated into the alignment table output, but it is
                     part of the CollateX SVG rendering of the graph.</note>
                  <hi rend="bold">Alignment</hi> is further complicated by token transposition,
                  where tokens in one order in one witness may need to be aligned with tokens in a
                  different order in another witness, while nonetheless retaining each witness’s
                  token order. </p>
               <p>Finally, the algorithms most commonly used to implement alignment routines, such
                  as Needleman-Wunsch <ptr target="#needleman1970"/> or the Dekker <ptr
                     target="#dekker2011"/> algorithm in CollateX, reduce the computational
                  complexity of the alignment process only at a cost to the philological integrity
                  of the output:
               <cit><quote rend="block">These two algorithms follow the principle of <q>progressive alignment,</q> which
                  means that they do not compare all witnesses at the same time. Instead, they first
                  compare two witnesses, store the result of that comparison in a so-called variant
                  graph and then progressively compare another witness against that graph, at every
                  turn merging the result of that comparison into the graph until all witnesses are
                  merged. This progressive alignment method reflects the idea of <q>dynamic
                     programming,</q> which takes a complicated problem and breaks it down in
                  smaller sub-problems. In this case, the complicated task of multi-witness
                  alignment is broken down into a repetition of the relatively easier task of
                  two-witness alignment. A downside of progressive alignment is that, apparently,
                  the order in which the witnesses are compared influences the final result. That
                  is, the final alignment of three witnesses A, B, and C may differ if Witness C is
                  compared against the variant graph of Witness A and B, or if Witness B is compared
                  against the variant graph of Witness A and C.
               </quote><ptr target="#bleeker2017" loc="95"/></cit>
              The preceding means that progressive alignment cannot be considered an optimal
                  strategy for multiple witness collation, since the philologically optimal
                  alignment of the witnesses obviously cannot logically depend on the order in which
                  the philologist (or algorithm) looks at them <ptr target="#spadini2017" loc="348"
                     />.<note>In situations where researchers are able to hypothesize about the
                     relationships among the witnesses before performing machine-assisted collation,
                     they can exploit order dependencies of the algorithms by comparing the two
                     witnesses that are assumed to be most closely related first. This approach will
                     fail, though, in situations where different witnesses may be closest in
                      different portions of the work. Additionally, and more generally, confidence
                     about which witnesses are most closely related is proportional to the extent to
                     which the collation has already been completed, a circular workflow that
                     effectively amounts to requiring collation as a precursor to deciding how to
                     implement the collation. CollateX has limited control over the order in which
                     witnesses are incorporated into a collation. It can add witnesses one by one in
                     an order specified by the researcher, but it cannot compare, for example,
                     witnesses A and B and, separately, C and D, and then merge the two (A+B, C+D)
                     interim variant graphs. That is, after comparing the first two witnesses and
                     producing an initial variant graph, CollateX always aligns exactly one new
                     witness at a time against the variant graph, and it cannot align a new witness
                     directly against another new witness or an interim variant graph directly
                     against another interim variant graph.</note> At the same time, the
                  computational complexity of comparing all witnesses to all other witnesses
                  simultaneously means that optimal order-independent multiple-witness alignment is
                  currently an unsolved problem in computer science,<note>For summary descriptions
                     of the prevalent algorithms see <ref
                        target="https://en.wikipedia.org/wiki/Multiple_sequence_alignment"
                        >https://en.wikipedia.org/wiki/Multiple_sequence_alignment</ref>. </note>
                  and progressive alignment algorithms, despite their limitations, represent the
                  current state of the art.</p>
               <p>Alignment algorithms typically test only for exact matches between tokens, because
                  looking for the closest but inexact match is prohibitively expensive
                  computationally. For that reason, the CollateX implementation of near matching,
                  which we describe below, is performed in the <hi rend="bold">Analysis</hi> stage and then recycled into
                  another instance of <hi rend="bold">Alignment</hi>. What is important in this discussion of Stage 3 is
                  that, to the extent that near matching can be said to entail a type of
                  normalization, it happens not in Stage 2, but, rather, in Stage 4, after an
                  initial alignment has already been constructed.</p>
            </div>
            <div>
               <head>Stage 4. Analysis and interpretation (n)</head>
               <p>The 2009 meeting that led to the elaboration of the Gothenburg method did not
                  produce a final report or white paper, and in the absence of an authoritative
                  definition, the type of analysis that occurs at Stage 4 has been interpreted
                  variously as both computational postprocessing and manual, human intervention
                  between the output of <hi rend="bold">Alignment</hi> (Stage 3) and the input into
                     <hi rend="bold">Visualization</hi> (Stage 5). In situations where the alignment
                  is sub-optimal and cannot be repaired algorithmically, human intervention becomes
                  necessary. The cost of this intervention with respect to the workflow, though, is
                  that it introduces changes into a generated interim artifact, rather than into the
                     <term>base view</term>, that is, the transcribed witness files that serve as
                  the initial input into the processing pipeline. Any changes introduced manually
                  beyond the base view mean that if the collation operation must be re-run from the
                  beginning (for example, if a new witness is added, or if editorial decisions about
                  word division change), the manual effort is lost, and must be repeated. In other
                  words, manual intervention creates a new, derived base view, one that can no
                  longer be generated from the original base view entirely by the computational
                  pipeline.</p>
               <p>The purpose of the <hi rend="bold">Analysis</hi> in Stage 4 in the Gothenburg
                  model, whether automated or implemented manually, is described clearly in the
                  CollateX documentation. In the case of sub-optimal output from the <hi rend="bold">Alignment</hi>
                  stage: <cit><quote rend="block">[a]n additional [...] analysis of the alignment
                        result [...] may alleviate that deficiency by introducing the possibility of
                        a feedback cycle, in which users edit the alignment and feed their knowledge
                        back into the alignment process for another run delivering enhanced results.<note>CollateX documentation is available at <ref
                        target="https://collatex.net/doc/"
                        >https://collatex.net/doc/#analysis-feedback</ref>.</note></quote></cit> Postprocessing
                  performed at Stage 4, in addition to possibly improving the eventual alignment,
                  may also be used to infer knowledge from the information added to the original
                  texts through the first three Stages of the collation pipeline. That is, the end
                  products of a full collation pipeline, from <hi rend="bold">Tokenization</hi> through <hi rend="bold">Visualization</hi>,
                  may include not only a critical edition with variation, but also, for example,
                  summary analytic reports about the tradition, whether textual (e.g., in
                  statistical tables) or graphic (e.g., in charts or diagrams).</p>
               <p>In this section we identify two types of computational analysis, both involving
                  normalization, that are located at Stage 4 of the Gothenburg model. These are 1)
                   near matching, and 2) the analysis of patterns of agreement within alignment sets.
                  Near matching is intended to improve the alignment, which is to say that the
                  output of this Stage 4 process, after modification for near matching, is fed back
                  into Stage 3 for realignment. The analysis of patterns of agreement is intended to
                  enrich and customize the collation output, and is fed into Stage 5 (<hi
                     rend="bold">Visualization</hi>). Especially in light of the absence of clear
                  guidance in the literature about the Gothenburg model concerning the <hi rend="bold">Analysis</hi>
                  stage, the two types of computational analysis discussed here should be regarded
                  as only some of the possibilities available at this stage of the collation
                  pipeline.</p>
               <div>
                  <head>Near matching</head>
                  <p>The term <q>near matching</q>, sometimes called <q>approximate</q> or
                        <q>fuzzy</q> matching, refers to the identification of the closest match for
                     alignment purposes, specifically in situations where there is no exact
                        match.<note>For a previous implementation of near matching, see <ref
                           target="#robinson1989" loc="103">Robinson (1989), 103</ref>.</note> As
                     was noted above, when CollateX compares the n properties of tokens to create an
                     initial alignment, it looks only for exact string matches. That is, to align a
                     token it asks not <q>what is the closest match?,</q> but, rather, <q>is there
                        an exact match?</q> The computational complexity of checking for the closest
                     match is sufficiently greater than the complexity of checking for an exact
                     match that it would not be realistic to perform an entire alignment operation
                     by computing all closest matches in all alignment positions. Furthermore, as
                     noted earlier in the context of forced matches, as long as the alignment
                     process finds a sufficient number of exact matches, a large number of inexact
                     matches are likely to be forced into proper alignment anyway.</p>
                  <p>A situation that is susceptible to incorrect alignment involves the following
                     two features:</p>
                  <list type="ordered">
                     <item>One witness has fewer tokens than another, which means that there will be
                        a gap in the alignment, where a token in the longer witness does not have a
                        corresponding token in the shorter one.</item>
                     <item>There is a token in the shorter witness that is adjacent to the gap and
                        that does not have an exact match in any of the alignment positions where it
                        could be placed.</item>
                  </list>
                  <p>When both of the preceding conditions are met, an alignment engine that relies
                     on exact matching is unable to decide where to position the gap, that is,
                     whether to push a token with two or more possible inexact adjacent alignment
                     points to the left or to the right. It is at this stage that near matching can
                     be used to resolve the uncertainties, and because the number of comparisons for
                     this type of limited, strategically targeted near matching is exponentially
                     less demanding computationally than what would be required to perform near
                     matching on all tokens during the initial alignment, it does not impose an
                     unacceptable delay.</p>
                  <p>As an example, if we need to align <q>The gray koala</q> with <q>The grey
                        koala</q> (note the different spellings of the color adjective) and we have
                     not normalized one of the spellings to the other, the color words will
                     nonetheless wind up aligned correctly because they represent a forced match
                     between the perfect matches of <q>The</q> to the left and <q>koala</q> to the
                     right. But suppose we have to align <q>The gray koala</q> with <q>The fluffy
                        grey koala,</q> that is, suppose we have to decide whether to align
                        <q>gray</q> in the first witness with <q>fluffy</q> or with <q>grey</q> in
                     the second. Without near matching, CollateX has to guess, and its arbitrary
                     strategy is to push the token in question to the left, which will produce an
                     incorrect result:</p>
                  <table>
                     <head>Problematic alignment of similar word tokens</head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>A</cell>
                        <cell>The</cell>
                        <cell>gray</cell>
                        <cell/>
                        <cell>koala</cell>
                     </row>
                     <row role="data">
                        <cell>B</cell>
                        <cell>The</cell>
                        <cell>fluffy</cell>
                        <cell>grey</cell>
                        <cell>koala</cell>
                     </row>
                  </table>
                  <p>The alignment a scholar would prefer is:</p>
                  <table>
                     <head>Correct alignment of similar word tokens</head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>A</cell>
                        <cell>The</cell>
                        <cell/>
                        <cell>gray</cell>
                        <cell>koala</cell>
                     </row>
                     <row role="data">
                        <cell>B</cell>
                        <cell>The</cell>
                        <cell>fluffy</cell>
                        <cell>grey</cell>
                        <cell>koala</cell>
                     </row>
                  </table>
                  <p>With near matching, however, the <hi rend="bold">Analysis</hi> stage could determine that
                        <q>gray</q> is more similar, in terms of its string value, to <q>grey</q>
                     than it is to <q>fluffy,</q> and recycle this information into a second,
                     targeted, <hi rend="bold">Alignment</hi> process that would position the token accordingly. This is
                     how near matching works in CollateX.<note>CollateX performs near matching not
                        by revising the alignment table directly, but by adjusting the rank of the
                        token in the variant graph, which, among other things, informs the layout of
                        the alignment table. This makes the adjusted alignment information available
                        at a lower level, and therefore also in output structures that do not use
                        the alignment table, such as graph or SVG output.</note>
                  </p>
                  <p>The following example of the use of near matching in the <hi rend="bold">Analysis</hi> stage to
                     correct misalignments in the <hi rend="bold">Alignment</hi> stage is from the
                        <title rend="italic">Rus′ primary chronicle</title> (3,5). In the table
                     below, which presents the output of CollateX without near matching, the last
                     token of Tro is misaligned. It is not string-equal to any token in either of
                     the last two columns (note the fourth letter of the word, which does not match
                     the fourth letter of the words in the last column of the other witnesses), so
                     CollateX arbitrarily pushes it to the left, even though it is a closer match
                     with the tokens to the right.</p>
                  <table>
                     <head>Incorrect alignment of similar word tokens in the <title rend="italic"
                           >Rus′ primary chronicle</title></head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell>Lav</cell>
                        <cell>гаръмати</cell>
                        <cell>тавріани.</cell>
                        <cell>сируфьꙗ.</cell>
                        <cell/>
                        <cell>фраци.</cell>
                     </row>
                     <row role="data">
                        <cell>Tro </cell>
                        <cell>гаръмати</cell>
                        <cell>тавриани</cell>
                        <cell>скуфиа</cell>
                        <cell>фраки</cell>
                        <cell/>
                     </row>
                     <row role="data">
                        <cell>Rad </cell>
                        <cell>сармати</cell>
                        <cell>таврилни</cell>
                        <cell>скоуфиа</cell>
                        <cell>и</cell>
                        <cell>фраци</cell>
                     </row>
                     <row role="data">
                        <cell>Aka </cell>
                        <cell>сармати.</cell>
                        <cell>таврїани</cell>
                        <cell>скѹфїа.</cell>
                        <cell>и</cell>
                        <cell>фраци</cell>
                     </row>
                     <row role="data">
                        <cell>Ipa</cell>
                        <cell>сармати. </cell>
                        <cell>тавриани.</cell>
                        <cell>скуфиꙗ. </cell>
                        <cell/>
                        <cell>фраци.</cell>
                     </row>
                     <row role="data">
                        <cell>Xle </cell>
                        <cell>сармати.</cell>
                        <cell>таврїани.</cell>
                        <cell>скѵфїа</cell>
                        <cell/>
                        <cell>фраци.</cell>
                     </row>
                     <row role="data">
                        <cell>Bych</cell>
                        <cell>Саръмати,</cell>
                        <cell>Тавриани, </cell>
                        <cell>Скуфиа,</cell>
                        <cell/>
                        <cell>Фраци,</cell>
                     </row>
                     <row role="data">
                        <cell>Shakh </cell>
                        <cell>Сармати, </cell>
                        <cell>Тавриани, </cell>
                        <cell>Скуфия, </cell>
                        <cell/>
                        <cell>Фраци,</cell>
                     </row>
                     <row role="data">
                        <cell>Likh </cell>
                        <cell>Саръмати, </cell>
                        <cell>Тавриани,</cell>
                        <cell>Скуфиа,</cell>
                        <cell/>
                        <cell>Фраци,</cell>
                     </row>
                     <row role="data">
                        <cell>Ost </cell>
                        <cell>Сармати,</cell>
                        <cell>Тавриани, </cell>
                        <cell>Скуфия,</cell>
                        <cell/>
                        <cell>Фраци,</cell>
                     </row>
                  </table>
                  <p>Near matching is optional in CollateX, and without it we get the output above.
                     If we turn on near matching, though, the last token in Tro is moved to the
                     right because it is a closer match to tokens in the right column than to any in
                     the left. </p>
                  <p>Near matching in CollateX uses the python-Levenshtein package to calculate
                     similarity, which means that it does not have access to variation information
                     beyond the character level, such as lexical substitution. A small Levenshtein
                     distance can identify situations where scribes might have misread or miswritten
                     individual letters and introduced a small, local corruption. But scribes might
                     also intervene consciously to replace one entire word with another, and in such
                     situations, the Levenshtein distance between the words is not necessarily a
                   useful measure of <emph>editorial</emph> distance, and therefore also not
                     necessarily a useful predictor of the optimal alignment. In such cases, a
                     comparison metric that did not rely solely on Levenshtein character edit
                     distance, and that also had access to other properties, might achieve a
                     philologically preferable alignment. </p>
               </div>
               <div>
                  <head>Patterns of agreement</head>
                  <p>Normalization is ultimately a decision about when objectively different written
                     forms should be deemed equivalent <emph>for a specific purpose</emph>. At the
                        <hi rend="bold">Alignment</hi> stage (whether initially or with recycled
                     input after near matching, as described above), that specific purpose is to
                     determine which tokens should be considered part of the same variant set,
                     whatever the category of variation. For subsequent philological research,
                     however, it is common to distinguish between <term>formal</term> and
                        <term>substantive</term> (sometimes called <term>insignificant</term> and
                        <term>significant</term>) variants, where only the latter are traditionally
                     regarded as useful for stemmatic purposes: <cit><quote rend="block">philological
                        judgement is deployed to distinguish <q>significant</q> from
                           <q>insignificant</q> textual variation — that is, to select those
                        variants that are more or less likely to betray information about the
                        exemplar from which a given text was copied.<note>Another way to regard this issue
                           emerges from the diasystemic nature of manuscript evidence, where (to
                           simplify) features of the text are transmitted through the filter of
                           features of the manuscript (that is, scribal practice) <ptr
                              target="#segre1976"/>. During this transmission, the features
                           traditionally considered significant (e.g., lexical) are those that are not
                           affected subconsciously by scribal practice or habit as easily as those
                           traditionally considered insignificant (e.g., orthography, although this,
                           too, is a simplification). From this perspective, insofar as scribal norms
                           or habits may be independent of the content of the text being copied,
                           focusing on significant variation may be considered a strategy for
                           prioritizing features of the text over those of the manuscript. Andrews 2016
                           challenges the traditional division of features into significant and
                           insignificant, finding that, in the artificial traditions she examined,
                           <quote rend="inline">human judgement was not significantly better than
                              random selection for choosing the variant readings that fit the stemma in
                              a text-genealogical pattern.</quote>
                           <ptr target="#andrews2016" loc="523"/></note></quote> 
                        <ptr target="#andrews2016" loc="523"/></cit></p>
                  <p>As described above, <hi rend="bold">Normalization</hi> for <hi rend="bold"
                        >Alignment</hi> purposes (Stage 2, feeding into Stage 3) in CollateX, by
                     default, writes a shadow value to be used during alignment into an n property
                     on the token. A user-designed replacement for the default normalization routine
                     built into Stage 2 could customize the modifications employed to create the n
                     property, and could also add other properties to the tokens — for example, in
                     addition to orthographic normalization in the n property, it might identify the
                     lemma, the part of speech, and the morphology, and write those into l, p, and m
                     properties. The built-in <hi rend="bold">Alignment</hi> stage of CollateX
                     ignores these other properties, but it passes them along the pipeline, which
                     means that they are accessible at later stages. Recalling that normalization is
                        <quote rend="inline">a decision about when objectively different written
                        forms should be deemed equivalent <emph>for a specific
                        purpose</emph>,</quote> we might perform a different type of normalization
                     (indeed, a form of interpretation) during <hi rend="bold">Analysis</hi> in
                     Stage 4, comparing tokens that have already been aligned with one another to
                     determine whether their l, p, and m properties (or some subset of those) agree.
                     This type of analysis would enable us to distinguish, within a variant set,
                     which readings agree completely (all properties), which agree in traditionally
                     significant properties but differ in insignificant ones (e.g., agree in l, p,
                     and m, but not n), and which differ in significant properties (e.g., disagree
                     in l, p, or m).<note>This approach is explored in Camps et al. 2019, for which
                        see also <ref target="https://github.com/CondorCompPhil/falcon"
                           >https://github.com/CondorCompPhil/falcon</ref>. </note>
                  </p>
                  <p>In the following example, the tokens <term>ge</term> and <term>je</term> both
                     have the same (hypothetical) substantive properties: l (lemma) = <q>je,</q> p
                     (part of speech) = <q>personal pronoun,</q> and m (morphology) = <q>first
                        person singular.</q> But because the original reading (t property) is
                     different, we deduce that the variant is orthographic or phonological, which we
                     consider formal, rather than substantive. Toward the end of the sentence,
                        <term>conois</term> is aligned with <term>fes</term>. In this case, their p
                     and m properties are equal (p = verb, m = present first person singular), but
                     the l properties carry different values, which means that the variant is
                     lexical.</p>
                  <table>
                     <head>Categorized variants</head>
                     <row role="label">
                        <cell>Witness </cell>
                        <cell>Tokens </cell>
                     </row>
                     <row role="data">
                        <cell role="label">A</cell>
                        <cell/>
                        <cell>ge</cell>
                        <cell>te</cell>
                        <cell>conois</cell>
                        <cell>mielz</cell>
                        <cell>que</cell>
                        <cell>tu</cell>
                        <cell>ne</cell>
                        <cell>conois</cell>
                        <cell>moi</cell>
                     </row>
                     <row role="data">
                        <cell role="label">B</cell>
                        <cell>Artu</cell>
                        <cell>je</cell>
                        <cell>te</cell>
                        <cell>conois</cell>
                        <cell>miels</cell>
                        <cell>que</cell>
                        <cell>tu</cell>
                        <cell>ne</cell>
                        <cell>fes</cell>
                        <cell>moi</cell>
                     </row>
                     <row role="data">
                        <cell role="label">Variant type</cell>
                        <cell/>
                        <cell>form. </cell>
                        <cell/>
                        <cell/>
                        <cell>form. </cell>
                        <cell/>
                        <cell/>
                        <cell/>
                        <cell>lex. </cell>
                        <cell/>
                     </row>
                  </table>
                  <p>This information could be passed along to the <hi rend="bold"
                        >Visualization</hi> stage for different purposes. For example, the output of
                     the <hi rend="bold">Visualization</hi> stage might be a critical apparatus
                     where variants are grouped according to whether they agree in substantive
                     properties, regardless of whether their n values coincide. Independently of the
                     way variants might be presented in a text intended for critical reading, tables
                     or graphic representations of the pairwise agreement in different types of
                     significant features among witnesses might let us hypothesize about textual
                     filiation and transmission. From this perspective, the output in the
                     <hi rend="bold">Visualization</hi> stage might include a report about textual relationships among
                     the witnesses, or a dendrogram representation of the result of agglomerative
                     clustering of the witnesses, as in a phylogenetic tree. Such an analytic
                     product is no less a visualization of textual relations than a reading view
                     that reproduces the full text with apparatus, even though it is an abstraction
                     of the relationships, and not an organized reproduction of the primary data.
                     Most importantly, as long as the pipeline is completely automated, different
                     scripts can be run or re-run in order to obtain different, but complementary
                     visualizations.</p>
               </div>
            </div>
            <div>
               <head>Stage 5. Visualization</head>
               <p>CollateX supports several output views of the variant graph, including an
                  alignment table (in plain text or HTML, as well as CSV and TSV), a graph (in SVG),
                  XML (both TEI and a generic XML that is suitable for subsequent transformation
                  with XSLT), and a JSON document. Of these built-in output views, only the JSON
                  output renders both the n property and all custom properties assigned during
                  earlier processing, which means that users can post-process the JSON output in
                  order to create custom visualizations. Some other built-in output formats are also
                  able to render more than one property value for a token. For example, two types of
                  SVG output are supported, one with just the n property value and the other with
                  the n value and, for each node in the variant graph, all associated t values and
                  the identifiers of the witnesses in which those readings are found, as in the
                  figure below (Fig. 2). In this graphic, the n value is made of the part of speech
                  and the lemma, and is written in bold in the top left cell for each node; the
                  corresponding t values are listed below, together with the identifiers of the
                  witnesses in which they can be found.</p>
               <figure>
                  <head>CollateX output SVG graph.</head>
                  <graphic url="resources/images/figure02.png"/>
               </figure>
            </div>
         </div>
         <div>
            <head>Conclusion</head>
            <div>
               <head>Revisiting the Gothenburg model and its CollateX implementation</head>
               <p>The Gothenburg model has advanced our understanding of the machine-assisted
                  analysis of textual variation in both conceptual and practical ways. The modular
                  nature of the model recognizes both the substantial independence of the five
                  stages and, at the same time, the extent to which they interact within a
                  processing pipeline. This modular conceptualization, in turn, has enabled modular
                  implementation, where an application such as CollateX incorporates hooks that
                  permit the user to modify one stage of the pipeline without having to interact
                  with the others except by continuing to observe the input and output
                  specifications built into the API. </p>
               <p>Meanwhile, with more than ten years of experience of the Gothenburg model behind
                  us, we now also recognize aspects of the model, and of its implementations, that
                  could benefit from further consideration. For example, <hi rend="bold"
                      >Visualization</hi> (Stage 5) might more transparently be called <q>Output</q>
                   or <q>Rendering</q>. The implementation of the model in CollateX is a good example
                  of this, since the richest output format supported, JSON, is intended not for
                  human visual consumption, but for automated post-processing. Additionally, in
                  CollateX the modular stages might have been implemented with even greater mutual
                  independence. For example, the output format (text table, HTML, SVG, JSON, etc.)
                  is specified as an argument to the function that performs the <hi rend="bold">Alignment</hi>, which
                  means that generating multiple output formats requires performing the same <hi
                     rend="bold">Alignment</hi> operation anew for each of them.<note>More strictly,
                     performing the <hi rend="bold">Alignment</hi> and generating the output are
                     already separate pipeline steps within CollateX, but the API does not expose
                     them individually.</note> The clearer understanding of the model that comes
                  from a decade of experience with the CollateX implementation suggests directions
                  for further development that could continue to enhance the benefits that are
                  already visible in both the modular nature of the Gothenburg model and its
                  implementation in the software.</p>
            </div>
            <div>
               <head>Revisiting normalization</head>
               <p>A uniquely broad and consequential insight in this retrospective context is that
                     <hi rend="bold">Normalization</hi> (Stage 2) and normalization (with a
                  lower-case <q>n</q>) are different, and normalization (small <q>n</q>) is broadly
                  and correctly distributed over several stages of the pipeline. <hi rend="bold"
                     >Normalization</hi> (Stage 2) means identifying string-level differences in
                  witness tokens that should be ignored for alignment purposes, but the point of
                  this report has been to explore the many ways in which normalization (small
                     <q>n</q>) is pervasive, affecting not only Stage 2, but also Stage 0 (<hi
                     rend="bold">Transcription</hi>), Stage 1 (<hi rend="bold">Tokenization</hi>),
                  and Stage 4 (<hi rend="bold">Analysis</hi>). The type of normalization that is
                  applied at these different stages emerges from a variety of considerations. For
                  example, the character-based transcription of handwritten sources in Stage 0
                  necessarily entails normalization because handwriting is infinitely variable, and
                  infinite variation cannot be represented usefully in character data. And the
                  normalization applied in Stage 4 to support near-matching in CollateX is an
                  accommodation to the intractable computational complexity of implementing
                  near-matching globally as part of the <hi rend="bold">Alignment</hi> stage. In
                  other words, despite the overall clarity, accuracy, and utility of the
                  modularization of the Gothenburg model, both scholars and developers benefit from
                  an awareness that normalization as part of a collation pipeline neither begins nor
                  ends with Stage 2, and also that normalization may be used in analysis and
                  reporting in situations that are independent of alignment.</p>
               <p>Figure 3, below, reproduces Figure 1, the earlier plot of the five stages of the
                  Gothenburg model of textual collation, with the addition of an explicit
                  Transcription (or digitization) stage between the Input (the documentary artifact)
                  and <hi rend="bold">Tokenization</hi>, which serves as Stage 1 of the original
                  model. Additionally, in this revised figure <emph>all</emph> of the stages that
                  involve normalization are highlighted in green: </p>
               <figure>
                  <head>Gothenburg model (with Transcription)</head>
                  <graphic url="resources/images/figure03.png"/>
               </figure>
               <p>This figure represents a pipeline, corresponding to an editorial workflow. Despite
                  our identification in this report of a broad distribution of normalization
                  operations beyond the titular <hi rend="bold">Normalization</hi> Stage, the
                  modular pipeline architecture, including that stage, remains fundamental both as a
                  way of modeling the collation process and as a guide for implementation. Indeed,
                  the existence of an official <hi rend="bold">Normalization</hi> Stage between <hi
                     rend="bold">Tokenization</hi> and <hi rend="bold">Alignment</hi>, where
                  differences among witness tokens are neutralized in order to enable their
                  comparison for alignment purposes, provides a context for recognizing and
                  understanding the aspects of normalization that necessarily take place elsewhere.
               </p>
            </div>
         </div>
      </body>
      <back>
         <listBibl>
            <bibl xml:id="andrews2016" label="Andrews 2016">Andrews, Tara L. 2016. <title
                  rend="quotes">Analysis of variation significance in artificial traditions using
                  Stemmaweb</title>. <title rend="italic">Digital scholarship in the
                  humanities</title> 31, no. 3 (2016): 523–39. <ref
                  target="https://doi.org/10.1093/llc/fqu072"
                  >https://doi.org/10.1093/llc/fqu072</ref>. </bibl>
            <bibl xml:id="andrews2020" label="Andrews 2020">Andrews, Tara L. 2020. <title
                  rend="quotes">Abusing the concept of normalization for better collation results
               (and profit)</title>. <ref target="https://hcommons.org/deposits/item/hc:31925"
                     >https://hcommons.org/deposits/item/hc:31925</ref>. </bibl>
            <bibl xml:id="bleeker2017" label="Bleeker 2017">Bleeker, Elli. 2017. <title
                   rend="quotes">Mapping invention in writing: digital infrastructure and the role of
                  the editor</title>. PhD diss., University of Antwerp. <ref
                     target="https://repository.uantwerpen.be/docman/irua/e959d6/155676.pdf">https://repository.uantwerpen.be/docman/irua/e959d6/155676.pdf</ref>. </bibl>
            <bibl xml:id="birnbaum2015" label="Birnbaum 2015">Birnbaum, David J. 2015. <title
                  rend="quotes">CollateX normalization</title>. Presented at the <title
                  rend="quotes">Computer-supported collation with CollateX</title> workshop, DH2015,
               Sydney. <ref
                  target="https://github.com/DiXiT-eu/collatex-tutorial/blob/master/unit7/soundex-normalization.pdf"
                  >https://github.com/DiXiT-eu/collatex-tutorial/blob/master/unit7/soundex-normalization.pdf</ref>.</bibl>
            <bibl xml:id="birnbaum2018" label="Birnbaum and Eckhoff 2018">Birnbaum, David J. and
               Hanne Martine Eckhoff. 2018. <title rend="quotes">Machine-assisted multilingual
                  alignment of the <title rend="italic">Codex Suprasliensis</title></title>, in
               Stephen M. Dickey and Mark Richard Lauersdorf, eds, <title rend="italic">V zeleni
                  drželi zeleni breg. Studies in honor of Marc L. Greenberg</title>, 1–14.
               Bloomington, IN: Slavica.</bibl>
            <bibl xml:id="camps2019" label="Camps et al. 2019">Camps, Jean-Baptiste, Lucence Ing,
               and Elena Spadini. 2019. <title rend="quotes">Collating medieval vernacular texts.
                  Aligning witnesses, classifying variants</title>. In <title rend="italic">DH2019
                  Digital humanities conference 2019</title>. Utrecht, Netherlands. <ref
                  target="https://hal.archives-ouvertes.fr/hal-02268348"
                  >https://hal.archives-ouvertes.fr/hal-02268348</ref>.</bibl>
            <bibl xml:id="collatex" label="CollateX Doc">CollateX — software for collating textual
               sources. Documentation. <ref target="https://collatex.net/doc/"
                  >https://collatex.net/doc/</ref> (Java version) and <ref
                  target="https://github.com/interedition/collatex/blob/master/docs/pythonport.md"
                  >https://github.com/interedition/collatex/blob/master/docs/pythonport.md</ref>
               (Python version).</bibl>
            <bibl xml:id="fv" label="FV n.d.">
               <title rend="italic">Frankenstein variorum.</title>
               <ref target="https://frankensteinvariorum.github.io/viewer/"
                  >https://frankensteinvariorum.github.io/viewer/</ref>. </bibl>
            <bibl xml:id="froger1968" label="Froger 1968">Froger, Jacques. 1968. <title
                rend="italic">La critique des textes et son automatisation</title>. Coll. <title rend="quotes">Initiation aux nouveautés de la science</title> n° 7. Paris: Dunod.</bibl>
            <bibl xml:id="gilbert1973" label="Gilbert 1973">Gilbert, Penny. 1973. <title
                  rend="quotes">Automatic collation: a technique for medieval texts</title>. <title
                     rend="italic">Computers and the humanities</title> 7: 139–47. <ref target="https://www.jstor.org/stable/30199534">https://www.jstor.org/stable/30199534</ref>.</bibl>
            <bibl xml:id="gilbert1979" label="Gilbert 1979">Gilbert, Penny. 1979. <title
                  rend="quotes">The preparation of prose-text editions with the ‘Collate’
                  System</title>. In <title rend="italic">La Pratique des ordinateurs dans la
                  critique des textes</title>, 245–54. Paris: Ed. du C. N. R. S.</bibl>
            <bibl xml:id="dekker2011" label="Dekker and Middell 2011">Haentjens Dekker, Ronald and
               Gregor Middell. 2011. <title rend="quotes">Computer-supported collation with
                  CollateX. Managing textual variance in an environment with varying
                  requirements</title>. Paper presented at the meeting of Supporting Digital
               Humanities 2011, Copenhagen. In Bente Maegaard, ed., <title rend="italic">Supporting
                  digital humanities, Copenhagen 17–18 November 2011: conference
                  proceedings.</title>
            </bibl>
            <bibl xml:id="dekker2015" label="Dekker et al. 2015">Haentjens Dekker, Ronald, Dirk van
               Hulle, Gregor Middell, Vincent Neyt, and Joris van Zundert. 2015. <title
                  rend="quotes">Computer-supported collation of modern manuscripts: CollateX and the
                  Beckett Digital Manuscript Project</title>. <title rend="italic">Literary and
                  linguistic computing</title>, 30, no. 3 (1 September 2015): 452–70, <ref
                  target="https://doi-org.pitt.idm.oclc.org/10.1093/llc/fqu007"
                  >https://doi-org.pitt.idm.oclc.org/10.1093/llc/fqu007</ref>. </bibl>
            <bibl xml:id="interedition" label="Interedition">Interedition. <ref
                  target="http://www.interedition.eu/">http://www.interedition.eu/</ref>. </bibl>
            <bibl xml:id="mcilroy1964" label="McIlroy 1964">McIlroy, Douglas. 1964. <title
                  rend="quotes">Summary — what's most important</title>. <ref
                  target="http://doc.cat-v.org/unix/pipes/">http://doc.cat-v.org/unix/pipes/</ref>. </bibl>
            <bibl xml:id="needleman1970" label="Needleman and Wunsch 1970">Needleman, Saul B. and
               Christian D. Wunsch. 1970. <title rend="quotes">A general method applicable to the
                  search for similarities in the amino acid sequence of two proteins</title>. <title
                     rend="italic">Journal of molecular biology</title> 48 (3), 443–53. <ref target="https://doi.org/10.1016/0022-2836(70)90057-4">https://doi.org/10.1016/0022-2836(70)90057-4</ref>.</bibl>
            <bibl xml:id="nury2020" label="Nury and Spadini 2020">Nury, Elisa, and Elena Spadini.
               2020. <title rend="quotes">From giant despair to a new heaven: the early years of
                  automatic collation</title>. <title rend="italic">It - Information
                  Technology</title> 62 (2): 61–73. <ref
                  target="https://doi.org/10.1515/itit-2019-0047"
                  >https://doi.org/10.1515/itit-2019-0047</ref>.</bibl>
            <bibl xml:id="pvl" label="PVL n.d.">
               <title rend="italic">PVL. Povestʹ vremennyx let.</title>
               <ref target="http://pvl.obdurodon.org/">http://pvl.obdurodon.org/</ref>. </bibl>
            <bibl xml:id="robinson1993" label="Robinson and Solopova 1993">Robinson, Peter and
               Elizabeth Solopova. 1993. <title rend="quotes">Guidelines for transcription of the
                  manuscripts of <title rend="italic">The Wife of Bath’s prologue</title></title>,
               in Norman Blake and Peter Robinson, eds, <title rend="italic">The Canterbury Tales
                  Project occasional papers</title> I, 19–51. Oxford: Office for Humanities
               Communication. </bibl>
            <bibl xml:id="robinson1989" label="Robinson 1989">Robinson, P. M. W. 1989. <title
                  rend="quotes">The collation and textual criticism of Icelandic manuscripts (1):
                  collation</title>. <title rend="italic">Literary and linguistic computing</title>
               4 (2): 99–105. <ref target="https://doi.org/10.1093/llc/4.2.99"
                  >https://doi.org/10.1093/llc/4.2.99</ref>.</bibl>
            <bibl xml:id="rockwell1998" label="Rockwell and Bradley 1998">Rockwell, Geoffrey and
               John Bradley. 1998. <title rend="quotes">Eye-ConTact: towards a new design for
                  text-analysis tools</title>. <title rend="italic">Digital studies/Le Champ
                  numérique</title>, February. <ref target="https://doi.org/10.16995/dscn.232"
                  >https://doi.org/10.16995/dscn.232</ref>.</bibl>
            <bibl xml:id="secretlabs2001" label="Secret Labs 2001">Secret Labs. 2001. <title
                  rend="quotes">Secret Labs’ regular expression engine</title>. <ref
                  target="https://github.com/python/cpython/blob/3.6/Lib/re.py"
                  >https://github.com/python/cpython/blob/3.6/Lib/re.py</ref>. </bibl>
            <bibl xml:id="segre1976" label="Segre 1976">Segre, Cesare. 1976. <title rend="quotes"
                  >Critique textuelle, théorie des ensembles et diasystème</title>. <title
                  rend="italic">Bulletin de la classe des lettres et des sciences morales et
                     politiques de l’Académie Royale de Belgique</title> 62 (1976): 279–92. <ref target="https://www.persee.fr/doc/barb_0001-4133_1976_num_62_1_55259">https://www.persee.fr/doc/barb_0001-4133_1976_num_62_1_55259</ref>.</bibl>
            <bibl xml:id="sels2015" label="Sels and Birnbaum 2015">Sels, Lara and David J. Birnbaum.
               2015. <title rend="quotes">Editing the <title rend="italic">Bdinski sbornik</title>
                  as a multilayered reality</title>. In <title rend="italic">Агиославика. Проблеми и
                  подходи в изследването на Станиславовия чети-миней: доклади от едноименната
                  конференция - 21 май 2013 г. (Hagioslavica. Issues and approaches in the study of
                  the Stanislav Reading Menaion: presentations from the conference of May 21,
                  2013.)</title>, ed. Diana Atanasova. Sofia: Kliment Oxridski University, 2015
               (appeared in May 2016), 184–99.</bibl>
            <bibl xml:id="silva1969" label="Silva and Love 1969">Silva, Georgette, and Harold Love.
1969. <title rend="quotes">The identification of text variants by Computer</title>.
                  <title rend="italic">Information Storage and Retrieval</title> 5 (3): 89–108. <ref
                  target="https://doi.org/10.1016/0020-0271(69)90014-X"
                  >https://doi.org/10.1016/0020-0271(69)90014-X</ref>.</bibl>
            <bibl xml:id="soundex" label="Soundex">Soundex. <ref
                  target="https://en.wikipedia.org/wiki/Soundex"
                  >https://en.wikipedia.org/wiki/Soundex</ref>.</bibl>
            <bibl xml:id="spadini2016" label="Spadini 2016">Spadini, Elena. 2016. <title
                  rend="quotes">Studi sul <q>Lancelot en prose.</q></title> PhD diss., Sapienza
               Università di Roma. <ref target="http://hdl.handle.net/11573/1307347"
                  >http://hdl.handle.net/11573/1307347</ref>.</bibl>
            <bibl xml:id="spadini2017" label="Spadini 2017">Spadini, Elena. 2017. <title
                  rend="quotes">The role of the base manuscript in the collation of medieval
                  texts</title>, in <title rend="italic">Advances in digital scholarly editing.
                  Papers presented at the DiXiT conferences in the Hague, Cologne, and
                  Antwerp,</title> eds. Peter Boot, Anna Cappellotto, Wout Dillen, Franz Fischer,
Aodhán Kelly, Andreas Mertgens, Anna-Maria Sichani, Elena Spadini, and Dirk van
               Hulle. Leiden: Sidestone Press, pp. 345–49. <ref target="https://www.sidestone.com/books/advances-in-digital-scholarly-editing">https://www.sidestone.com/books/advances-in-digital-scholarly-editing</ref>.</bibl>
            <bibl xml:id="unicode2020" label="Unicode Consortium 2020">Unicode Consortium. 2020. The
               Unicode standard version 13.0 – core specification. Chapter 23, <title rend="quotes"
                   >Special areas and format characters</title>, 881–916 (esp. 885–86). <ref target="https://www.unicode.org/versions/Unicode13.0.0/ch23.pdf">https://www.unicode.org/versions/Unicode13.0.0/ch23.pdf</ref>.</bibl>
         </listBibl>
      </back>
   </text>
</TEI>
