<?xml version="1.0" encoding="utf-8"?>
<?oxygen RNGSchema="../../common/schema/DHQauthor-TEI.rng" type="xml"?>
<?oxygen SCHSchema="../../common/schema/dhqTEI-ready.sch"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0"
     xmlns:dhq="http://www.digitalhumanities.org/ns/dhq"
     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
     xmlns:cc="http://web.resource.org/cc/">
   <teiHeader>
      <fileDesc>
         <titleStmt>
            <title>Mining Eighteenth Century Ontologies:
   Machine Learning and Knowledge Classification in the <title rend="italic">Encyclopédie</title> 
            </title>
            <author>Russell Horton</author>
            <dhq:authorInfo>
               <dhq:author_name>Russell
    <dhq:family>Horton</dhq:family>
               </dhq:author_name>
               <dhq:affiliation>Digital Library Development Center, University of Chicago</dhq:affiliation>
               <email>russ@diderot.uchicago.edu</email>
               <dhq:bio>
                  <p>Russell Horton is a research programmer at The ARTFL Project and the Digital Library Development Center at the University of Chicago, where he received his BA in Linguistics in 2002. He works on machine learning and text analysis software for the humanities.</p>
               </dhq:bio>
            </dhq:authorInfo>
            <author>Robert Morrissey</author>
            <dhq:authorInfo>
               <dhq:author_name>Robert
    <dhq:family>Morrissey</dhq:family>
               </dhq:author_name>
               <dhq:affiliation>University of Chicago</dhq:affiliation>
               <email>rmorriss@uchicago.edu</email>
               <dhq:bio>
                  <p>Robert Morrissey, who began teaching at the University of Chicago in 1978, earned his Ph.D. with honors in French literature in 1981. Morrissey specializes in 18th- and 19th-century French history, literature and critical theory. His work concentrates on themes and cultural currents over the longue durée. Professor Morrissey is the director of the ARTFL Project and editor of the online edition of Diderot and d'Alembert's Encyclopédie.</p>
               </dhq:bio>
            </dhq:authorInfo>
            <author>Mark Olsen</author>
            <dhq:authorInfo>
               <dhq:author_name>Mark
    <dhq:family>Olsen</dhq:family>
               </dhq:author_name>
               <dhq:affiliation>ARTFL Project, University of Chicago</dhq:affiliation>
               <email>markymaypo57@gmail.com</email>
               <dhq:bio>
                  <p>
      Mark Olsen is the Assistant Director of the ARTFL Project at the University of Chicago.  Mark received his Ph.D. in French history from the University of Ottawa in 1991 and has been involved in digital humanities and computer-aided text analysis since the mid-1980s.  His current ambition is to write a biography of the Marquis de Pastoret by candle-light with a quill.  
    </p>
               </dhq:bio>
            </dhq:authorInfo>
            <author>Glenn Roe</author>
            <dhq:authorInfo>
               <dhq:author_name>Glenn
    <dhq:family>Roe</dhq:family>
               </dhq:author_name>
               <dhq:affiliation>ARTFL Project, University of Chicago</dhq:affiliation>
               <email>glenn@diderot.uchicago.edu</email>
               <dhq:bio>
                  <p>Glenn Roe is a Ph.D. candidate in the Department of Romance Languages and Literatures and a Senior Project Manager at the ARTFL Project, both at the University of Chicago. Aside from his research interests in computer-assisted text analysis and knowledge classification during the French Enlightenment and 19th century, Mr. Roe's doctoral work is primarily concerned with the intersection of history and literature in the work of the French essayist and poet Charles Péguy.</p>
               </dhq:bio>
            </dhq:authorInfo>
            <author>Robert Voyer</author>
            <dhq:authorInfo>
               <dhq:author_name>Robert
    <dhq:family>Voyer</dhq:family>
               </dhq:author_name>
               <dhq:affiliation>Powerset</dhq:affiliation>
               <email>rlvoyer@diderot.uchicago.edu</email>
               <dhq:bio>
                  <p>Robert Voyer recently joined the ranks at Powerset as a computational linguist. Before joining the natural language search world, Robert worked as a research developer for The ARTFL Project at the University of Chicago, where he also earned his MS in Computer Science and BA in Romance Languages.</p>
               </dhq:bio>
            </dhq:authorInfo>
         </titleStmt>
         <publicationStmt>
            <idno type="DHQarticle-id">000044</idno>
            <idno type="volume">003</idno>
            <idno type="issue">2</idno>
            <dhq:articleType>article</dhq:articleType>
            <date when="2009-06-18">18 June 2009</date>
            <availability>
               <cc:License xmlns="http://digitalhumanities.org/DHQ/namespace"
                           rdf:about="http://creativecommons.org/licenses/by-nc-nd/2.5/"/>
            </availability>
         </publicationStmt>
         <sourceDesc>
            <p>Authored for DHQ; migrated from original DHQauthor format</p>
         </sourceDesc>
      </fileDesc>
      <encodingDesc>
         <classDecl>
            <taxonomy xml:id="dhq_keywords">
               <bibl>DHQ classification scheme; full list available in the <ref target="http://www.digitalhumanities.org/dhq/taxonomy.xml">DHQ keyword taxonomy</ref>
               </bibl>
            </taxonomy>
            <taxonomy xml:id="authorial_keywords">
               <bibl>Keywords supplied by author; no controlled vocabulary</bibl>
            </taxonomy>
         </classDecl>
      </encodingDesc>
      <profileDesc>
         <langUsage>
            <language ident="en"/>
         </langUsage>
      </profileDesc>
      <revisionDesc>
         <change when="2008" who="AR">Encoded document</change>
         <change who="Melanie Kohnen" when="2008-12-18">Added bios and teaser</change>
         <change who="Melanie Kohnen" when="2009-01-13">Added email addresses</change>
      </revisionDesc>
   </teiHeader>
   <text>
      <front>
         <dhq:abstract>
            <p>The <title rend="italic">Encyclopédie</title> of Denis Diderot and Jean le Rond d'Alembert was one of the most important and revolutionary intellectual products of the French Enlightenment. Mobilizing many of the great – and the not-so-great – <emph>
                  <foreign xml:lang="fr">philosophes</foreign>
               </emph> of the 18th century, the <title rend="italic">Encyclopédie</title> was a massive reference work for the arts and sciences, which sought to organize and transmit the totality of human knowledge while at the same time serving as a vehicle for critical thinking. In its digital form, it is a highly structured corpus; some 55,000 of its 77,000 articles were labeled with classes of knowledge by the editors making it a perfect sandbox for experiments with supervised learning algorithms. In this study, we train a Naive Bayesian classifier on the labeled articles and use this model to determine class membership for the remaining articles. This model is then used to make binary comparisons between labeled texts from different classes in an effort to extract the most important features in terms of class distinction. Re-applying the model onto the original classified articles leads us to question our previous assumptions about the consistency and coherency of the ontology developed by the Encyclopedists. Finally, by applying this model to another corpus from 18th century France, the <title rend="italic">Journal de Trévoux, or Mémoires pour l'Histoire des Sciences &amp; des Beaux-Arts</title>, new light is shed on the domain of Literature as it was understood and defined by 18th century writers. </p>
         </dhq:abstract>
         <dhq:teaser>
            <p>Examining the classification of knowledge in the Encyclopédie of Diderot and d'Alembert</p>
         </dhq:teaser>
      </front>
      <body>
         <head>Mining Eighteenth Century Ontologies:
   Machine Learning and Knowledge Classification in the <title rend="italic">Encyclopédie</title>.</head>
         <div>
            <head>Introduction </head>
            <p>One of the crowning achievements of the 18th-century Enlightenment was the <title rend="italic">Encyclopédie ou Dictionnaire raisonné des sciences, des arts et des métiers, par
    une société de gens de lettres</title>, edited by Diderot and d'Alembert. Published in Paris
    between 1751 and 1772, in 17 volumes of text and 11 volumes of plates, it contains some 77,000
    articles written by more than 140 contributors. The <title rend="italic">Encyclopédie</title>
    was a massive reference work for the arts and sciences, as well as a <emph>
                  <foreign xml:lang="fr">machine de guerre</foreign>
               </emph> that served to propagate Enlightenment ideas. The impact of the <title rend="italic">Encyclopédie</title> was enormous. Through its attempt to classify learning and to open all domains of human activity to its readers, the <title rend="italic">Encyclopédie</title> gave expression to many of the most important intellectual and social developments of its time.<note>For an exhaustive treatment of the <title rend="italic">Encyclopédie</title> and its authors, see <ptr target="#schwab1971"/>. For a more general discussion of the work, see <ptr target="#proust1995"/>. The ARTFL implementation of the <title rend="italic">Encyclopédie</title> is discussed in <ptr target="#morrissey2001"/> and <ptr target="#andreev1999"/>.</note>
            </p>
            <p>The scale and ambition of the <title rend="italic">Encyclopédie</title> inspired its editors
    to adopt three distinct modes of organization which, taken together, Diderot described as
    encyclopedic: dictionary, hierarchical classification, and the <emph><foreign xml:lang="fr">renvois</foreign></emph>
    (cross-references). The interaction of these three modes has led modern commentators to describe
    the <title rend="italic">Encyclopédie</title> as an <q>ancestor of hypertext</q> and
    to depict Diderot as <q>
                  <foreign xml:lang="fr">l'internaute d'hier</foreign>
               </q> (<q>websurfer <emph>
                     <foreign xml:lang="fr">avant la lettre</foreign>
                  </emph>
               </q>) <ptr target="#brian1998"/>. D'Alembert underscores the importance of the organization of knowledge in the <title rend="italic">Discours Préliminaire</title>:
    <quote rend="block">As an <title rend="italic">Encyclopedia</title>, it is to set forth the
     <emph>order</emph> and <emph>connection</emph> of the parts of human knowledge. As a <title rend="italic">Reasoned Dictionary of the Sciences, Arts, and Trades</title>, it is to contain
     the general principles that form the basis of each science and each art...and the most
     essential facts that make up the body and substance of each.<note>English translation cited in <ptr target="#hoyt1965" loc="xxiii"/> (our emphasis).</note>
               </quote>
    Of the three modes of organization, the dictionary mode (organization of entries in alphabetical
    order) is certainly the simplest and the most arbitrary. The second mode of organization is
    classification, wherein each dictionary entry is assigned to a <q>class of
    knowledge,</q> placing it within the <q>order</q> of human understanding, as
    depicted in the <title rend="italic">Système Figuré des connaissances humaines</title>. Modeled
    after Bacon's classification of knowledge and Enlightenment theories of epistemology, all
    understanding is founded upon memory, reason, or imagination, with numerous categories and
    sub-categories branching out from these three faculties.<note>For various representations of the
     <title rend="italic">Système Figuré</title> and the Editors' descriptions, see <ref target="http://www.lib.uchicago.edu/efts/ARTFL/projects/encyc/texts/">http://www.lib.uchicago.edu/efts/ARTFL/projects/encyc/texts/</ref> and <ref target="http://artfl.uchicago.edu/cactus/">http://artfl.uchicago.edu/cactus/</ref>.</note> However, simply placing an entry into
    this hierarchy of knowledge was insufficient to indicate the interconnections of knowledge.
    Thus, Diderot created an extensive system of <emph>
                  <foreign xml:lang="fr">renvois</foreign>
               </emph>,
    or cross-references, the third mode of organization, providing a lattice of interconnections
    between individual leaves of the tree as well as between classes of knowledge.<note>Blanchard and Olsen examined the structure of the <emph>renvois</emph> generating a <q>mappemonde</q> of the cross-references and node level classes of knowledge. See <ptr target="#blanchard2002"/>.</note>
            </p>
            <p>The central role of the classification system in the intellectual objectives of the <title rend="italic">Encyclopédie</title> editors is indicated by the extent to which it has been
    discussed and debated by both contemporary scholars and later researchers. The editors were
    remarkably diligent in assigning classes of knowledge to each article and sub-article. Of the
    77,085 main and sub articles, we have identified 55,248 as having classes of
    knowledge.<note>Classes of knowledge were originally extracted automatically using a simple
     rule-based identifier, written in Perl, based on typographic conventions in the text. A small
     number of classified articles remain either misclassified or unclassified altogether.</note>
    The editors were, however, somewhat less diligent in maintaining a precisely controlled list.
    Thus the classifications as found in the text are an amalgam of abbreviations, conflations, and
    singular categories that are not found on the <title rend="italic">Système Figuré</title> at all. We have recently completed orthographic normalization of the classes of knowledge assigned to each article,<note>The normalized classifications are the result of a collaborative project with Professor Dena Goodman, Kevin Hawkins, and Benjamin Heller at the University of Michigan.</note> resulting in some 54,289 articles with 2,620 normalized classes of knowledge. The twenty most frequent classifications by number of articles are:
   <table>
                  <head>Original counts of top 20 most frequent classifications by number of articles</head>
                  <row>
                     <cell>Géographie</cell>
                     <cell>5513</cell>
                  </row>
                  <row>
                     <cell>Géographie moderne</cell>
                     <cell>4794</cell>
                  </row>
                  <row>
                     <cell>Géographie ancienne</cell>
                     <cell>3084</cell>
                  </row>
                  <row>
                     <cell>Jurisprudence</cell>
                     <cell>2396</cell>
                  </row>
                  <row>
                     <cell>Grammaire</cell>
                     <cell>2304</cell>
                  </row>
                  <row>
                     <cell>Marine</cell>
                     <cell>1894</cell>
                  </row>
                  <row>
                     <cell>Commerce</cell>
                     <cell>1483</cell>
                  </row>
                  <row>
                     <cell>Histoire naturelle. Botanique</cell>
                     <cell>1277</cell>
                  </row>
                  <row>
                     <cell>Histoire moderne</cell>
                     <cell>1194</cell>
                  </row>
                  <row>
                     <cell>Mythologie</cell>
                     <cell>1115</cell>
                  </row>
                  <row>
                     <cell>Histoire naturelle</cell>
                     <cell>1069</cell>
                  </row>
                  <row>
                     <cell>Histoire ancienne</cell>
                     <cell>889</cell>
                  </row>
                  <row>
                     <cell>Medecine</cell>
                     <cell>796</cell>
                  </row>
                  <row>
                     <cell>Architecture</cell>
                     <cell>730</cell>
                  </row>
                  <row>
                     <cell>Jardinage</cell>
                     <cell>689</cell>
                  </row>
                  <row>
                     <cell>Littérature</cell>
                     <cell>682</cell>
                  </row>
                  <row>
                     <cell>Maréchallerie</cell>
                     <cell>627</cell>
                  </row>
                  <row>
                     <cell>Botanique</cell>
                     <cell>614</cell>
                  </row>
                  <row>
                     <cell>Histoire ecclésiastique</cell>
                     <cell>614</cell>
                  </row>
                  <row>
                     <cell>Théologie</cell>
                     <cell>517</cell>
                  </row>
               </table> 
    
    Like the <title rend="italic">Système Figuré</title>, these classifications are a reflection of how knowledge was ordered and classified in the 18th century. This paper reports the results of experiments using machine learning and data mining techniques to understand and exploit this unique resource.</p>
         </div>
         <div>
            <head>Preliminary Objectives</head>
            <p>Our initial experimental objectives were threefold:
    <list type="ordered">
                  <item>To train our classifier on all of the normalized classified articles of the <title rend="italic">Encyclopédie</title> and then to apply the resulting model to the 22,796 unclassified articles, labeling each with a predicted class of knowledge.</item>
                  <item>Using the same model, we then sought to reclassify all of the classified articles, verifying the relative success of our classifier by examining the misclassified instances and then exploring the relationships of discrete classes of knowledge to one another through feature set analysis.</item>
                  <item>Finally, we wanted to apply the classification model generated from the <title rend="italic">Encyclopédie</title> to other 18th century texts, allowing us to evaluate the applicability of this ontology to sources outside of the encyclopedic enterprise.</item>
               </list>
            </p>
         </div>
         <div>
            <head>Methodology</head>
            <p>Before we could begin any classification tasks we needed to preprocess the data into a usable
    format.  To this end, we extracted the text from all articles and sub-articles in the <title rend="italic">Encyclopédie</title> with a normalized class of knowledge. Explicit markers of
    class membership, present at the beginnings of these articles (e.g., <hi rend="italic">Comm.</hi> for <hi rend="italic">Commerce</hi>, <hi rend="italic">Phys.</hi> for
    <hi rend="italic">Physics</hi>, etc.) were removed to ensure that they would not provide facile
    criteria for classification. The texts were then tokenized and lemmatized<note>Lemmatization was
     accomplished using TreeTagger, a probabilistic part-of-speech tagger and lemmatization utility
     developed by Helmut Schmid under the auspices of the TC Project at the University of
     Stuttgart. It is freely available for download from the project's official homepage at <ref target="http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/">http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/</ref>.</note> automatically, and frequencies of words and lemmas were computed both globally and for each article. Words and lemmas were then used as attributes, and vectors for each article were generated from the number of occurrences of each attribute in that article.</p>
            <p>Text categorization is an extensively studied subfield of information science, and accordingly
    there exist many time-tested classification algorithms, each with its own strengths and
    weaknesses. We chose to implement a Multinomial Naive Bayesian classifier because of its
    simplicity and efficacy on large corpora.<note>See <ptr target="#hand2001"/>.</note> Multinomial Naive Bayes (MNB) treats
    documents as bags of words, where word order is considered irrelevant. Essentially, we measure
    the likelihood of words occurring in a given class by calculating how many times they occur in
    all documents with known classes. These conditional probabilities allow us to guess the most
    <q>probable</q> class of an unclassified article based on the frequency of the words
    that occur within it. MNB assumes that the probability of a word occurring within a document is independent of the words that occur around it, an assumption that we know to be false. Yet, despite this, MNB is known to perform very well on large data sets and in fact, has been shown to produce more accurate results than many other classification algorithms.<note>See <ptr target="#witten2005"/>.</note>
            </p>
            <p>We designed a test bed using this implementation that allowed for easy modification of
    several parameters including the minimum word count required for an article to be included in
    our model, as well as the minimum and maximum number of occurrences across the entire corpus
    required for a word or lemma to be included in our vocabulary. Massaging the data in this way,
    we can avoid, as much as possible, skewed results caused by high frequency function words or
    highly weighted words occurring in a relatively small number of articles. After several rounds
    of experimentation we found that our classifier was most successful when excluding articles of 25 words or fewer and words (lemmas) occurring in fewer than 4 articles.<note>Success can be gauged in various ways. The accuracy achieved when testing our classifier against the same set of classified articles on which it was trained is generally indicative of the Naive Bayesian algorithm's power. However, this sort of validation often leads to over-fitting. Given that our accuracy never exceeded 75%, we can only assume that there are enough anomalies in the data to have compensated for any over-fitting. Additionally, we believe that it is these anomalies that often end up being the most revelatory.</note>
            </p>
         </div>
         <div>
            <head>Results</head>
            <div>
               <head>Classifying the Unclassified</head>
               <p>Our first classification task was to train the classifier on the 54,289 articles that were assigned categories of knowledge by the editors of the <title rend="italic">Encyclopédie</title> and to then apply this model to the 22,796 unclassified articles in an attempt to predict class membership for the articles in question. Once classified, the twenty most frequent labels for the newly classified articles were:
    <table>
                     <head>Counts of top 20 most frequent classifications for previously unclassified articles.</head>
                     <row>
                        <cell>Jurisprudence</cell>
                        <cell>4276</cell>
                     </row>
                     <row>
                        <cell>Art méchanique</cell>
                        <cell>1260</cell>
                     </row>
                     <row>
                        <cell>Géographie</cell>
                        <cell>828</cell>
                     </row>
                     <row>
                        <cell>Commerce</cell>
                        <cell>802</cell>
                     </row>
                     <row>
                        <cell>Anatomie</cell>
                        <cell>643</cell>
                     </row>
                     <row>
                        <cell>Marine</cell>
                        <cell>557</cell>
                     </row>
                     <row>
                        <cell>Histoire moderne</cell>
                        <cell>475</cell>
                     </row>
                     <row>
                        <cell>Architecture</cell>
                        <cell>435</cell>
                     </row>
                     <row>
                        <cell>Grammaire</cell>
                        <cell>402</cell>
                     </row>
                     <row>
                        <cell>Histoire naturelle. Ornithologie</cell>
                        <cell>367</cell>
                     </row>
                     <row>
                        <cell>Medecine</cell>
                        <cell>363</cell>
                     </row>
                     <row>
                        <cell>Géographie moderne</cell>
                        <cell>347</cell>
                     </row>
                     <row>
                        <cell>Art militaire</cell>
                        <cell>311</cell>
                     </row>
                     <row>
                        <cell>Histoire ecclésiastique</cell>
                        <cell>308</cell>
                     </row>
                     <row>
                        <cell>Géométrie</cell>
                        <cell>306</cell>
                     </row>
                     <row>
                        <cell>Géographie ancienne</cell>
                        <cell>306</cell>
                     </row>
                     <row>
                        <cell>Musique</cell>
                        <cell>295</cell>
                     </row>
                     <row>
                        <cell>Astronomie</cell>
                        <cell>274</cell>
                     </row>
                     <row>
                        <cell>Histoire naturelle. Botanique</cell>
                        <cell>266</cell>
                     </row>
                     <row>
                        <cell>Théologie</cell>
                        <cell>215</cell>
                     </row>
                  </table>
     
     While this distribution of classes bears some resemblance to the overall distribution cited
     above, we have no real way of verifying the accuracy of the classifier given the unknown
     content/classes of the unclassified articles.<note>The accuracy of a classifier refers to its ability to correctly predict the label of an unseen instance. Typically, classifier accuracy is estimated using a test set that is independent of the training set, one that is often generated by hand-labeling a set of unseen documents and measuring how often the result of the classifier corresponds to the class given by the human tagger. However, asking modern scholars to classify a large number of articles by hand would necessarily involve not only a large amount of effort, but also a fair amount of guesswork, particularly given the fact that we are dealing with an ontology that is more than 200 years old.</note> While it is entirely plausible that 19% of the unclassified articles are concerned with Jurisprudence in a general sense, it is also likely that Jurisprudence (which represents only 5% of the classified articles) becomes a sort of <q>catch-all</q> category for the classifier into which articles concerned with any specific aspect of law (e.g., <emph>droit romain</emph>, <emph>droit canonique</emph>, <emph>droit civil</emph>, etc.) are grouped.</p>
               <p>The sample of results we examined reveals that the classifier performed reasonably well. By
     this, we mean that some classifications seemed right on; some made a good degree of sense,
     while others were perhaps a bit too general, failing to accurately represent the specificity of
     the subject matter. Naturally, the more than 22,000 newly generated classifications could not all be
     verified by hand, and so we focused on major articles and a selection of smaller ones. We were
     particularly encouraged by the assigned classifications for the 10 longest unclassified
     articles. The <title rend="italic">Discours Préliminaire</title>, d'Alembert's famous preface
     to the <title rend="italic">Encyclopédie</title> detailing the intellectual underpinnings of
     the enterprise, comes back as belonging to the class Philosophy.<note>This classification
      coincides nicely with the conclusions drawn by Martine Groult, one of our collaborators at the CNRS, whose work on the philosophy of <title rend="italic">Discours préliminaire</title> is summarized here: <ref target="http://encyclopedie.uchicago.edu/?q=node/162">http://encyclopedie.uchicago.edu/?q=node/162</ref>.</note>  Going down the list we see that the article <title rend="quotes">Anatomie</title> is assigned its own classification in Anatomy and <title rend="quotes">Chimie</title> is rightly placed into Chemistry, results we had originally hoped were easy enough for the algorithm to attain. Most of the classifications, however, don't fall into such clear categories. For example, <title rend="quotes">Venerie</title> — the art of hunting —  was assigned to Natural History;  the philosophical article <title rend="quotes">Eclectisme</title> to the History of Philosophy, etc. Indeed, while these and the better part of the predicted classifications can be justified on a general level, we had to concede that the overall utility of this task was somewhat questionable. Quite simply, as we mention above, there was just too much data to sort through. The new labels were often interesting, but we were not able to study them easily or thoroughly enough to come to any deeper understanding about how the <emph>
                     <foreign xml:lang="fr">philosophes</foreign>
                  </emph> structured knowledge or indeed how the <title rend="italic">Encyclopédie</title> itself fits together. Thus, trying to develop an experiment that could generate more legible results, we decided instead to leverage the information given us by the editors in exploring the known classifications and their relationship to each other and then later, to consider the classification scheme as a whole by examining the general distribution of classes over the entire work as opposed to individual instances.</p>
            </div>
            <div>
               <head>Classified vs. Classified — Feature set evaluation</head>
               <p>Having run a set of predictive classification experiments on the unclassified articles, our
     next task was an attempt at what we have named <q>comparative</q> classification, wherein we train
     the classifier on two particular classes, and then reclassify them in an effort to determine
     how separable they are and to ascertain which features best distinguish articles from the two
     sets.  The goal here has less to do with the accuracy of the classifications than with the
     feature sets that are generated during the classification task. Any two classes can be
     compared. Feature weights are generated using the Naive Bayes Perl module written by Ken Williams. These weights represent the conditional probability of a feature f given a class c and are generated based on their frequency in the known articles.<note>It is worth noting that Williams' implementation of the Naive Bayesian classifier uses the log-likelihood ratio of features <hi rend="italic">fi</hi> given each class c in question. It also uses a smoothing term to account for probabilities of zero. Because these probabilities are in general very small, their logarithmic weights are very negative. We find it easier to deal with positive numbers, so for display purposes, we make each of these scores positive by raising 2 to the power of that weight.  
      <graphic url="resources/images/figure01.jpg"/>
     Williams' Perl module can be downloaded at <ref target="http://search.cpan.org/~kwilliams/Algorithm-NaiveBayes-0.04/lib/Algorithm/NaiveBayes.pm">http://search.cpan.org/~kwilliams/Algorithm-NaiveBayes-0.04/lib/Algorithm/NaiveBayes.pm</ref>.</note> To give an example of this sort of comparative classification, 889 articles classified as <q>Histoire ancienne</q> were compared with 1194 articles in the <q>Histoire moderne</q> class and the following features and their weights were extracted as the most relevant in determining class membership:
    <table>
                     <head>Conditional probabilities of top 20 most common words in articles from <title rend="italic">Histoire ancienne</title>
                     </head>
                     <row>
                        <cell>étoient</cell>
                        <cell>0.04356</cell>
                     </row>
                     <row>
                        <cell>avoit</cell>
                        <cell>0.03705</cell>
                     </row>
                     <row>
                        <cell>romains</cell>
                        <cell>0.02472</cell>
                     </row>
                     <row>
                        <cell>avoyer</cell>
                        <cell>0.02455</cell>
                     </row>
                     <row>
                        <cell>an</cell>
                        <cell>0.02341</cell>
                     </row>
                     <row>
                        <cell>peuple</cell>
                        <cell>0.02271</cell>
                     </row>
                     <row>
                        <cell>chez</cell>
                        <cell>0.02188</cell>
                     </row>
                     <row>
                        <cell>sous</cell>
                        <cell>0.02182</cell>
                     </row>
                     <row>
                        <cell>tems</cell>
                        <cell>0.02170</cell>
                     </row>
                     <row>
                        <cell>mot</cell>
                        <cell>0.02146</cell>
                     </row>
                     <row>
                        <cell>empereur</cell>
                        <cell>0.02146</cell>
                     </row>
                     <row>
                        <cell>g</cell>
                        <cell>0.02122</cell>
                     </row>
                     <row>
                        <cell>appelloit</cell>
                        <cell>0.01880</cell>
                     </row>
                     <row>
                        <cell>premier</cell>
                        <cell>0.01847</cell>
                     </row>
                     <row>
                        <cell>nous</cell>
                        <cell>0.01802</cell>
                     </row>
                     <row>
                        <cell>encore</cell>
                        <cell>0.01783</cell>
                     </row>
                     <row>
                        <cell>après</cell>
                        <cell>0.01763</cell>
                     </row>
                     <row>
                        <cell>homme</cell>
                        <cell>0.01756</cell>
                     </row>
                     <row>
                        <cell>dieu</cell>
                        <cell>0.01683</cell>
                     </row>
                     <row>
                        <cell>rome</cell>
                        <cell>0.01683</cell>
                     </row>
                  </table>
                  <table>
                     <head>Conditional probabilities of top 20 most common words in articles from <title rend="italic">Histoire moderne</title>
                     </head>
                     <row>
                        <cell>roi</cell>
                        <cell>0.03744</cell>
                     </row>
                     <row>
                        <cell>ordre</cell>
                        <cell>0.02870</cell>
                     </row>
                     <row>
                        <cell>prince</cell>
                        <cell>0.02390</cell>
                     </row>
                     <row>
                        <cell>sous</cell>
                        <cell>0.02314</cell>
                     </row>
                     <row>
                        <cell>nommer</cell>
                        <cell>0.02180</cell>
                     </row>
                     <row>
                        <cell>titre</cell>
                        <cell>0.02165</cell>
                     </row>
                     <row>
                        <cell>empire</cell>
                        <cell>0.02065</cell>
                     </row>
                     <row>
                        <cell>chevalier</cell>
                        <cell>0.02046</cell>
                     </row>
                     <row>
                        <cell>officier</cell>
                        <cell>0.02039</cell>
                     </row>
                     <row>
                        <cell>tems</cell>
                        <cell>0.02030</cell>
                     </row>
                     <row>
                        <cell>étoient</cell>
                        <cell>0.02003</cell>
                     </row>
                     <row>
                        <cell>premier</cell>
                        <cell>0.01959</cell>
                     </row>
                     <row>
                        <cell>empereur</cell>
                        <cell>0.01903</cell>
                     </row>
                     <row>
                        <cell>porter</cell>
                        <cell>0.01891</cell>
                     </row>
                     <row>
                        <cell>état</cell>
                        <cell>0.01871</cell>
                     </row>
                     <row>
                        <cell>mot</cell>
                        <cell>0.01851</cell>
                     </row>
                     <row>
                        <cell>avoit</cell>
                        <cell>0.01823</cell>
                     </row>
                     <row>
                        <cell>prendre</cell>
                        <cell>0.01814</cell>
                     </row>
                     <row>
                        <cell>maître</cell>
                        <cell>0.01814</cell>
                     </row>
                     <row>
                        <cell>sans</cell>
                        <cell>0.01780</cell>
                     </row>
                  </table>
     
     The overall performance of the classifier came in at 95.63%, which tells us that while both belong to the same <q>branch</q> of science, namely History, the ancient and the modern are nonetheless significantly distinguishable from one another.  When considering the two lists of features, one immediately notices that the results make good sense, i.e., we find more verbs in the past tense (<emph>
                     <foreign xml:lang="fr">avoit</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">étoient</foreign>
                  </emph>, etc.) in the Ancient History articles as well as references to antiquity (<emph>
                     <foreign xml:lang="fr">romains</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">empereur</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">rome</foreign>
                  </emph>, etc.). The single-letter feature <q>g</q> is the signature of the Abbé Mallet who was the author of a significant portion of the articles on Ancient History. Interestingly, some of the features occur in both lists, which is more than likely a result of the close relationship and dialogue between the two classes. In the feature set for Modern History, we find references to forms of government that quite rightly belong to the modern period (<emph>
                     <foreign xml:lang="fr">état</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">prince</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">roi</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">chevalier</foreign>
                  </emph>, etc.) and the notable absence of the word <q>
                     <foreign xml:lang="fr">dieu</foreign>
                  </q> (<q>god</q>).</p>
               <p>We can also run comparative classification tasks on seemingly dissimilar classes of knowledge such as the 682 Literature articles and the 200 articles dealing with Physics. Not surprisingly, we obtain a very high rate of success for this sort of classification, in this case, 99.29%:
    <table>
                     <head>Conditional probabilities of top 20 most common words in articles from <title rend="italic">Littérature</title>.</head>
                     <row>
                        <cell>nous</cell>
                        <cell>0.03330</cell>
                     </row>
                     <row>
                        <cell>mot</cell>
                        <cell>0.02902</cell>
                     </row>
                     <row>
                        <cell>avoit</cell>
                        <cell>0.02607</cell>
                     </row>
                     <row>
                        <cell>étoient</cell>
                        <cell>0.02294</cell>
                     </row>
                     <row>
                        <cell>livre</cell>
                        <cell>0.02283</cell>
                     </row>
                     <row>
                        <cell>ancien</cell>
                        <cell>0.02149</cell>
                     </row>
                     <row>
                        <cell>je</cell>
                        <cell>0.02117</cell>
                     </row>
                     <row>
                        <cell>tems</cell>
                        <cell>0.02030</cell>
                     </row>
                     <row>
                        <cell>bien</cell>
                        <cell>0.01997</cell>
                     </row>
                     <row>
                        <cell>encore</cell>
                        <cell>0.01963</cell>
                     </row>
                     <row>
                        <cell>sans</cell>
                        <cell>0.01956</cell>
                     </row>
                     <row>
                        <cell>vers</cell>
                        <cell>0.01907</cell>
                     </row>
                     <row>
                        <cell>dieu</cell>
                        <cell>0.01835</cell>
                     </row>
                     <row>
                        <cell>auteur</cell>
                        <cell>0.01820</cell>
                     </row>
                     <row>
                        <cell>latin</cell>
                        <cell>0.01785</cell>
                     </row>
                     <row>
                        <cell>usage</cell>
                        <cell>0.01742</cell>
                     </row>
                     <row>
                        <cell>devoir</cell>
                        <cell>0.01738</cell>
                     </row>
                     <row>
                        <cell>notre</cell>
                        <cell>0.01695</cell>
                     </row>
                     <row>
                        <cell>homme</cell>
                        <cell>0.01691</cell>
                     </row>
                     <row>
                        <cell>chose</cell>
                        <cell>0.01671</cell>
                     </row>
                  </table>
                  <table>
                     <head>Conditional probabilities of top 20 most common words in articles from <title rend="italic">Physique</title>
                     </head>
                     <row>
                        <cell>corps</cell>
                        <cell>0.05394</cell>
                     </row>
                     <row>
                        <cell>air</cell>
                        <cell>0.04216</cell>
                     </row>
                     <row>
                        <cell>eau</cell>
                        <cell>0.04185</cell>
                     </row>
                     <row>
                        <cell>nous</cell>
                        <cell>0.03507</cell>
                     </row>
                     <row>
                        <cell>froid</cell>
                        <cell>0.02597</cell>
                     </row>
                     <row>
                        <cell>chaleur</cell>
                        <cell>0.02584</cell>
                     </row>
                     <row>
                        <cell>monsieur</cell>
                        <cell>0.02580</cell>
                     </row>
                     <row>
                        <cell>degré</cell>
                        <cell>0.02296</cell>
                     </row>
                     <row>
                        <cell>moins</cell>
                        <cell>0.02251</cell>
                     </row>
                     <row>
                        <cell>fort</cell>
                        <cell>0.02242</cell>
                     </row>
                     <row>
                        <cell>glace</cell>
                        <cell>0.02196</cell>
                     </row>
                     <row>
                        <cell>couleur</cell>
                        <cell>0.02169</cell>
                     </row>
                     <row>
                        <cell>feu</cell>
                        <cell>0.02160</cell>
                     </row>
                     <row>
                        <cell>lorsque</cell>
                        <cell>0.02109</cell>
                     </row>
                     <row>
                        <cell>effet</cell>
                        <cell>0.02048</cell>
                     </row>
                     <row>
                        <cell>peu</cell>
                        <cell>0.01991</cell>
                     </row>
                     <row>
                        <cell>rayon</cell>
                        <cell>0.01986</cell>
                     </row>
                     <row>
                        <cell>fluide</cell>
                        <cell>0.01962</cell>
                     </row>
                     <row>
                        <cell>mouvement</cell>
                        <cell>0.01952</cell>
                     </row>
                     <row>
                        <cell>trouver</cell>
                        <cell>0.01948</cell>
                     </row>
                  </table>
               </p>
               <p>The feature scores from this model are what we would expect. The Literature class' most significant features are those words pertaining to language and grammar (<emph>
                     <foreign xml:lang="fr">mot</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">livre</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">vers</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">auteur</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">latin</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">usage</foreign>
                  </emph>, etc.) whereas the Physics class is dominated by materialistic terminology (<emph>
                     <foreign xml:lang="fr">corps</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">air</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">eau</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">degré</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">fluide</foreign>
                  </emph>, <emph>
                     <foreign xml:lang="fr">mouvement</foreign>
                  </emph>, etc.) consistent with the scientific writings of the period.</p>
               <p>Evaluation of these feature sets can be invaluable when testing certain hypotheses, such as
     word usage differences across similar disciplines or between authors. In this particular case,
     the features provide an intuitive illustration of the differences between these two distinct
     classes of knowledge. While features are simply terms that the algorithm finds statistically
     representative of a particular class, the feature sets as a whole can also give a snapshot of
     the make-up of the individual classes or indeed of larger concepts more generally. From the
     list for literature, for example, we get a sense of the importance that classical Roman authors still had for the <emph>philosophes</emph> with the terms <q>ancien</q> (<q>ancient</q>) and <emph>latin</emph>.
                  <q>Vers</q> (<q>verse</q>) perhaps reflects the fact that versification was a predominant aspect of literary style, whether in poetic, dramatic, or other writing at that time. Terms such as <q>mot</q> (<q>word</q>) and <q>usage</q> (<q>use</q>) might point to the 18th century's expansive, belle-lettristic sense of literature that we discuss below. In a more general manner, feature sets provide us with an expanded thesaurus for any given classification task — leads for further investigation and study — that can then be exploited by a more traditional full text analysis system.<note>The importance of word usage and the evolution of language in the <title rend="italic">Encyclopédie</title>, key concepts when considering feature set analysis, are discussed in <ptr target="#anderson1984"/>.</note>
               </p>
            </div>
            <div>
               <head>Reclassifying the Classified — the Ontology of the Encyclopédie</head>
               <p>Finally, we applied the model assembled for our first experiment — trained on all of the known classifications — onto all of the already classified articles.  By this, we mean that we effectively ignored any given classes of knowledge, treating each article as if it were unclassified, and then assigned class membership using the algorithm described above.  Here our goal in the results analysis was twofold: first, we were curious as to the overall performance of our classification algorithm, i.e., how well it correctly labeled the known articles; and secondly, we wanted to use these new classifications to examine the outliers or misclassified articles in an attempt to understand better the presumed coherency and consistency of the editors' original classification scheme.</p>
               <p>We achieved a 71.4% success rate in the re-categorization of the 54,289 classified articles, a performance that could perhaps be improved with a more accurate morphological stemmer and the inclusion of n-grams as features, functions we intend to implement in the future. Nonetheless, developing a model to reliably guess an article's given class of knowledge is ultimately not our primary concern as even a perfect model, while impressive in terms of performance, could only yield that which we already know, namely the assigned classes of knowledge. The sheer size and complexity of the <title rend="italic">Encyclopédie</title>, drawing its contents from hundreds of distinct writers, all but guarantees a lower rate of performance for any classification algorithm. This fact need not be discouraging however, as we are more interested in exploring the use of these text mining techniques as knowledge discovery tools, uncovering previously unnoticed connections and classifications, such as the particular use of the class <q>Literature</q> outlined below, rather than simply using these approaches as a statistical platform for hypothesis testing.</p>
               <p>The twenty most frequent classes after re-classification:  
    <table>
                     <head>The 20 most frequent classes of knowledge by number of articles after re-classification.</head>
                     <row>
                        <cell>Géographie</cell>
                        <cell>3926</cell>
                     </row>
                     <row>
                        <cell>Géographie ancienne</cell>
                        <cell>3492</cell>
                     </row>
                     <row>
                        <cell>Géographie moderne</cell>
                        <cell>3273</cell>
                     </row>
                     <row>
                        <cell>Jurisprudence</cell>
                        <cell>2552</cell>
                     </row>
                     <row>
                        <cell>Commerce</cell>
                        <cell>2104</cell>
                     </row>
                     <row>
                        <cell>Art méchanique</cell>
                        <cell>1662</cell>
                     </row>
                     <row>
                        <cell>Histoire naturelle. Botanique</cell>
                        <cell>1615</cell>
                     </row>
                     <row>
                        <cell>Marine</cell>
                        <cell>1575</cell>
                     </row>
                     <row>
                        <cell>Histoire moderne</cell>
                        <cell>1514</cell>
                     </row>
                     <row>
                        <cell>Mythologie</cell>
                        <cell>1334</cell>
                     </row>
                     <row>
                        <cell>Architecture</cell>
                        <cell>1213</cell>
                     </row>
                     <row>
                        <cell>Grammaire</cell>
                        <cell>1111</cell>
                     </row>
                     <row>
                        <cell>Histoire ancienne</cell>
                        <cell>1061</cell>
                     </row>
                     <row>
                        <cell>Histoire ecclésiastique</cell>
                        <cell>781</cell>
                     </row>
                     <row>
                        <cell>Medecine</cell>
                        <cell>746</cell>
                     </row>
                     <row>
                        <cell>Histoire naturelle</cell>
                        <cell>727</cell>
                     </row>
                     <row>
                        <cell>Littérature</cell>
                        <cell>646</cell>
                     </row>
                     <row>
                        <cell>Maréchallerie</cell>
                        <cell>592</cell>
                     </row>
                     <row>
                        <cell>Morale</cell>
                        <cell>573</cell>
                     </row>
                     <row>
                        <cell>Jardinage</cell>
                        <cell>566</cell>
                     </row>
                  </table>
               </p>
               <p>When comparing the results to the original classifications we note that the class
     <q>Grammar</q> falls out of the top ten while <q>Art méchanique,</q> which
     is not included in the original top twenty, ranks as the sixth most frequent class. The Grammar
     class is known to be problematic as Diderot frequently used this seemingly innocuous label to
     hide more polemical entries.<note>The use and abuse of the category <q>Grammar</q> by Diderot is one of the subjects treated in <ptr target="#leca-tsiomis1999"/>.</note> As for the <q>Art méchanique</q> category, we suspect that many of the overly specific classes dealing with the mechanical arts were subsumed into this larger, more inclusive set. By and large the rest of the classes are consistent with the overall distribution in the <title rend="italic">Encyclopédie</title> although the rankings differ slightly.</p>
               <p>The most interesting results here come from the examination of misclassified articles, which betray vocabularies that do not belong probabilistically to their assigned categories. Upon analyzing a random subset of the misclassified articles, we identified three distinct types of misclassifications. First, there are articles whose original classification was too infrequent; for example, the article <title rend="quotes">Accrues</title> (metal rings used to knit together fishing nets) is the sole member of the class <q>Marchands de Filets</q> (net merchants) and was placed into the more general class of <q>Pêche</q> (fishing). There are also articles whose vocabularies mislead the classifier. One such case is the article <title rend="quotes">Achées</title> (a type of worm used in bait-fishing): originally classified as <q>Pêche,</q> it was later assigned to the class <q>Jardinage</q> (gardening). The article is in fact less a description of anything to do with fishing than a set of instructions on how to find and cultivate bait worms in a garden. Finally, there are entries whose predicted class, while incorrect, seems more logical than the original. The article <title rend="quotes">Tepidarium,</title> which describes an ancient Roman bathhouse, would appear to have more in common with its predicted class, Architecture, than the one assigned by the editors, namely Literature. Certainly our judgment that the predicted class is more appropriate than the original class of knowledge is biased by our modern epistemological paradigm, but this does not necessarily mean that the original system of classification was entirely consistent and coherent. Naturally then, applying our model to other 18th century French texts should provide further insight into the power of the classifier and, more importantly, into the ontology originally laid out by the <emph>philosophes</emph>.</p>
            </div>
            <div>
               <head>Classification outside of the Encyclopédie</head>
               <p>The <title rend="italic">Journal de Trévoux</title>, or <title rend="italic">Mémoires pour
    l'Histoire des Sciences &amp; des Beaux-Arts</title>, was one of the most influential 18th
     century French periodicals. A sort of literary/scholarly journal reviewing and commenting on a
     wide variety of contemporary publications, the <title rend="italic">Journal de Trévoux</title>
     dealt with almost every discipline of knowledge. Given the great variety of subject matter
     contained in this collection we felt it would be a natural choice for us to begin studying the
     relationship of the <title rend="italic">Encyclopédie</title> ontology to other 18th century texts. Of course, the 18th
     century in France was a time of intellectual ferment and, as in most historical moments, there
     existed more than one approach to the classification of the known world. And, indeed, the <emph>philosophes</emph>' intellectual and political bent made their way of organizing ideas different from that of other thinkers, particularly the Jesuits who were behind the <title rend="italic">Journal de Trévoux</title>. Knowing this, we wanted to test the degree of overlap between the structure of knowledge in the <title rend="italic">Encyclopédie</title> and the <title rend="italic">Journal de Trévoux</title>, discovering in the process the commonalities, differences, and unique aspects of each. We hoped this experiment would give us a "slice of life" look at the intellectual milieu of the day, or at least provide an insight into the presumed differences in discourse between the two camps. To this end, we processed the ARTFL Project's 109 volumes of the <title rend="italic">Journal de Trévoux</title>, splitting them into 1,027 separate articles. Extending from 1751 to 1757, this collection covers the years during which the editors of the <title rend="italic">Journal</title> engaged in a fierce polemic with the encyclopédistes concerning the publication of the <title rend="italic">Encyclopédie</title>.<note>The ARTFL database of the <title rend="italic">Journal de Trévoux</title> can be found at <ref target="http://www.lib.uchicago.edu/efts/ARTFL/projects/trevoux">http://www.lib.uchicago.edu/efts/ARTFL/projects/trevoux</ref>.</note> Our previous model, trained on all of the classified <title rend="italic">Encyclopédie</title> articles, was thus applied to the Trévoux articles, assigning each a predicted class of knowledge. The twenty most frequent assigned classes are listed below: 
     <table>
                     <head>The 20 most frequent classes of knowledge assigned to the Trévoux articles using the Encyclopédie model.</head>
                     <row>
                        <cell>Littérature</cell>
                        <cell>317</cell>
                     </row>
                     <row>
                        <cell>Morale</cell>
                        <cell>86</cell>
                     </row>
                     <row>
                        <cell>Géographie moderne</cell>
                        <cell>61</cell>
                     </row>
                     <row>
                        <cell>Théologie</cell>
                        <cell>54</cell>
                     </row>
                     <row>
                        <cell>Philosophie</cell>
                        <cell>50</cell>
                     </row>
                     <row>
                        <cell>Histoire moderne</cell>
                        <cell>46</cell>
                     </row>
                     <row>
                        <cell>Belles lettres</cell>
                        <cell>45</cell>
                     </row>
                     <row>
                        <cell>Astronomie</cell>
                        <cell>35</cell>
                     </row>
                     <row>
                        <cell>Métaphysique</cell>
                        <cell>30</cell>
                     </row>
                     <row>
                        <cell>Histoire ecclésiastique</cell>
                        <cell>26</cell>
                     </row>
                     <row>
                        <cell>Physique</cell>
                        <cell>22</cell>
                     </row>
                     <row>
                        <cell>Art militaire</cell>
                        <cell>18</cell>
                     </row>
                     <row>
                        <cell>Economie politique</cell>
                        <cell>18</cell>
                     </row>
                     <row>
                        <cell>Géographie</cell>
                        <cell>16</cell>
                     </row>
                     <row>
                        <cell>Medecine</cell>
                        <cell>16</cell>
                     </row>
                     <row>
                        <cell>Histoire romaine</cell>
                        <cell>14</cell>
                     </row>
                     <row>
                        <cell>Peinture</cell>
                        <cell>14</cell>
                     </row>
                     <row>
                        <cell>Histoire</cell>
                        <cell>14</cell>
                     </row>
                     <row>
                        <cell>Histoire naturelle</cell>
                        <cell>13</cell>
                     </row>
                     <row>
                        <cell>Chimie</cell>
                        <cell>12</cell>
                     </row>
                  </table>
               </p>
               <p>A cursory glance at these results gives us a general idea about the most significant themes found in the <title rend="italic">Journal de Trévoux</title>, themes that correspond nicely to our preconceived notions concerning the <title rend="italic">Journal</title>, its writers and subject matter. Along these lines, it is not surprising to find in a Jesuit publication such as this a greater emphasis on articles about Literature, Morality, Theology, and Philosophy. However, the surprising fact that more than 1/3 of the 1000 articles were assigned the label of <title rend="italic">Littérature</title> caused us to question somewhat the performance of the classifier and ultimately, to reconsider our modern notion of Literature when applied to the specific instances of this classification.</p>
               <p>In the first edition (1694) of the <title rend="italic">Dictionnaire de l'Académie française</title> the entry for <title rend="quotes">Littérature</title> reads thus: <quote rend="inline">
                     <foreign xml:lang="fr">Litterature. s. f. Erudition, doctrine. <hi rend="italic">Grande litterature. profonde litterature. il est homme de grande litterature. il n'a point de litterature. il a beaucoup de litterature.</hi>
                     </foreign>
                  </quote> and indeed the definition changes little by the fourth edition of 1762: <quote rend="inline">
                     <foreign xml:lang="fr">LITTÉRATURE. s.f. Érudition, doctrine. <hi rend="italic">Grande littérature. Profonde littérature. Il est homme de grande littérature. N'avoir point de littérature. Avoir beaucoup de littérature. Un ouvrage plein de littérature.</hi> Ce mot regarde proprement les Belles-Lettres.</foreign>
                  </quote> The addition of the last sentence, <quote rend="inline">This word is properly used in regard to Belles Lettres,</quote> in the 1762 edition seemingly restricts this particular form of erudition to the more traditionally literary realm of the <q>Belles-Lettres,</q> or Poetry and Rhetoric. The definition offered by the Jesuit editors of the <title rend="italic">Dictionnaire de Trévoux</title> (1742) differs only slightly from that of the Academy: <quote rend="inline">LITTÉRATURE, s. f. Doctrine, connoissance profonde des Lettres. <hi rend="italic">Doctrina, litteratura, eruditio</hi>.</quote> While these definitions shed little light as to why the <title rend="italic">Encyclopédie</title> literature class should be so prevalent in the classification of such a diverse collection of articles, many of which deal with the Sciences and Natural History, the ambiguity of this erudite possession of <q>
                     <foreign xml:lang="fr">littérature</foreign>
                  </q> and <q>
                     <foreign xml:lang="fr">lettres</foreign>
                  </q> should nonetheless cause us to broaden our understanding of these terms as they were used in the mid-18th century.</p>
               <p>We thus began a more thorough investigation of the Literature category by examining five randomly
     selected articles belonging to the assigned class <q>Littérature</q> in the <title rend="italic">Journal de Trévoux</title>. While some categorizations make sense as literature
     — e.g., the article <title rend="quotes">Nouvelles Litteraires,</title> a sort of literary
     <q>news of the day</q>; and, less convincingly, a commentary on Rousseau's first
     discourse — others have ostensibly nothing to do with our modern idea of Literature — e.g.,
     articles commenting on a history of jurisprudence, a treatise on diseases, and a compilation of treatises on Physics and Natural History<note>The full titles of the <q>Literature</q> articles we uncovered are as follows: <foreign xml:lang="fr">1) ARTICLE XII. NOUVELLES LITTERAIRES. 2) ARTICLE XXIX. DISCOURS QUI A REMPORTE' le prix à l'Académie de Dijon en l'année 1750, sur cette question proposée par la même Académie: Si le rétablissement des Sciences &amp; des Arts a contribué à épurer les moeurs. Par un Citoyen de Genève. 3) ARTICLE XXXV. HISTOIRE DE LA JURISPRUDENCE Romaine. 4) ARTICLE LXI. TRAITÉ DES MALADIES qu'il est dangereux de guérir. 5) ARTICLE XIV. RECUEIL DE DIFFERENS Traités de Physique &amp; d'Histoire Naturelle.</foreign>
                  </note>. This apparent anomaly necessarily leads us back to the <title rend="italic">Encyclopédie</title> and the articles belonging to the class of knowledge Literature, which serve as the basis for these class assignments.</p>
               <p>As we mentioned above concerning the reclassification of the article <title rend="quotes">Tepidarium,</title> there are a great many articles whose original
     classifications seem inappropriate. This phenomenon is all the more evident when examining the
     reclassification of the Literature articles, the majority of which deal more with Ancient
     History, Mythology, and Architecture than with accepted literary issues. Of the 682 Literature
     articles, 460 were written by the Chevalier de Jaucourt, author of more than 17,000 <title rend="italic">Encyclopédie</title> entries. Jaucourt is known to have borrowed extensively from other sources, and thus we initially attributed these inconsistencies to intellectual laziness, given the enormous number of articles for which he was responsible.<note>For a thorough discussion of Jaucourt's contributions to the <title rend="italic">Encyclopédie</title>, see <ptr target="#lough1973"/>.</note> Upon closer examination of the Literature class of knowledge, however, this characterization proves unjust.</p>
               <p>Indeed, the article titled <title rend="quotes">Littérature</title> belongs not to its own
     class of knowledge, but rather to three seemingly unrelated and disparate classes: Sciences,
     <emph>Belles-Lettres</emph>, and Antiquity. The text of the article, written by Jaucourt, is in
     fact a polemic advocating a universal erudition and an expanded definition of what it means to possess a great literature — in a word to be literate. Jaucourt includes a <emph>renvoi</emph> to the article <q>Lettres</q> in an effort to define better this notion of Literature. Following the cross-reference we find that the article in question, <q>Lettres,</q> an article that normally falls innocuously amongst numerous similarly titled entries, is the sole member of the class <q>Encyclopédie,</q> suggesting that the idea of literacy is essential to the entire encyclopedic enterprise. Here, Jaucourt's understanding of <q>Lettres</q> as a much larger category of knowledge than <q>belles-lettres</q> or even the Humanities as a whole (<emph>les lettres humaines</emph>), harkens back to the Classical acceptation of the term which encompassed all areas of human understanding from Epic Poetry to Physics. The inter-connectedness of knowledge, both literary and scientific, is thus the essence of Jaucourt's idea of encyclopedic literacy, wherein <quote rend="inline">
                     <foreign xml:lang="fr">il en résulte que les <emph>lettres</emph> &amp; les sciences proprement dites, ont entr'elles l'enchaînement, les liaisons, &amp; les rapports les plus étroits; c'est dans l'<title rend="italic">Encyclopédie</title> qu'il importe de le démontrer</foreign>
                  </quote> (<quote rend="inline">the result is that Letters and Sciences, properly speaking, have between each other a strong and direct network of links and relationships; it is in the Encyclopaedia that the demonstration of this network becomes important</quote>).<note>From the article <title rend="quotes">Lettres</title> in the <title rend="italic">Encyclopédie</title>.</note>
               </p>
            </div>
         </div>
         <div>
            <head>Conclusions and Future Work</head>
            <p>This discovery — that for the writers of the <title rend="italic">Encyclopédie</title>, Literature as a branch of human understanding included not only what we today consider Literature (<emph>
                  <foreign xml:lang="fr">les Belles-Lettres</foreign>
               </emph>) but also Natural History, the Natural Sciences, the study of Antiquity, etc. — was made possible through the machine learning techniques outlined in this paper. We now understand precisely why the classifier, using the ontology of the <title rend="italic">Encyclopédie</title>, labels so many Trévoux entries as <q>Littérature.</q> Furthermore, the presence of this more inclusive view of lettered erudition should come as no surprise given that, as is expressed in the very title of the <title rend="italic">Encyclopédie</title>, this <title rend="italic">Dictionnaire raisonné</title> was the work of a society of <q>Gens de Lettres.</q>
            </p>
            <p>It would seem, however, that this notion of Literature as a sort of universal erudition did
    not survive the turmoil of the late 18th century, and by 1798 the fifth edition of the <title rend="italic">Dictionnaire de l'Académie française</title> had already begun restricting
    Literature to a purely literary domain, defined as the <quote rend="inline">
                  <foreign xml:lang="fr">Connoissance des ouvrages, des matières, des règles, et des exemples
     <emph>littéraires</emph>
                  </foreign>
               </quote> (our emphasis).<note>Note that we are focusing exclusively on the definitions of Literature as a discipline, rather than a collection of literary and/or other documents (as in <q>I have consulted all the available literature on cancer and have found nothing</q>). This modern usage of literature is expressed somewhat in the dictionaries we mention, i.e. <quote rend="inline">
                     <foreign xml:lang="fr">L'ensemble des productions littéraires d'une nation, d'un pays, d'une époque. La littérature française. La littérature du moyen âge</foreign>
                  </quote> (Émile Littré, <title rend="italic">Dictionnaire de la langue française</title>, 1872-77).</note> Nineteenth-century lexicography would move further in this direction, eliminating any mention of <q>doctrine</q> or <q>erudition</q> in its definition of Literature as <quote rend="inline">
                  <foreign xml:lang="fr">La science qui comprend la grammaire, l'éloquence et la poésie, et qu'on appelle autrement Belles-lettres</foreign>
               </quote> (<title rend="italic">Dictionnaire de l'Académie française</title>, 6th edition 1832).  By 1872, perhaps as a reflection of the disciplinary codification which took place during the first part of the 19th century, the positivist lexicographer Émile Littré simply defines Literature as the <quote rend="inline">
                  <foreign xml:lang="fr">Connaissance des belles-lettres</foreign>
               </quote> which is by and large its accepted meaning today.</p>
            <p>These modest conclusions lend further weight to our initial view that traditional humanistic inquiry can be enhanced and broadened through the judicious application of machine learning and data mining techniques. As large-scale textual resources such as the <title rend="italic">Encyclopédie</title> become more readily available to scholars in a digitized format, new search and analysis tools will be needed. It is our opinion that approaches similar to those outlined above can successfully leverage the power of data mining tools for use in the Humanities. And while these techniques can certainly aid in a variety of hypothesis testing and classification tasks, it is our hope that they will also lead to the discovery of new knowledge through the uncovering of previously unnoticed textual connections.</p>
            <p>Moving forward, we plan to continue improving the performance of our classifiers through better morphological text extraction, allowing for a greater freedom in the selection of features. Possible features would include n-grams (bi- and tri-grams of surface forms, bi- and tri-grams of lemmas) as well as part-of-speech information. We are also planning to investigate several different unsupervised machine learning techniques such as vector space analysis, latent semantic indexing (LSI), and several other clustering models. When applied to the <title rend="italic">Encyclopédie</title> and other 18th-century works, these tools will propose connections based on a measure of lexical similarity between arbitrary chunks of text, whether paragraphs, articles, chapters, or entire works. It is our hope that within this new system, researchers will be able to explore and, more importantly, to evaluate the proposed connections between these articles and texts. Although the connections will be brought to light with the help of computers, it will nonetheless be necessary for scholars to provide the system with the critical element of human scrutiny that is essential to Humanities research.</p>
         </div>
      </body>
      <back>
         <listBibl>
            <bibl xml:id="anderson1984" label="Anderson 1984"> Anderson, Wilda. <title rend="italic">Between the Library and the Laboratory: The Language of Chemistry in Eighteenth-Century France</title>. Baltimore: Johns Hopkins, 1984.</bibl>
            <bibl xml:id="andreev1999" label="Andreev et al. 1999"> Andreev, Leonid, Jack Iverson, and Mark Olsen. <title rend="quotes">Re-engineering a War Machine: ARTFL's <title rend="italic">Encyclopédie</title>
               </title> in <title rend="italic">Literary and Linguistic Computing</title> 14:1 (1999), 11-28.</bibl>
            <bibl xml:id="blanchard2002" label="Blanchard and Olsen 2002"> Blanchard, Gilles, and Mark Olsen. <title rend="quotes">Le système de renvois dans l'Encyclopédie: une cartographie de la structure des connaissances au XVIIIème siècle</title> in <title rend="italic">Recherches sur Diderot et sur l'Encyclopédie</title>, 31-32, April 2002, 45-70.</bibl>
            <bibl xml:id="brian1998" label="Brian 1998"> Brian, Eric. <title rend="quotes">L'ancêtre de l'hypertexte,</title> in <title rend="italic">Les Cahiers de Science et Vie</title>, 47 (October 1998), 28-38.</bibl>
            <bibl xml:id="hand2001" label="Hand and Yu 2001"> Hand, David J., and Keming Yu. <title rend="quotes">Idiot's Bayes — Not So Stupid after All?</title> in <title rend="italic">International Statistical Review</title> 69:3 (December 2001), 385-398.</bibl>
            <bibl xml:id="hoyt1965" label="Hoyt and Cassirer 1965"> Hoyt, Nelly, and Thomas Cassirer. <title rend="quotes">Introduction</title> to <title rend="italic">Encyclopedia: Selections by Diderot, D'Alembert, and a Society of Men of Letters</title>. Indianapolis: Bobbs-Merrill, 1965.</bibl>
            <bibl xml:id="leca-tsiomis1999" label="Leca-Tsiomis 1999"> Leca-Tsiomis, Marie. <title rend="italic">Ecrire l'Encyclopédie: Diderot: de l'usage des dictionnaires à la grammaire philosophique</title>. Oxford: Voltaire Foundation, 1999.</bibl>
            <bibl xml:id="lough1973" label="Lough 1973"> Lough, John. <title rend="italic">The Contributors to the <title rend="quotes">Encyclopédie</title>
               </title>. London: Grant and Cutler, 1973.</bibl>
            <bibl xml:id="morrissey2001" label="Morrissey et al. 2001"> Morrissey, Robert, Jack Iverson, and Mark Olsen. <title rend="quotes">Présentation: L'Encyclopédie Electronique</title> in Robert Morrissey and Philippe Roger, eds., <title rend="italic">L'Encyclopédie du réseau au livre et du livre au réseau</title>. Paris: Champion, 2001. 17-27.</bibl>
            <bibl xml:id="proust1995" label="Proust 1995">  Proust, Jacques. <title rend="italic">Diderot et l'Encyclopédie</title>. Paris: Albin Michel, 1995.</bibl>
            <bibl xml:id="schwab1971" label="Schwab et al. 1971-1984"> Schwab, Richard, Walter Rex, and John Lough. <title rend="italic">Inventory of Diderot's Encyclopédie</title>, 7 vols. Oxford: Studies on Voltaire and the Eighteenth Century, 1971-1984.</bibl>
            <bibl xml:id="witten2005" label="Witten and Frank 2005"> Witten, Ian, and Eibe Frank. <title rend="italic">Data Mining: Practical Machine Learning Tools and Techniques</title>, 2nd ed. San Francisco, CA:  Morgan Kaufmann, 2005, 94-96.</bibl>
         </listBibl>
      </back>
   </text>
</TEI>