<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="../schemas/TAN-key.rnc" type="application/relax-ng-compact-syntax"?>
<?xml-model href="../schemas/TAN-key.sch" type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"?>
<TAN-key xmlns="tag:textalign.net,2015:ns" TAN-version="1 dev"
    id="tag:textalign.net,2015:tan-key:tokenizations">
    <!-- Header metadata for this keyword file: title, description, canonical location,
         license, the responsible agent and their role, and the change log. -->
    <head>
        <name>TAN keywords for types of token definitions</name>
        <desc>Definitive list of key terms used to name standard token definitions.</desc>
        <!-- NOTE(review): this href (like the title above) says "token-definitions", whereas the
             root element's @id ends in "tokenizations"; confirm the difference is intended. -->
        <master-location href="http://textalign.net/release/TAN-1-dev/TAN-key/token-definitions.TAN-key.xml"/>
        <!-- License statement for this file; the @rights-holder value matches the xml:id of the
             agent declared below. -->
        <rights-excluding-sources rights-holder="kalvesmaki">
            <IRI>http://creativecommons.org/licenses/by/4.0/deed.en_US</IRI>
            <name>Creative Commons Attribution 4.0 International License</name>
            <desc>This license is granted independent of rights and licenses associated with the
                source. </desc>
        </rights-excluding-sources>
        <!-- Intentionally empty: this file makes no declarations. -->
        <declarations/>
        <!-- The single agent, identified by two IRIs; the @roles value matches the xml:id of the
             role declared next. -->
        <agent xml:id="kalvesmaki" roles="creator">
            <IRI>http://viaf.org/viaf/299582703</IRI>
            <IRI>tag:textalign.net,2015:agent:kalvesmaki:joel</IRI>
            <name xml:lang="eng">Joel Kalvesmaki</name>
        </agent>
        <role xml:id="creator">
            <IRI>http://schema.org/creator</IRI>
            <name xml:lang="eng">creator</name>
        </role>
        <!-- Change log, oldest entry first; each @who value matches the agent's xml:id. -->
        <change when="2016-02-02" who="kalvesmaki">Started file</change>
        <change when="2016-02-22" who="kalvesmaki">Revised to suit new
            &lt;token-definition></change>
    </head>
    <body in-progress="false" affects-element="token-definition">
        <!-- Words-only definition: a token is one or more characters drawn from \w plus the soft
             hyphen (U+00AD), zero width space (U+200B) and zero width joiner (U+200D), so those
             three invisible characters do not break a word apart. The <name> elements appear to
             be alternative keywords for this one definition (TODO confirm against the schema). -->
        <item>
            <token-definition regex="[\w&#xad;&#x200b;&#x200d;]+"/>
            <name>letters</name>
            <name>letters only</name>
            <name>general-words-only-1</name>
            <name>general-words-only</name>
            <name>gwo</name>
            <desc>General tokenization pattern for any language, words only. Non-letters such as
                punctuation are ignored.</desc>
        </item>
        <!-- Words-and-punctuation definition. The first branch is the same word class used by
             the words-only definition in this file: \w plus the soft hyphen (U+00AD), zero width
             space (U+200B) and zero width joiner (U+200D). The second branch matches any single
             character that is neither in that word class nor whitespace. The three invisible
             characters are listed explicitly in both branches because they match neither \w nor
             \s; without them a soft hyphen inside a word would split the word and be emitted as
             a token of its own. -->
        <item>
            <token-definition regex="[\w&#xad;&#x200b;&#x200d;]+|[^\w&#xad;&#x200b;&#x200d;\s]"/>
            <name>letters and punctuation</name>
            <name>general-1</name>
            <name>general</name>
            <name>gen</name>
            <desc>General tokenization pattern for any language, treating not only series of letters
                as word tokens but also individual non-letter characters (e.g., punctuation).</desc>
        </item>
        <!-- Whitespace-delimited definition: a token is one or more consecutive non-whitespace
             characters (\S+), so punctuation stays attached to the adjacent word. -->
        <item>
            <token-definition regex="\S+"/>
            <name>nonspace</name>
            <desc>General tokenization pattern for any language, treating any contiguous run of
                nonspace marks as a word.</desc>
        </item>
    </body>
</TAN-key>
