testing turtle prefix names where reference starts with number (#1590)

* testing turtle prefix names where reference starts with number
* remove case insensitive flag from Turtle lexer
* use same end-of-string regex as in SPARQL and ShExC
* make example.ttl valid turtle
pygments · Dec 5, 2020 · 95935cc · 95935cc
1 parent 97457ad
commit 95935cc
Show file tree

Hide file tree

Showing 3 changed files with 99 additions and 17 deletions.
diff --git a/pygments/lexers/rdf.py b/pygments/lexers/rdf.py
@@ -187,19 +187,61 @@ class TurtleLexer(RegexLexer):
     filenames = ['*.ttl']
     mimetypes = ['text/turtle', 'application/x-turtle']
 
-    flags = re.IGNORECASE
+    # character group definitions ::
+    PN_CHARS_BASE_GRP = ('a-zA-Z'
+                         '\u00c0-\u00d6'
+                         '\u00d8-\u00f6'
+                         '\u00f8-\u02ff'
+                         '\u0370-\u037d'
+                         '\u037f-\u1fff'
+                         '\u200c-\u200d'
+                         '\u2070-\u218f'
+                         '\u2c00-\u2fef'
+                         '\u3001-\ud7ff'
+                         '\uf900-\ufdcf'
+                         '\ufdf0-\ufffd')
+
+    PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')
+
+    PN_CHARS_GRP = (PN_CHARS_U_GRP +
+                    r'\-' +
+                    r'0-9' +
+                    '\u00b7' +
+                    '\u0300-\u036f' +
+                    '\u203f-\u2040')
+
+    PN_CHARS = '[' + PN_CHARS_GRP + ']'
+
+    PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'
+
+    PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'
+
+    HEX_GRP = '0-9A-Fa-f'
+
+    HEX = '[' + HEX_GRP + ']'
+
+    PERCENT = '%' + HEX + HEX
+
+    PN_LOCAL_ESC_CHARS_GRP = r' _~.\-!$&"()*+,;=/?#@%'
+
+    PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'
+
+    PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS
+
+    PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'
+
+    PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
+                '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
+                PN_CHARS_GRP + ':]|' + PLX + '))?')
 
     patterns = {
-        'PNAME_NS': r'((?:[a-z][\w-]*)?\:)',  # Simplified character range
+        'PNAME_NS': r'((?:[a-zA-Z][\w-]*)?\:)',  # Simplified character range
         'IRIREF': r'(<[^<>"{}|^`\\\x00-\x20]*>)'
     }
 
-    # PNAME_NS PN_LOCAL (with simplified character range)
-    patterns['PrefixedName'] = r'%(PNAME_NS)s([a-z][\w-]*)' % patterns
-
     tokens = {
         'root': [
-            (r'\s+', Whitespace),
+            (r'\s+', Text),
 
             # Base / prefix
             (r'(@base|BASE)(\s+)%(IRIREF)s(\s*)(\.?)' % patterns,
@@ -216,8 +258,8 @@ class TurtleLexer(RegexLexer):
             (r'%(IRIREF)s' % patterns, Name.Variable),
 
             # PrefixedName
-            (r'%(PrefixedName)s' % patterns,
-             bygroups(Name.Namespace, Name.Tag)),
+            (r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + r')?',
+             bygroups(Name.Namespace, Punctuation, Name.Tag)),
 
             # Comment
             (r'#[^\n]+', Comment),
@@ -257,12 +299,10 @@ class TurtleLexer(RegexLexer):
             (r'.', String, '#pop'),
         ],
         'end-of-string': [
-            (r'(@)([a-z]+(:?-[a-z0-9]+)*)',
+            (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
              bygroups(Operator, Generic.Emph), '#pop:2'),
 
             (r'(\^\^)%(IRIREF)s' % patterns, bygroups(Operator, Generic.Emph), '#pop:2'),
-            (r'(\^\^)%(PrefixedName)s' % patterns,
-             bygroups(Operator, Generic.Emph, Generic.Emph), '#pop:2'),
 
             default('#pop:2'),
 

diff --git a/tests/examplefiles/example.ttl b/tests/examplefiles/example.ttl
@@ -2,14 +2,14 @@
 @prefix dcterms: <http://purl.org/dc/terms/>. @prefix xs: <http://www.w3.org/2001/XMLSchema> .
 @prefix mads: <http://www.loc.gov/mads/rdf/v1#> .
 @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
-@PREFIX dc: <http://purl.org/dc/elements/1.1/>  # SPARQL-like syntax is OK
+PREFIX dc: <http://purl.org/dc/elements/1.1/>  # SPARQL-like syntax is OK
 @prefix : <http://xmlns.com/foaf/0.1/> .  # empty prefix is OK
 
 <http://example.org/#spiderman> <http://www.perceive.net/schemas/relationship/enemyOf> <http://example.org/#green-goblin> .
 
-<#doc1> a <#document>
+<#doc1> a <#document>;
 	dc:creator "Smith", "Jones"; 
-	:knows <http://getopenid.com/jsmith>
+	:knows <http://getopenid.com/jsmith>;
 	dcterms:hasPart [ # A comment
 		dc:title "Some title", "Some other title";
 		dc:creator "برشت، برتولد"@ar;
@@ -23,8 +23,8 @@
 
 <http://data.ub.uio.no/realfagstermer/006839> a mads:Topic,
     skos:Concept ;
-    dcterms:created "2014-08-25"^^xsd:date ;
-    dcterms:modified "2014-11-12"^^xsd:date ;
+    dcterms:created "2014-08-25"^^xs:date ;
+    dcterms:modified "2014-11-12"^^xs:date ;
     dcterms:identifier "REAL006839" ;
     skos:prefLabel "Flerbørstemarker"@nb,
         "Polychaeta"@la ;
@@ -33,7 +33,7 @@
         "Mangebørsteormer"@nb,
         "Havbørsteormer"@nb,
         "Havbørstemarker"@nb,
-        "Polycheter"@nb.
+        "Polycheter"@nb ;
     skos:inScheme <http://data.ub.uio.no/realfagstermer/> ;
     skos:narrower <http://data.ub.uio.no/realfagstermer/018529>,
         <http://data.ub.uio.no/realfagstermer/024538>,

diff --git a/tests/test_rdf.py b/tests/test_rdf.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+"""
+    Basic RDF Lexer Test
+    ~~~~~~~~~~~~~~~~~~~~
+
+    :copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import pytest
+
+from pygments.token import Name, Punctuation, Text
+from pygments.lexers import TurtleLexer, ShExCLexer
+
+
+@pytest.fixture(scope='module')
+def turtle_lexer():
+    yield TurtleLexer()
+
+@pytest.fixture(scope='module')
+def shexc_lexer():
+    yield ShExCLexer()
+
+def test_turtle_prefixed_name_starting_with_number(turtle_lexer):
+    fragment = 'alice:6f6e4241-75a2-4780-9b2a-40da53082e54\n'
+    tokens = [
+        (Name.Namespace, 'alice'),
+        (Punctuation, ':'),
+        (Name.Tag, '6f6e4241-75a2-4780-9b2a-40da53082e54'),
+        (Text, '\n'),
+    ]
+    assert list(turtle_lexer.get_tokens(fragment)) == tokens
+
+def test_shexc_prefixed_name_starting_with_number(shexc_lexer):
+    fragment = 'alice:6f6e4241-75a2-4780-9b2a-40da53082e54\n'
+    tokens = [
+        (Name.Namespace, 'alice'),
+        (Punctuation, ':'),
+        (Name.Tag, '6f6e4241-75a2-4780-9b2a-40da53082e54'),
+        (Text, '\n'),
+    ]
+    assert list(shexc_lexer.get_tokens(fragment)) == tokens