Skip to content

Commit

Permalink
testing turtle prefix names where reference starts with number (#1590)
Browse files Browse the repository at this point in the history
* testing turtle prefix names where reference starts with number

* remove case insensitive flag from Turtle lexer

* use same end-of-string regex as in SPARQL and ShExC

* make example.ttl valid turtle
  • Loading branch information
elf-pavlik committed Dec 5, 2020
1 parent 97457ad commit 95935cc
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 17 deletions.
62 changes: 51 additions & 11 deletions pygments/lexers/rdf.py
Expand Up @@ -187,19 +187,61 @@ class TurtleLexer(RegexLexer):
filenames = ['*.ttl']
mimetypes = ['text/turtle', 'application/x-turtle']

flags = re.IGNORECASE
# character group definitions ::
PN_CHARS_BASE_GRP = ('a-zA-Z'
'\u00c0-\u00d6'
'\u00d8-\u00f6'
'\u00f8-\u02ff'
'\u0370-\u037d'
'\u037f-\u1fff'
'\u200c-\u200d'
'\u2070-\u218f'
'\u2c00-\u2fef'
'\u3001-\ud7ff'
'\uf900-\ufdcf'
'\ufdf0-\ufffd')

PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')

PN_CHARS_GRP = (PN_CHARS_U_GRP +
r'\-' +
r'0-9' +
'\u00b7' +
'\u0300-\u036f' +
'\u203f-\u2040')

PN_CHARS = '[' + PN_CHARS_GRP + ']'

PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'

PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'

HEX_GRP = '0-9A-Fa-f'

HEX = '[' + HEX_GRP + ']'

PERCENT = '%' + HEX + HEX

PN_LOCAL_ESC_CHARS_GRP = r' _~.\-!$&"()*+,;=/?#@%'

PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'

PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS

PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'

PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
'(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
PN_CHARS_GRP + ':]|' + PLX + '))?')

patterns = {
'PNAME_NS': r'((?:[a-z][\w-]*)?\:)', # Simplified character range
'PNAME_NS': r'((?:[a-zA-Z][\w-]*)?\:)', # Simplified character range
'IRIREF': r'(<[^<>"{}|^`\\\x00-\x20]*>)'
}

# PNAME_NS PN_LOCAL (with simplified character range)
patterns['PrefixedName'] = r'%(PNAME_NS)s([a-z][\w-]*)' % patterns

tokens = {
'root': [
(r'\s+', Whitespace),
(r'\s+', Text),

# Base / prefix
(r'(@base|BASE)(\s+)%(IRIREF)s(\s*)(\.?)' % patterns,
Expand All @@ -216,8 +258,8 @@ class TurtleLexer(RegexLexer):
(r'%(IRIREF)s' % patterns, Name.Variable),

# PrefixedName
(r'%(PrefixedName)s' % patterns,
bygroups(Name.Namespace, Name.Tag)),
(r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + r')?',
bygroups(Name.Namespace, Punctuation, Name.Tag)),

# Comment
(r'#[^\n]+', Comment),
Expand Down Expand Up @@ -257,12 +299,10 @@ class TurtleLexer(RegexLexer):
(r'.', String, '#pop'),
],
'end-of-string': [
(r'(@)([a-z]+(:?-[a-z0-9]+)*)',
(r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
bygroups(Operator, Generic.Emph), '#pop:2'),

(r'(\^\^)%(IRIREF)s' % patterns, bygroups(Operator, Generic.Emph), '#pop:2'),
(r'(\^\^)%(PrefixedName)s' % patterns,
bygroups(Operator, Generic.Emph, Generic.Emph), '#pop:2'),

default('#pop:2'),

Expand Down
12 changes: 6 additions & 6 deletions tests/examplefiles/example.ttl
Expand Up @@ -2,14 +2,14 @@
@prefix dcterms: <http://purl.org/dc/terms/>. @prefix xs: <http://www.w3.org/2001/XMLSchema> .
@prefix mads: <http://www.loc.gov/mads/rdf/v1#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@PREFIX dc: <http://purl.org/dc/elements/1.1/> # SPARQL-like syntax is OK
PREFIX dc: <http://purl.org/dc/elements/1.1/> # SPARQL-like syntax is OK
@prefix : <http://xmlns.com/foaf/0.1/> . # empty prefix is OK

<http://example.org/#spiderman> <http://www.perceive.net/schemas/relationship/enemyOf> <http://example.org/#green-goblin> .

<#doc1> a <#document>
<#doc1> a <#document>;
dc:creator "Smith", "Jones";
:knows <http://getopenid.com/jsmith>
:knows <http://getopenid.com/jsmith>;
dcterms:hasPart [ # A comment
dc:title "Some title", "Some other title";
dc:creator "برشت، برتولد"@ar;
Expand All @@ -23,8 +23,8 @@

<http://data.ub.uio.no/realfagstermer/006839> a mads:Topic,
skos:Concept ;
dcterms:created "2014-08-25"^^xsd:date ;
dcterms:modified "2014-11-12"^^xsd:date ;
dcterms:created "2014-08-25"^^xs:date ;
dcterms:modified "2014-11-12"^^xs:date ;
dcterms:identifier "REAL006839" ;
skos:prefLabel "Flerbørstemarker"@nb,
"Polychaeta"@la ;
Expand All @@ -33,7 +33,7 @@
"Mangebørsteormer"@nb,
"Havbørsteormer"@nb,
"Havbørstemarker"@nb,
"Polycheter"@nb.
"Polycheter"@nb ;
skos:inScheme <http://data.ub.uio.no/realfagstermer/> ;
skos:narrower <http://data.ub.uio.no/realfagstermer/018529>,
<http://data.ub.uio.no/realfagstermer/024538>,
Expand Down
42 changes: 42 additions & 0 deletions tests/test_rdf.py
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
"""
Basic RDF Lexers Test
~~~~~~~~~~~~~~~~~~~~~
:copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""

import pytest

from pygments.token import Name, Punctuation, Text
from pygments.lexers import TurtleLexer, ShExCLexer


@pytest.fixture(scope='module')
def turtle_lexer():
    """Provide a single TurtleLexer instance shared across this module."""
    # No teardown is needed, so a plain return is equivalent to yielding.
    return TurtleLexer()

@pytest.fixture(scope='module')
def shexc_lexer():
    """Provide a single ShExCLexer instance shared across this module."""
    # No teardown is needed, so a plain return is equivalent to yielding.
    return ShExCLexer()

def test_turtle_prefixed_name_starting_with_number(turtle_lexer):
    """Check Turtle lexing of a prefixed name whose local part starts with a digit.

    The prefix, the colon and the whole local part must each come out as
    exactly one token, followed by the trailing newline.
    """
    source = 'alice:6f6e4241-75a2-4780-9b2a-40da53082e54\n'
    expected = [
        (Name.Namespace, 'alice'),
        (Punctuation, ':'),
        (Name.Tag, '6f6e4241-75a2-4780-9b2a-40da53082e54'),
        (Text, '\n'),
    ]
    actual = list(turtle_lexer.get_tokens(source))
    assert actual == expected

def test_shexc_prefixed_name_starting_with_number(shexc_lexer):
    """Check ShExC lexing of a prefixed name whose local part starts with a digit.

    The prefix, the colon and the whole local part must each come out as
    exactly one token, followed by the trailing newline.
    """
    source = 'alice:6f6e4241-75a2-4780-9b2a-40da53082e54\n'
    expected = [
        (Name.Namespace, 'alice'),
        (Punctuation, ':'),
        (Name.Tag, '6f6e4241-75a2-4780-9b2a-40da53082e54'),
        (Text, '\n'),
    ]
    actual = list(shexc_lexer.get_tokens(source))
    assert actual == expected

0 comments on commit 95935cc

Please sign in to comment.