Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use max(uint64) for OOV lexeme rank #5303

Merged
6 commits merged on Apr 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion spacy/_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def link_vectors_to_models(vocab):
if word.orth in vectors.key2row:
word.rank = vectors.key2row[word.orth]
else:
word.rank = 0
word.rank = util.OOV_RANK
data = ops.asarray(vectors.data)
# Set an entry here, so that vectors are accessed by StaticVectors
# (unideal, I know)
Expand Down
4 changes: 2 additions & 2 deletions spacy/cli/init_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from ..vectors import Vectors
from ..errors import Errors, Warnings, user_warning
from ..util import ensure_path, get_lang_class
from ..util import ensure_path, get_lang_class, OOV_RANK

try:
import ftfy
Expand Down Expand Up @@ -148,7 +148,7 @@ def create_model(lang, lex_attrs, name=None):
lang_class = get_lang_class(lang)
nlp = lang_class()
for lexeme in nlp.vocab:
lexeme.rank = 0
lexeme.rank = OOV_RANK
lex_added = 0
for attrs in lex_attrs:
if "settings" in attrs:
Expand Down
1 change: 1 addition & 0 deletions spacy/lexeme.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ from numpy cimport ndarray


cdef LexemeC EMPTY_LEXEME
cdef attr_t OOV_RANK

cdef class Lexeme:
cdef LexemeC* c
Expand Down
3 changes: 3 additions & 0 deletions spacy/lexeme.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ np.import_array()
import numpy
from thinc.neural.util import get_array_module

from libc.stdint cimport UINT64_MAX
from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
Expand All @@ -21,7 +22,9 @@ from .attrs import intify_attrs
from .errors import Errors, Warnings, user_warning


OOV_RANK = UINT64_MAX
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.id = OOV_RANK


cdef class Lexeme:
Expand Down
9 changes: 9 additions & 0 deletions spacy/tests/vocab_vectors/test_lexeme.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from __future__ import unicode_literals

import pytest
import numpy
from spacy.attrs import IS_ALPHA, IS_DIGIT
from spacy.util import OOV_RANK


@pytest.mark.parametrize("text1,prob1,text2,prob2", [("NOUN", -1, "opera", -2)])
Expand Down Expand Up @@ -69,3 +71,10 @@ def test_lexeme_bytes_roundtrip(en_vocab):
assert one.orth == alpha.orth
assert one.lower == alpha.lower
assert one.lower_ == alpha.lower_


def test_vocab_lexeme_oov_rank(en_vocab):
    """Check that a lexeme fetched from the vocab has rank OOV_RANK.

    Also pins OOV_RANK itself to the largest value representable as an
    unsigned 64-bit integer.
    """
    lexeme = en_vocab["word"]
    uint64_max = numpy.iinfo(numpy.uint64).max
    assert OOV_RANK == uint64_max
    assert lexeme.rank == OOV_RANK
2 changes: 2 additions & 0 deletions spacy/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import functools
import itertools
import numpy.random
import numpy
import srsly
import catalogue
import sys
Expand All @@ -34,6 +35,7 @@

_data_path = Path(__file__).parent / "data"
_PRINT_ENV = False
OOV_RANK = numpy.iinfo(numpy.uint64).max


class registry(object):
Expand Down
6 changes: 3 additions & 3 deletions spacy/vocab.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import srsly
from collections import OrderedDict
from thinc.neural.util import get_array_module

from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport EMPTY_LEXEME, OOV_RANK
from .lexeme cimport Lexeme
from .typedefs cimport attr_t
from .tokens.token cimport Token
Expand Down Expand Up @@ -165,9 +165,9 @@ cdef class Vocab:
lex.orth = self.strings.add(string)
lex.length = len(string)
if self.vectors is not None:
lex.id = self.vectors.key2row.get(lex.orth, 0)
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
else:
lex.id = 0
lex.id = OOV_RANK
if self.lex_attr_getters is not None:
for attr, func in self.lex_attr_getters.items():
value = func(string)
Expand Down