Bump black from 19.10b0 to 20.8b1 #66

Merged: 5 commits, Mar 17, 2021
14 changes: 7 additions & 7 deletions extra_model/_adjectives.py
@@ -10,9 +10,9 @@

def cluster_adjectives(adjective_counts, vectorizer): # noqa: C901
"""Cluster adjectives based on a constant radius clustering algorithm.

Technical implementation uses a scikit-learn BallTree.

:param adjective_counts: dictionary with adjectives and their counts
:type adjective_counts: [(str,int)]
:param vectorizer: provide embeddings to evaluate adjective similarity
@@ -118,10 +118,10 @@ def cluster_adjectives(adjective_counts, vectorizer):  # noqa: C901

def fill_sentiment_dict(adjective_counts):
"""Given a dictionary with adjectives and their counts, will compute.

The sentiment of each of the adjectives using the VADER sentiment analysis package
and return a dictionary of the adjectives and their sentiments.

:param adjective_counts: dictionary with adjectives and their counts
:type adjective_counts: dict
:return: dictionary, where the keys are the adjectives and the values are tuples of the
@@ -143,7 +143,7 @@ def fill_sentiment_dict(adjective_counts):

def sentiments_from_adjectives(adjective_counts, sentiment_dict):
    """Build the weighted average sentiment score from a list of adjectives and their counts.

:param adjective_counts: list of tuples with adjectives and their counts
:type adjective_counts: [(str,int)]
:param sentiment_dict: dictionary with adjectives and their sentiment, as tuple of compound and binary sentiment
@@ -170,11 +170,11 @@ def sentiments_from_adjectives(adjective_counts, sentiment_dict):

def adjective_info(dataframe_topics, dataframe_aspects, vectorizer):
"""Add adjective related information to the dataframes.

This has two facets:
-> for each topic cluster similar adjectives, to get a more abstract/readable list
-> for each topic, use the adjectives to come up with a sentiment classification

:param dataframe_topics: the dataframe with the topics we want to enrich, needs to have a column `rawterms`
:type dataframe_topics: :class:`pandas.DataFrame`
:param dataframe_aspects: the dataframe with the aspect instances and related adjectives with columns `aspect` and `descriptor`
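The _adjectives.py hunks above appear to be whitespace-only: starting with the 20.8 series, black also reformats docstrings, stripping trailing whitespace from otherwise blank lines and normalizing indentation, which is why the hunks touch only the empty lines between the summary and the :param block. A minimal sketch of the behaviour, using a made-up function purely for illustration:

# Before (black 19.10b0): the blank line inside the docstring could keep trailing spaces.
# After (black 20.8b1): docstrings are reformatted, so blank lines become truly empty.
def example(counts):
    """Summary line ending in a period.

    :param counts: dictionary with items and their counts
    :type counts: dict
    """
    return counts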
17 changes: 9 additions & 8 deletions extra_model/_aspects.py
@@ -15,7 +15,7 @@

def compound_noun_list(token):
"""Find compound nouns.

:param token: token for which to generate potential compound nouns
:type token: :class:`spacy.token`
:return: list of potential compounds
@@ -33,7 +33,7 @@ def compound_noun_list(token):

def acomp_list(tokens):
"""Find descriptions for a given token.

:param tokens: list of tokens that are children of the head of the noun for which descriptions are searched.
:type tokens: [:class:`spacy.token`]
:return: list of adjectives
@@ -55,7 +55,7 @@ def acomp_list(tokens):

def adjective_list(tokens):
"""Find adjectives modifying a given noun.

:param tokens: tokens of potential adjective candidates (children of the noun and children of the head for compounds)
:type tokens: [:class:`spacy.token`]
:return: list of adjectives
@@ -77,7 +77,7 @@ def adjective_list(tokens):

def adjective_negations(token):
"""Find all negated adjectives in a sentence.

:param token: negation token to handle
:type token: :class:`spacy.token`
:return: list of negated adjectives
@@ -108,9 +108,9 @@ def adjective_negations(token):

def parse(dataframe_texts): # noqa: C901
"""Parse the comments and extract a list of potential aspects based on grammatical relations.

(e.g. modified by adjective)

:param dataframe_texts: a dataframe with the raw texts. The column with the texts needs to be called 'Comments'
:type dataframe_texts: :class:`pandas.DataFrame`
:return: a dataframe with the aspect candidates
@@ -128,7 +128,8 @@ def parse(dataframe_texts):  # noqa: C901
# n_threads > 5 can segfault with long (>500 tokens) sentences
# n_threads has been deprecated in spacy 3.x - https://spacy.io/usage/v2-1#incompat
for index, document in zip(
dataframe_texts.index, nlp.pipe(dataframe_texts.Comments, batch_size=500),
dataframe_texts.index,
nlp.pipe(dataframe_texts.Comments, batch_size=500),
): # TODO reduce for production/make configurable
negated_adjectives = []
for token in document:
@@ -171,7 +172,7 @@ def parse(dataframe_texts):  # noqa: C901

def generate_aspects(dataframe_texts):
"""Generate the aspects that will be merged into topics from the raw texts.

:param dataframe_texts: a dataframe with the raw texts in the column 'Comments'
:type dataframe_texts: :class:`pandas.DataFrame`
:return: a dataframe with the aspect candidates, their associated description, index of original text in the
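Besides the docstring whitespace, _aspects.py picks up black 20.x's "magic trailing comma" behaviour: because the zip(...) call in parse already ended with a trailing comma, 20.8b1 explodes it to one argument per line instead of keeping both arguments on a single line. A minimal sketch of the same rule on a made-up call:

# A trailing comma after the last argument makes black 20.8b1 keep the call exploded,
# one argument per line; removing the comma lets it collapse the call onto one line.
pairs = list(
    zip(
        ["a", "b", "c"],
        [1, 2, 3],
    )
)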
8 changes: 4 additions & 4 deletions extra_model/_disambiguate.py
@@ -14,7 +14,7 @@

def vectorize_aspects(aspect_counts, vectorizer):
    """Turn the aspect map into a vector of nouns and their vector representations, which also filters aspects without embedding.

:param aspect_counts: (dict): the dictionary with aspect counts
:param vectorizer: (Vectorizer): the provider of word-embeddings
:return vectors with representable aspects and their vector embeddings
@@ -32,7 +32,7 @@ def vectorize_aspects(aspect_counts, vectorizer):
def best_cluster(aspect_vectors):
"""
Find the optimal cluster size using silhouette scores.

:param aspect_vectors: list of embeddings vectors to be clustered
:type aspect_vectors: [:class:`numpy.array`]
:return: the optimal number of clusters
@@ -73,7 +73,7 @@ def best_cluster(aspect_vectors):

def cluster(aspects, aspect_vectors, vectorizer):
"""Cluster aspects based on the distance of their vector representations.

Once clusters are found, use the other aspects in a given cluster to generate the
context for a specific aspect noun

@@ -115,7 +115,7 @@ def cluster(aspects, aspect_vectors, vectorizer):

def match(aspect_counts, vectorizer):
"""Match a word to a specific wordnet entry, using the vector similarity of the aspects context and the synonym gloss.

:param aspect_counts: Counter object of aspect->number of occurrence
:type aspect_counts: :class:`collections.Counter`
:param vectorizer: the provider of word-embeddings for context generation
2 changes: 1 addition & 1 deletion extra_model/_filter.py
@@ -8,7 +8,7 @@

def filter(dataframe):
"""Filter a dataframe for language and text length.

The following rules apply:
1. Only comments with at least 20 characters retained.
2. Only comments in English are retained.
8 changes: 4 additions & 4 deletions extra_model/_summarize.py
@@ -6,7 +6,7 @@

def qa(dataframe_texts, dataframe_aspects, dataframe_topics):
"""Print summary information.

:param dataframe_texts: dataframe with the raw texts (for example output)
:type dataframe_texts: :class:`pandas.DataFrame`
:param dataframe_aspects: dataframe with the aspects
@@ -75,7 +75,7 @@ def qa(dataframe_texts, dataframe_aspects, dataframe_topics):

def set_aspect(topic, dataframe_aspects):
"""For a given topic, set topic and adjective cluster fields in the aspect_dataframe.

:param topic: the topic and its associated information that we need to copy to the relevant entries in the aspect frame
:type topic: :class:`pandas.DataFrame.Row`
:param dataframe_aspects: the dataframe to be enriched with topic information
@@ -99,7 +99,7 @@ def set_aspect(topic, dataframe_aspects):

def link_aspects_to_topics(dataframe_aspects, dataframe_topics):
"""Fill topic and adjective cluster information into the aspect dataframe.

:param dataframe_aspects: the dataframe to be enriched
:type dataframe_aspects: :class:`pandas.DataFrame`
:param dataframe_topics: the dataframe that has the topic and adjective cluster information
@@ -122,7 +122,7 @@ def link_aspects_to_topics(dataframe_aspects, dataframe_topics):

def link_aspects_to_texts(dataframe_aspects, dataframe_texts):
"""Transfer the original text identifier from the original text data table into the final aspect table.

:param dataframe_aspects: table to be enriched
:type dataframe_aspects: :class:`pandas.DataFrame`
:param dataframe_texts: original table from which this information is extracted
18 changes: 9 additions & 9 deletions extra_model/_topics.py
@@ -17,7 +17,7 @@

def path_to_graph(hypernym_list, initialnoun):
"""Make a hypernym chain into a graph.

:param hypernym_list: list of hypernyms for a word as obtained from wordnet
:type hypernym_list: [str]
:param initialnoun: the initial noun (we need this to mark it as leaf in the tree)
@@ -39,7 +39,7 @@ def path_to_graph(hypernym_list, initialnoun):

def get_nodevec(node, vectors):
    """Get the vector representation of the gloss of a wordnet node.

    Used to evaluate similarity between rungs in the hypernym chain.
    :param node: the wordnet node for which to compute the embedding
:type node: str
@@ -58,7 +58,7 @@ def get_nodevec(node, vectors):

def iterate(transition_matrix, importance, original, alpha):
"""Find the stable importance vector by iterated multiplication with the distance matrix.

This function does a simple iteration. The "jump-back" probability from the paper
is implemented as a linear superposition of the new and original importance numbers.
:param transition_matrix: The connectedness matrix of the graph, including similarity weights.
@@ -83,7 +83,7 @@ def iterate(transition_matrix, importance, original, alpha):

def aggregate(aspects, aspect_counts, synsets_match, vectors): # noqa: C901
"""Aggregate the aspects by building a tree from the hypernym chains.

Using a page-rank type algorithm to assign importance to the nodes in the graph
we only consider wordnet entries for this, not the actual aspects extracted from the texts.
:param aspects: List of aspects to analyze
@@ -210,7 +210,7 @@ def traverse_tree(  # noqa: C901
node_list, associated_aspects, aspect_counts, full_tree, weighted, direction
):
"""Find all hypernyms/hyponyms in the tree to a given node.

Aggregate the number of associated mentions in the original texts, optionally
weighted by term-similarity.
:param nodelist: List of nodes from which to gather the subsidiary terms and their initial mentions
@@ -267,7 +267,7 @@ def traverse_tree(  # noqa: C901

def collect_topic_info(filtered_topics, removed_topics, aspect_counts, full_tree):
"""Gather various bits of information into a single DataFrame.

For each topic we store the importance, the list of associated raw text terms and their numbers.
:param filtered_topics: List of topics remaining after filtering out low-importance subsidiary topics
:type filtered_topics: [str]
@@ -350,7 +350,7 @@ def collect_topic_info(filtered_topics, removed_topics, aspect_counts, full_tree

def has_connection(term, prior, full_tree):
    """Check if two terms are connected within the directed hypernym graph.

:param term: first node to test
:type term: str
:param prior: second node to test
@@ -369,7 +369,7 @@ def filter_aggregates(topics, tree):

def filter_aggregates(topics, tree):
"""Filter the importance-sorted list, so that each remaining topic is the sole member of its hypernym chain.

:param topics: List of all topics in the graph
:type topics: [str]
:param tree: the graph which is being traversed
@@ -398,7 +398,7 @@ def filter_aggregates(topics, tree):

def get_topics(dataframe_aspects, vectors):
"""Generate the semantically clustered topics from the raw aspects.

:param dataframe_aspects: the collection of nouns to be aggregated into topics
:type dataframe_aspects: :class:`pandas.DataFrame`
:param vectors: provides embeddings for context clustering and word-sense disambiguation
4 changes: 2 additions & 2 deletions extra_model/_vectorizer.py
@@ -12,7 +12,7 @@ class Vectorizer:
def __init__(self, embedding_file):
"""
Use the generic gensim vector embedding lookup.

Currently using pretrained glove embeddings, but anything goes.
:param embedding_file: pathname for the file that stores the word-embeddings in gensim keyed-vectors format
:type str
@@ -25,7 +25,7 @@ def __init__(self, embedding_file):
def get_vector(self, key):
"""
Return the vector embedding for a given word.

According to the following logic:
- if no embedding is found for this word, check if it's a compound
- if it's a compound try to take the average embedding of the constituent words
2 changes: 1 addition & 1 deletion requirements-test.txt
@@ -1,5 +1,5 @@
bandit==1.7.0
black==19.10b0
black==20.8b1
flake8==3.9.0
isort==5.7.0
mypy==0.812
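Bumping the pin in requirements-test.txt only changes which black the test environment installs; the rest of the diff is the result of re-running the new release over the code base. A minimal sketch of a local check that the tree matches the pinned formatter, assuming it is run from the repository root (the project's actual CI command is not shown in this PR):

# Hypothetical helper mirroring a CI format check; --check exits non-zero when
# files would be reformatted, and --diff would print the proposed changes instead.
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "-m", "black", "--check", "extra_model", "tests"]
)
sys.exit(result.returncode)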
57 changes: 33 additions & 24 deletions tests/test_topics.py
@@ -138,25 +138,31 @@ def test__aggregate(vec):


def test__traverse_tree__down_weighted(simple_graph):
assert traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=True,
direction="down",
) == {"L1": 2, "L2": 0.5}
assert (
traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=True,
direction="down",
)
== {"L1": 2, "L2": 0.5}
)


def test__traverse_tree__down_unweighted(simple_graph):
assert traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="down",
) == {"L1": 4, "L2": 1}
assert (
traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="down",
)
== {"L1": 4, "L2": 1}
)


def test__traverse_tree__up_weighted(simple_graph):
@@ -166,14 +172,17 @@ def test__traverse_tree__up_weighted(simple_graph):


def test__traverse_tree__up_unweighted(simple_graph):
assert traverse_tree(
[("I1", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="up",
) == {"L1": 4}
assert (
traverse_tree(
[("I1", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="up",
)
== {"L1": 4}
)


@pytest.mark.skip(
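The test changes are equally mechanical: where 19.10b0 split the call arguments and left the == {...} comparison hanging after the closing parenthesis, 20.8b1 wraps the whole comparison in parentheses so both operands stay aligned, as the hunks above show. A minimal sketch with a made-up helper:

# Equality asserts that exceed the line length are parenthesized as a whole,
# with the call keeping one argument per line and the expected value on its own line.
def lookup(keys, default, *, weighted):
    return {key: default for key in keys}


assert (
    lookup(
        ["L1", "L2"],
        0.5,
        weighted=True,
    )
    == {"L1": 0.5, "L2": 0.5}
)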