Bump black from 19.10b0 to 20.8b1 #66

Merged: 5 commits, Mar 17, 2021
14 changes: 7 additions & 7 deletions extra_model/_adjectives.py
@@ -10,9 +10,9 @@

def cluster_adjectives(adjective_counts, vectorizer): # noqa: C901
"""Cluster adjectives based on a constant radius clustering algorithm.

Technical implementation uses a scikit-learn BallTree.

:param adjective_counts: dictionary with adjectives and their counts
:type adjective_counts: [(str,int)]
:param vectorizer: provide embeddings to evaluate adjective similarity
@@ -118,10 +118,10 @@ def cluster_adjectives(adjective_counts, vectorizer):  # noqa: C901

def fill_sentiment_dict(adjective_counts):
"""Given a dictionary with adjectives and their counts, will compute.

The sentiment of each of the adjectives using the VADER sentiment analysis package
and return a dictionary of the adjectives and their sentiments.

:param adjective_counts: dictionary with adjectives and their counts
:type adjective_counts: dict
:return: dictionary, where the keys are the adjectives and the values are tuples of the
@@ -143,7 +143,7 @@ def fill_sentiment_dict(adjective_counts):

def sentiments_from_adjectives(adjective_counts, sentiment_dict):
    """Build the weighted average sentiment score from a list of adjectives and their counts.

:param adjective_counts: list of tuples with adjectives and their counts
:type adjective_counts: [(str,int)]
:param sentiment_dict: dictionary with adjectives and their sentiment, as tuple of compound and binary sentiment
@@ -170,11 +170,11 @@ def sentiments_from_adjectives(adjective_counts, sentiment_dict):

def adjective_info(dataframe_topics, dataframe_aspects, vectorizer):
"""Add adjective related information to the dataframes.

This has two facets:
-> for each topic cluster similar adjectives, to get a more abstract/readable list
-> for each topic, use the adjectives to come up with a sentiment classification

:param dataframe_topics: the dataframe with the topics we want to enrich, needs to have a column `rawterms`
:type dataframe_topics: :class:`pandas.DataFrame`
:param dataframe_aspects: the dataframe with the aspect instances and related adjectives with columns `aspect` and `descriptor`
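The _adjectives.py hunks above appear to be whitespace-only: starting with the 20.8 series, black also reformats docstrings, stripping trailing whitespace from otherwise blank lines and normalizing indentation, which is why the hunks touch only the empty lines between the summary and the :param block. A minimal sketch of the behaviour, using a made-up function purely for illustration:

# Before (black 19.10b0): the blank line inside the docstring could keep trailing spaces.
# After (black 20.8b1): docstrings are reformatted, so blank lines become truly empty.
def example(counts):
    """Summary line ending in a period.

    :param counts: dictionary with items and their counts
    :type counts: dict
    """
    return counts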
17 changes: 9 additions & 8 deletions extra_model/_aspects.py
@@ -15,7 +15,7 @@

def compound_noun_list(token):
"""Find compound nouns.

:param token: token for which to generate potential compound nouns
:type token: :class:`spacy.token`
:return: list of potential compounds
@@ -33,7 +33,7 @@ def compound_noun_list(token):

def acomp_list(tokens):
"""Find descriptions for a given token.

:param tokens: list of tokens that are children of the head of the noun for which descriptions are searched.
:type tokens: [:class:`spacy.token`]
:return: list of adjectives
@@ -55,7 +55,7 @@ def acomp_list(tokens):

def adjective_list(tokens):
"""Find adjectives modifying a given noun.

:param tokens: tokens of potential adjective candidates (children of the noun and children of the head for compounds)
:type tokens: [:class:`spacy.token`]
:return: list of adjectives
@@ -77,7 +77,7 @@ def adjective_list(tokens):

def adjective_negations(token):
"""Find all negated adjectives in a sentence.

:param token: negation token to handle
:type token: :class:`spacy.token`
:return: list of negated adjectives
@@ -108,9 +108,9 @@ def adjective_negations(token):

def parse(dataframe_texts): # noqa: C901
"""Parse the comments and extract a list of potential aspects based on grammatical relations.

(e.g. modified by adjective)

:param dataframe_texts: a dataframe with the raw texts. The column with the texts needs to be called 'Comments'
:type dataframe_texts: :class:`pandas.DataFrame`
:return: a dataframe with the aspect candidates
@@ -128,7 +128,8 @@ def parse(dataframe_texts):  # noqa: C901
# n_threads > 5 can segfault with long (>500 tokens) sentences
# n_threads has been deprecated in spacy 3.x - https://spacy.io/usage/v2-1#incompat
for index, document in zip(
dataframe_texts.index, nlp.pipe(dataframe_texts.Comments, batch_size=500),
dataframe_texts.index,
nlp.pipe(dataframe_texts.Comments, batch_size=500),
): # TODO reduce for production/make configurable
negated_adjectives = []
for token in document:
@@ -171,7 +172,7 @@ def parse(dataframe_texts):  # noqa: C901

def generate_aspects(dataframe_texts):
"""Generate the aspects that will be merged into topics from the raw texts.

:param dataframe_texts: a dataframe with the raw texts in the column 'Comments'
:type dataframe_texts: :class:`pandas.DataFrame`
:return: a dataframe with the aspect candidates, their associated description, index of original text in the
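Besides the docstring whitespace, _aspects.py picks up black 20.x's "magic trailing comma" behaviour: because the zip(...) call in parse already ended with a trailing comma, 20.8b1 explodes it to one argument per line instead of keeping both arguments on a single line. A minimal sketch of the same rule on a made-up call:

# A trailing comma after the last argument makes black 20.8b1 keep the call exploded,
# one argument per line; removing the comma lets it collapse the call onto one line.
pairs = list(
    zip(
        ["a", "b", "c"],
        [1, 2, 3],
    )
)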
8 changes: 4 additions & 4 deletions extra_model/_disambiguate.py
@@ -14,7 +14,7 @@

def vectorize_aspects(aspect_counts, vectorizer):
    """Turn the aspect map into a vector of nouns and their vector representations, which also filters aspects without embedding.

:param aspect_counts: (dict): the dictionary with aspect counts
:param vectorizer: (Vectorizer): the provider of word-embeddings
:return vectors with representable aspects and their vector embeddings
@@ -32,7 +32,7 @@ def vectorize_aspects(aspect_counts, vectorizer):
def best_cluster(aspect_vectors):
"""
Find the optimal cluster size using silhouette scores.

:param aspect_vectors: list of embeddings vectors to be clustered
:type aspect_vectors: [:class:`numpy.array`]
:return: the optimal number of clusters
@@ -73,7 +73,7 @@ def best_cluster(aspect_vectors):

def cluster(aspects, aspect_vectors, vectorizer):
"""Cluster aspects based on the distance of their vector representations.

Once clusters are found, use the other aspects in a given cluster to generate the
context for a specific aspect noun

@@ -115,7 +115,7 @@ def cluster(aspects, aspect_vectors, vectorizer):

def match(aspect_counts, vectorizer):
"""Match a word to a specific wordnet entry, using the vector similarity of the aspects context and the synonym gloss.

:param aspect_counts: Counter object of aspect->number of occurrence
:type aspect_counts: :class:`collections.Counter`
:param vectorizer: the provider of word-embeddings for context generation
2 changes: 1 addition & 1 deletion extra_model/_filter.py
@@ -8,7 +8,7 @@

def filter(dataframe):
"""Filter a dataframe for language and text length.

The following rules apply:
1. Only comments with at least 20 characters retained.
2. Only comments in English are retained.
8 changes: 4 additions & 4 deletions extra_model/_summarize.py
@@ -6,7 +6,7 @@

def qa(dataframe_texts, dataframe_aspects, dataframe_topics):
"""Print summary information.

:param dataframe_texts: dataframe with the raw texts (for example output)
:type dataframe_texts: :class:`pandas.DataFrame`
:param dataframe_aspects: dataframe with the aspects
@@ -75,7 +75,7 @@ def qa(dataframe_texts, dataframe_aspects, dataframe_topics):

def set_aspect(topic, dataframe_aspects):
"""For a given topic, set topic and adjective cluster fields in the aspect_dataframe.

:param topic: the topic and its associated information that we need to copy to the relevant entries in the aspect frame
:type topic: :class:`pandas.DataFrame.Row`
:param dataframe_aspects: the dataframe to be enriched with topic information
@@ -99,7 +99,7 @@ def set_aspect(topic, dataframe_aspects):

def link_aspects_to_topics(dataframe_aspects, dataframe_topics):
"""Fill topic and adjective cluster information into the aspect dataframe.

:param dataframe_aspects: the dataframe to be enriched
:type dataframe_aspects: :class:`pandas.DataFrame`
:param dataframe_topics: the dataframe that has the topic and adjective cluster information
@@ -122,7 +122,7 @@ def link_aspects_to_topics(dataframe_aspects, dataframe_topics):

def link_aspects_to_texts(dataframe_aspects, dataframe_texts):
"""Transfer the original text identifier from the original text data table into the final aspect table.

:param dataframe_aspects: table to be enriched
:type dataframe_aspects: :class:`pandas.DataFrame`
:param dataframe_texts: original table from which this information is extracted
18 changes: 9 additions & 9 deletions extra_model/_topics.py
@@ -17,7 +17,7 @@

def path_to_graph(hypernym_list, initialnoun):
"""Make a hypernym chain into a graph.

:param hypernym_list: list of hypernyms for a word as obtained from wordnet
:type hypernym_list: [str]
:param initialnoun: the initial noun (we need this to mark it as leaf in the tree)
@@ -39,7 +39,7 @@ def path_to_graph(hypernym_list, initialnoun):

def get_nodevec(node, vectors):
    """Get the vector representation of the gloss of a wordnet node.

    Used to evaluate similarity between rungs in the hypernym chain.
    :param node: the wordnet node for which to compute the embedding
:type node: str
@@ -58,7 +58,7 @@ def get_nodevec(node, vectors):

def iterate(transition_matrix, importance, original, alpha):
"""Find the stable importance vector by iterated multiplication with the distance matrix.

This function does a simple iteration. The "jump-back" probability from the paper
is implemented as a linear superposition of the new and original importance numbers.
:param transition_matrix: The connectedness matrix of the graph, including similarity weights.
@@ -83,7 +83,7 @@ def iterate(transition_matrix, importance, original, alpha):

def aggregate(aspects, aspect_counts, synsets_match, vectors): # noqa: C901
"""Aggregate the aspects by building a tree from the hypernym chains.

Using a page-rank type algorithm to assign importance to the nodes in the graph
we only consider wordnet entries for this, not the actual aspects extracted from the texts.
:param aspects: List of aspects to analyze
@@ -210,7 +210,7 @@ def traverse_tree(  # noqa: C901
node_list, associated_aspects, aspect_counts, full_tree, weighted, direction
):
"""Find all hypernyms/hyponyms in the tree to a given node.

Aggregate the number of associated mentions in the original texts, optionally
weighted by term-similarity.
:param nodelist: List of nodes from which to gather the subsidiary terms and their initial mentions
@@ -267,7 +267,7 @@ def traverse_tree(  # noqa: C901

def collect_topic_info(filtered_topics, removed_topics, aspect_counts, full_tree):
"""Gather various bits of information into a single DataFrame.

For each topic we store the importance, the list of associated raw text terms and their numbers.
:param filtered_topics: List of topics remaining after filtering out low-importance subsidiary topics
:type filtered_topics: [str]
@@ -350,7 +350,7 @@ def collect_topic_info(filtered_topics, removed_topics, aspect_counts, full_tree

def has_connection(term, prior, full_tree):
    """Check if two terms are connected within the directed hypernym graph.

:param term: first node to test
:type term: str
:param prior: second node to test
@@ -369,7 +369,7 @@ def filter_aggregates(topics, tree):

def filter_aggregates(topics, tree):
"""Filter the importance-sorted list, so that each remaining topic is the sole member of its hypernym chain.

:param topics: List of all topics in the graph
:type topics: [str]
:param tree: the graph which is being traversed
@@ -398,7 +398,7 @@ def filter_aggregates(topics, tree):

def get_topics(dataframe_aspects, vectors):
"""Generate the semantically clustered topics from the raw aspects.

:param dataframe_aspects: the collection of nouns to be aggregated into topics
:type dataframe_aspects: :class:`pandas.DataFrame`
:param vectors: provides embeddings for context clustering and word-sense disambiguation
4 changes: 2 additions & 2 deletions extra_model/_vectorizer.py
@@ -12,7 +12,7 @@ class Vectorizer:
def __init__(self, embedding_file):
"""
Use the generic gensim vector embedding lookup.

Currently using pretrained glove embeddings, but anything goes.
:param embedding_file: pathname for the file that stores the word-embeddings in gensim keyed-vectors format
:type str
@@ -25,7 +25,7 @@ def __init__(self, embedding_file):
def get_vector(self, key):
"""
Return the vector embedding for a given word.

According to the following logic:
- if no embedding is found for this word, check if it's a compound
- if it's a compound try to take the average embedding of the constituent words
2 changes: 1 addition & 1 deletion requirements-test.txt
@@ -1,5 +1,5 @@
bandit==1.7.0
black==19.10b0
black==20.8b1
flake8==3.9.0
isort==5.7.0
mypy==0.812
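Bumping the pin in requirements-test.txt only changes which black the test environment installs; the rest of the diff is the result of re-running the new release over the code base. A minimal sketch of a local check that the tree matches the pinned formatter, assuming it is run from the repository root (the project's actual CI command is not shown in this PR):

# Hypothetical helper mirroring a CI format check; --check exits non-zero when
# files would be reformatted, and --diff would print the proposed changes instead.
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "-m", "black", "--check", "extra_model", "tests"]
)
sys.exit(result.returncode)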
57 changes: 33 additions & 24 deletions tests/test_topics.py
@@ -138,25 +138,31 @@ def test__aggregate(vec):


def test__traverse_tree__down_weighted(simple_graph):
assert traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=True,
direction="down",
) == {"L1": 2, "L2": 0.5}
assert (
traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=True,
direction="down",
)
== {"L1": 2, "L2": 0.5}
)


def test__traverse_tree__down_unweighted(simple_graph):
assert traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="down",
) == {"L1": 4, "L2": 1}
assert (
traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="down",
)
== {"L1": 4, "L2": 1}
)


def test__traverse_tree__up_weighted(simple_graph):
@@ -166,14 +172,17 @@ def test__traverse_tree__up_weighted(simple_graph):


def test__traverse_tree__up_unweighted(simple_graph):
assert traverse_tree(
[("I1", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="up",
) == {"L1": 4}
assert (
traverse_tree(
[("I1", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="up",
)
== {"L1": 4}
)


@pytest.mark.skip(
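The test changes are equally mechanical: where 19.10b0 split the call arguments and left the == {...} comparison hanging after the closing parenthesis, 20.8b1 wraps the whole comparison in parentheses so both operands stay aligned, as the hunks above show. A minimal sketch with a made-up helper:

# Equality asserts that exceed the line length are parenthesized as a whole,
# with the call keeping one argument per line and the expected value on its own line.
def lookup(keys, default, *, weighted):
    return {key: default for key in keys}


assert (
    lookup(
        ["L1", "L2"],
        0.5,
        weighted=True,
    )
    == {"L1": 0.5, "L2": 0.5}
)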