issues MSX-29 and MSX-30

d74e6bc2 · Jim Hoekstra · beeb826a · beeb826a · d74e6bc2 · d74e6bc2
Commit d74e6bc2 authored 4 years ago by Jim Hoekstra
--- a/Julian_BM/w2v_stringdist_cyto.py
+++ b/Julian_BM/w2v_stringdist_cyto.py
-# -*- coding: utf-8 -*-
-"""
-Starting date: 09-02-21
-
-Code for MSX: Extension of candidate words. 
-Functions: 
-    words_to_sets: Query an existing pretrained word-embedding model for similar
-        words, then prune the similar words by string distance.
-            Input: set of words
-            Output: dictionary with original + similar words + similarity scores, plus a list of words not found in the model vocabulary
-    sets_to_network: Converts part of the output of words_to_sets function into 2 objects
-        to be loaded in dash-cytoscape.
-            Input: dictionary with original + similar words
-            Output: two lists (nodes and edges) to be used in dash-cytoscape
-
-The code uses an existing
-pretrained word2vec model and string/substring distance.
-    
-Author: Julian Bianco Martinez
-"""
-### Loading model
-import gensim.downloader as api
-
-# Different Pre-trained Embeddings
-#model = api.load('word2vec-google-news-300')
-model = api.load('glove-twitter-200')
-model.most_similar('people', topn=100)
-
-terms = """Jargon
-Domain
-Standardisation
-Interactive tool
-word2vec
-NLP
-Collaboration
-Vocabulary
-Graph
-Meaning
-Tool
-Communication
-People
-Terms
-Associations
-Efficient"""
-words_set = terms
-
-#%% Functions
-def words_to_sets(words_set):
- # Similar words
- from tqdm import tqdm
- import stringdist as sdi
- terms = words_set.lower().split("\n")
- terms = [word.replace(' ','_') for word in terms]
- 
- similar_terms = dict()
- words_not_vocabulary = []
- for term in tqdm(terms):
-     try:
-         temp_tuple =  model.most_similar(term, topn=20)
-         r=-1
-         for i,j in temp_tuple:
-             r += 1 
-             temp_tuple[r] = temp_tuple[r] + (sdi.rdlevenshtein_norm(term,i),) + (i.find(term),) 
-         similar_terms[term] = temp_tuple
-         similar_terms[term] = [(term, score, string_dist, substring) for term, score, string_dist, substring in similar_terms[term] if term == term.lower() 
-         and "www" not in term 
-         and string_dist > 0.5 
-         and substring == -1][:10]
-     except (KeyError) as e:
-         words_not_vocabulary.append(term)         
- return similar_terms, words_not_vocabulary
-
-
-def sets_to_network(similar_terms):
-    ### CytoScape structure Creation
- edges = []
- nodes = []
- for item in similar_terms:
-     if item not in nodes:
-         nodes.append({"data": {"id": item, "label": item},"classes" : 'followerNode'})
-     for i in similar_terms[item]:
-         if i not in nodes:
-             nodes.append({"data": {"id": i[0], "label": i[0]},"classes" : 'followingNode'})
-         temp = {'data':{
-                  'source': item,
-                  'target': i[0]
-                  }
-         }
-         edges.append(temp)
-         
- return nodes, edges
- 
-
-similar_terms, words_not_vocabulary = words_to_sets(terms)
-nodes, edges = sets_to_network(similar_terms)       
-
-### END Functions
-
-#%% Dash
-import dash
-import dash_cytoscape as cyto
-import dash_html_components as html
-
-#####Cyto Code
-        
-app = dash.Dash(__name__)
-
-# define layout
-default_stylesheet = [
-    {
-        "selector": 'node',
-        "style":{
-                "label": "data(label)"
-        }
-    },
-    {
-        'selector': '.followerNode',
-        'style': {
-            'background-color': '#00AAD9'
-        }
-    }
-]
-
-app.layout = html.Div([
-    cyto.Cytoscape(
-        id='test',
-        #style={'width': '100%', 'height': '400px'},
-        layout = {'name': 'cose'},
-        elements = edges + nodes,
-        stylesheet=default_stylesheet
-    )
-])
-
-if __name__ == '__main__':
-    app.run_server(debug=False)
-    
- 
\ No newline at end of file
--- a/dash_app/callbacks.py
+++ b/dash_app/callbacks.py
@@ -34,6 +34,7 @@ def update_base_word(submit_word_button, base_word_input):
    Output(component_id='graph-elements-div', component_property='children'),
    Input(component_id='submit-word-button', component_property='n_clicks'),
    Input(component_id='add-word-button', component_property='n_clicks'),
+    Input(component_id='extend-graph-button', component_property='n_clicks'),
    Input(component_id='remove-word-button', component_property='n_clicks'),
    State(component_id='base-word-input', component_property='value'),
    State(component_id='add-word-input', component_property='value'),
@@ -42,8 +43,8 @@ def update_base_word(submit_word_button, base_word_input):
    State(component_id='base-word-div', component_property='children'),
    prevent_initial_call=True
 )
-def update_graph_elements(submit_word_button, add_word_button, remove_word_button, base_word_input, add_word_input,
-                          nodes_and_edges, selected_nodes, base_word_state):
+def update_graph_elements(submit_word_button, add_word_button, extend_graph_button, remove_word_button, base_word_input,
+                          add_word_input, nodes_and_edges, selected_nodes, base_word_state):

    callback_context = dash.callback_context
    button_id = callback_context.triggered[0]['prop_id'].split('.')[0]
@@ -65,6 +66,12 @@ def update_graph_elements(submit_word_button, add_word_button, remove_word_butto
        else:
            raise PreventUpdate

+    if button_id == 'extend-graph-button':
+        graph.set_nodes_and_edges(json.loads(nodes_and_edges))
+        graph.extend_graph(word2vec_model, base_word_state)
+        new_nodes_and_edges = graph.get_nodes_and_edges()
+        return json.dumps(new_nodes_and_edges)
+
    if button_id == 'remove-word-button':
        graph.set_nodes_and_edges(json.loads(nodes_and_edges))
        if selected_nodes is not None:
@@ -93,8 +100,14 @@ def update_graph(nodes_and_edges):
 @app.callback(
    Output(component_id='msx-graph', component_property='autoRefreshLayout'),
    Input(component_id='add-word-button', component_property='n_clicks'),
+    Input(component_id='extend-graph-button', component_property='n_clicks'),
    Input(component_id='remove-word-button', component_property='n_clicks'),
    prevent_initial_call=True
 )
-def set_auto_refresh_layout(add_word_button, remove_word_button):
-    return False
+def set_auto_refresh_layout(add_word_button, extend_graph_button, remove_word_button):
+    callback_context = dash.callback_context
+    button_id = callback_context.triggered[0]['prop_id'].split('.')[0]
+    if button_id == 'add-word-button' or button_id == 'remove-word-button':
+        return False
+    if button_id == 'extend-graph-button':
+        return True
--- a/dash_app/graph.py
+++ b/dash_app/graph.py
@@ -6,6 +6,8 @@ class Graph:
    def __init__(self):
        self.nodes = []
        self.edges = []
+        self.COUNT_THRESHOLD = 2
+        self.MAX_NUM_WORDS = 10

    def get_all_words(self):
        all_words = [node_dict['data']['label'] for node_dict in self.nodes]
@@ -99,3 +101,21 @@ class Graph:
                                  }
                              ]
                              )
+
+    def extend_graph(self, word2vec_model, base_node):
+        current_words = [node['data']['id'] for node in self.nodes]
+        all_associated_words = []
+        for current_word in current_words:
+            associated_words = word2vec_model.get_associated_words(current_word, top_n=100)
+            all_associated_words.extend(associated_words)
+
+        associated_words_filtered = [word for word in all_associated_words if word not in current_words]
+        associated_words_count = {word: associated_words_filtered.count(word) for word in list(set(associated_words_filtered))}
+
+        common_associated_words = [word for word, count in associated_words_count.items() if count >= self.COUNT_THRESHOLD]
+        while len(common_associated_words) > self.MAX_NUM_WORDS:
+            self.COUNT_THRESHOLD += 1
+            common_associated_words = [word for word, count in associated_words_count.items() if count >= self.COUNT_THRESHOLD]
+
+        self.add_nodes(common_associated_words)
+        self.add_edges(base_node, common_associated_words)
--- a/dash_app/layout.py
+++ b/dash_app/layout.py
@@ -44,7 +44,10 @@ layout = html.Div(children=[
                html.Div(className='col-2', children=[
                    html.Button(id='add-word-button', n_clicks_timestamp=0, children='Add Association', className='btn btn-success btn-lg'),
                ]),
-                html.Div(className='col-4', children=[
+                html.Div(className='col-2', children=[
+                    html.Button(id='extend-graph-button', n_clicks_timestamp=0, children='Extend Graph', className='btn btn-success btn-lg'),
+                ]),
+                html.Div(className='col-2', children=[
                ]),
                html.Div(className='col-3', children=[
                    html.Button(id='remove-word-button', n_clicks_timestamp=0, children='Remove Selected Association', className='btn btn-danger btn-lg')]),

--- a/dash_app/words.py
+++ b/dash_app/words.py
@@ -5,13 +5,12 @@ import stringdist as sdi
 class AssociatedWords:

    def __init__(self):
-        self.N_RESULTS = 10
        print("\n Word2Vec model is loading.This can take a couple of minutes.")
        self.model = api.load('glove-twitter-200')
        print(" Word2Vec model is ready. Enjoy!!!\n")

-    def get_associated_words(self, word):
-        gensim_result = self.model.most_similar(word, topn=self.N_RESULTS)
+    def get_associated_words(self, word, top_n=10):
+        gensim_result = self.model.most_similar(word, topn=top_n)
        # gensim_result = [('apple', 1.0), ('banana', 1.0), ('strawberry', 1.0)]
        words = self.filter_results(gensim_result, word)
        return words