Commit c921e7ad authored by Jim Hoekstra's avatar Jim Hoekstra 👋🏻
Browse files

Merge branch 'issue/MSX-34' into 'develop'

Issue MSX-34

See merge request !12
parents 82674525 39a7510d
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 31 12:09:38 2021

@author: Julian Bianco-Martinez

Graph Expansion by Intersection of Concepts (GEIC)
"""
import pandas as pd
import numpy as np
def concepts_extraction_list(lst1):
    """Extract concepts from word2vec output.

    Parameters
    ----------
    lst1 : iterable of indexable items
        Each item's first element is taken as the concept. In this file the
        input is the result of ``model.most_similar(...)`` (presumably
        ``(word, similarity)`` pairs -- confirm against the model in use).

    Returns
    -------
    list
        The first element of every item, in the original order.
    """
    return [entry[0] for entry in lst1]
def GEIC(concepts, words_to_exclude=None, topX_intersect_concepts=5, topXsimilarConcepts=100, threshold=0.5):
    """Graph Expansion by Intersection of Concepts (GEIC).

    From a set of original concepts C0, collect and weight the similar
    concepts that are shared between pairs of concepts in C0.

    NOTE: relies on a module-level ``model`` object exposing
    ``most_similar(word, topn=...)``; it is not defined in this block.

    Parameters
    ----------
    concepts : sequence of str
        Original concepts (C0).
    words_to_exclude : sequence of str, optional
        New concepts found in this collection are removed from the result.
        When omitted or empty, ``concepts`` itself is used.
    topX_intersect_concepts : int
        Keep only the first X shared concepts for each pair of original
        concepts, i.e. ``IC(i, j)[:topX_intersect_concepts]``.
    topXsimilarConcepts : int
        Number of similar concepts retrieved per concept in C0.
    threshold : float
        Cut-off: only shared concepts whose weight is strictly greater than
        this value are kept.

    Returns
    -------
    pandas.DataFrame
        One row per (pair, shared concept) with the columns:

        * ``Concept 1``    -- C0(i)
        * ``Concept 2``    -- C0(j)
        * ``Intersection`` -- a concept shared by the neighbours of both
        * ``weight``       -- number of pairs in which the shared concept
          was collected, divided by the total number of C0 pairs

        The frame is empty when fewer than two concepts are given or when
        no pair shares any concept.
    """
    columns = ['Concept 1', 'Concept 2', 'Intersection', 'weight']
    concepts = list(concepts)

    # An empty/omitted exclusion list means "exclude the original concepts".
    if words_to_exclude is None or len(words_to_exclude) == 0:
        words_to_exclude = concepts
    excluded = set(words_to_exclude)

    n_concepts = len(concepts)
    if n_concepts < 2:
        # No pair can be formed, so there is nothing to intersect.
        return pd.DataFrame(columns=columns)

    # Query the model once per distinct concept instead of once per pair.
    # The ordered list drives the result order; the set gives O(1) lookups.
    neighbours = {}
    for concept in concepts:
        if concept not in neighbours:
            similar = concepts_extraction_list(
                model.most_similar(concept, topn=topXsimilarConcepts))
            neighbours[concept] = (similar, set(similar))

    # Creation of triangular data (due to symmetry).
    frames = []
    for i, first in enumerate(concepts):
        first_similar = neighbours[first][0]
        for second in concepts[i + 1:]:
            second_lookup = neighbours[second][1]
            inter = [v for v in first_similar if v in second_lookup]
            inter = inter[:topX_intersect_concepts]
            if inter:
                frames.append(pd.DataFrame({'Concept 1': first,
                                            'Concept 2': second,
                                            'Intersection': inter}))

    if not frames:
        return pd.DataFrame(columns=columns)
    df_pairs = pd.concat(frames, ignore_index=True)

    # Weight creation: occurrences normalised by the number of unordered
    # pairs, n * (n - 1) / 2. Counted before any filtering below.
    pair_count = n_concepts * (n_concepts - 1) / 2
    weights = (df_pairs['Intersection']
               .value_counts()
               .rename_axis('Intersection')
               .reset_index(name='weight'))
    weights['weight'] = weights['weight'] / pair_count

    # Remove words with fewer than 4 characters and any excluded word.
    words = df_pairs['Intersection']
    keep = (words.map(len) > 3) & ~words.isin(excluded)

    df_extension = df_pairs[keep].merge(weights, on='Intersection')
    return df_extension.loc[df_extension['weight'] > threshold]
# Example run: expand the graph around three seed concepts, keeping up to
# 15 shared concepts per pair.
concepts = ['king', 'queen', 'prince']
GEIC(concepts, topX_intersect_concepts=15)
......@@ -39,7 +39,7 @@ def update_base_word(submit_word_button, base_word_input):
State(component_id='base-word-input', component_property='value'),
State(component_id='add-word-input', component_property='value'),
State(component_id='graph-elements-div', component_property='children'),
State(component_id='msx-graph', component_property='tapNodeData'),
State(component_id='msx-graph', component_property='selectedNodeData'),
State(component_id='base-word-div', component_property='children'),
......@@ -74,14 +74,16 @@ def update_graph_elements(submit_word_button, add_word_button, extend_graph_butt
if button_id == 'remove-word-button':
if selected_nodes is not None:
selected_word = selected_nodes['label']
if selected_word in graph.get_all_words() and selected_word != base_word_state:
new_nodes_and_edges = json.dumps(graph.get_nodes_and_edges())
return new_nodes_and_edges
raise PreventUpdate
if len(selected_nodes) > 0:
for selected_node in selected_nodes:
selected_word = selected_node['id']
if selected_word in graph.get_all_words() and selected_word != base_word_state:
raise PreventUpdate
new_nodes_and_edges = json.dumps(graph.get_nodes_and_edges())
return new_nodes_and_edges
raise PreventUpdate
......@@ -5,7 +5,7 @@ import stringdist as sdi
class AssociatedWords:
def __init__(self):
print("\n Word2Vec model is loading.This can take a couple of minutes.")
print("\n Word2Vec model is loading. This can take a couple of minutes.")
self.model = api.load('glove-twitter-200')
print(" Word2Vec model is ready. Enjoy!!!\n")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment