From a828d9930f9be493a2c1f4bc76d7f2134ad9c0ad Mon Sep 17 00:00:00 2001 From: Moene <arnold.moene@wur.nl> Date: Mon, 10 Jan 2022 23:47:38 +0100 Subject: [PATCH] added survey-helpers for the routines (and removed routines from notebook) --- programme_name_analysis.ipynb | 199 +--------------------------------- survey-helpers.py | 194 +++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+), 197 deletions(-) create mode 100644 survey-helpers.py diff --git a/programme_name_analysis.ipynb b/programme_name_analysis.ipynb index 1e9b4d5..227bc0a 100644 --- a/programme_name_analysis.ipynb +++ b/programme_name_analysis.ipynb @@ -17,6 +17,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from toggle_cell import toggle_code as hide_code\n", + "from survey-helpers import q_number, q_name, plot_pies, plot_bar, plot_ranking\n", "%matplotlib inline" ] }, @@ -28,203 +29,7 @@ "source": [ "# Get the data from file and define short-hand names for columns\n", "fname='Survey name of programmes BSW & MEE(1-178).xlsx'\n", - "df = pd.read_excel(fname)\n", - "\n", - "hide_code()\n", - "q_number = {'curr_prog': 7,\n", - " 'did_BSW': 10,\n", - " 'attract_name_BSc': 13,\n", - " 'year_BSW': 16,\n", - " 'year_MEE': 19,\n", - " 'year_MEE2': 22,\n", - " 'attract_name_BSW': 28,\n", - " 'attract_name_MEE': 31,\n", - " 'attract_name_MEE2': 34,\n", - " 'miss_name_BSW': 37,\n", - " 'miss_name_BSW2': 40,\n", - " 'miss_name_MEE': 43,\n", - " 'miss_name_MEE2': 46,\n", - " 'public_words': 49,\n", - " 'recgnize_unique': 54,\n", - " 'broad_specific': 59,\n", - " 'order_synonym': 62,\n", - " 'order_environment': 65,\n", - " 'order_earth': 68,\n", - " 'which_order': 71,\n", - " 'prefix': 74,\n", - " 'comment_name': 77,\n", - " 'comment_survey': 80}\n", - "q_name = {'curr_prog': 'Programme',\n", - " 'did_BSW': 'Studied BSW before MEE',\n", - " 'attract_name_BSc': 13,\n", - " 'year_BSW': 'Year in BSW',\n", - " 'year_MEE': 'Year in MEE',\n", - " 'year_MEE2': 'Year in MEE',\n", - " 'attract_name_BSW': 28,\n", - " 'attract_name_MEE': 31,\n", - " 'attract_name_MEE2': 34,\n", - " 'miss_name_BSW': 37,\n", - " 'miss_name_BSW2': 40,\n", - " 'miss_name_MEE': 43,\n", - " 'miss_name_MEE2': 46,\n", - " 'public_words': 49,\n", - " 'recgnize_unique': 'Recognizable or unique',\n", - " 'broad_specific': 'Broad or specific',\n", - " 'order_synonym': 62,\n", - " 'order_environment': 65,\n", - " 'order_earth': 68,\n", - " 'which_order': 'Which order preferred',\n", - " 'prefix': 'Which prefix',\n", - " 'comment_name': 77,\n", - " 'comment_survey': 80}\n", - "\n", - "# Combine some columns\n", - "def combine_cols(mydf, ser1_name, ser2_name, sernew_name, my_qnumber, my_qname, description):\n", - " # Add new column\n", - " mydf[sernew_name] = np.NaN\n", - " # Update info dictionaries\n", - " ncols = len(mydf.columns) - 1 # series was added to last column\n", - " q_number.update({sernew_name: ncols})\n", - " q_name.update({sernew_name: description})\n", - " \n", - " ser1 = mydf.iloc[:,q_number[ser1_name]]\n", - " ser2 = mydf.iloc[:,q_number[ser2_name]]\n", - " ser1d = ser1[ser1.notna()]\n", - " ser2d = ser2[ser2.notna()]\n", - " for i in ser1d.index.values:\n", - " mydf.iloc[i, q_number[sernew_name]] = ser1[i]\n", - " for i in ser2d.index.values:\n", - " mydf.iloc[i, q_number[sernew_name]] = ser2[i]\n", - " return mydf, my_qnumber, my_qname\n", - "\n", - "def plot_pies(vars):\n", - " fig, ax = plt.subplots(1, len(vars))\n", - " for i, var in enumerate(vars):\n", - " num_sample = df.iloc[:,q_number[var]].count()\n", - " counts = df.iloc[:,q_number[var]].value_counts(sort=True)\n", - " counts.sort_index(inplace=True, ascending=False)\n", - " counts.plot(kind=\"pie\", ax=ax[i], label=\"\", figsize=(15,15))\n", - " ax[i].set_title(q_name[var]+\" (N=%i)\"%(num_sample))\n", - " \n", - "def plot_bar(mydf, split_by, vars, pref_middle=None, include_all = True):\n", - " split_options = mydf.iloc[:,q_number[split_by]]\n", - " counts = split_options.value_counts()\n", - " split_options = counts.keys().tolist()\n", - " if (include_all):\n", - " split_options.insert(0,'all') \n", - "\n", - " fig, ax = plt.subplots(1, len(vars),figsize=(5*len(vars),5))\n", - " if (len(vars) == 1):\n", - " ax=[ax]\n", - " width = 1/(len(split_options)+1)\n", - " for i, var in enumerate(vars):\n", - " for j, prog in enumerate(split_options):\n", - " if (prog == 'all'):\n", - " series = mydf.iloc[:,q_number[var]]\n", - " n_sample = mydf.iloc[:,q_number[split_by]].notna().sum()\n", - " else:\n", - " series = mydf.iloc[:,q_number[var]][df.iloc[:,q_number[split_by]] == prog]\n", - " n_sample = len(series)\n", - "\n", - " counts = series.value_counts()\n", - " if (prog == 'all'):\n", - " counts.sort_index(inplace=True)\n", - " all_keys = counts.keys().tolist()\n", - " if (pref_middle):\n", - " # Where is the middle one now ?\n", - " if (len(all_keys) % 2 == 0):\n", - " print('There is no middle in a list with even number of items')\n", - " if (pref_middle in all_keys):\n", - " cur_loc = all_keys.index(pref_middle)\n", - " mid_loc = int(len(all_keys)/2-0.5)\n", - " tmp = all_keys[mid_loc]\n", - " all_keys[mid_loc] = pref_middle\n", - " all_keys[cur_loc] = tmp\n", - " else:\n", - " print('Requested %s not in keys'%(pref_middle))\n", - " mylist = ['a', 'b', 'c', 'd', 'e']\n", - " myorder = [3, 2, 0, 1, 4]\n", - " mylist = [mylist[i] for i in myorder]\n", - " x = np.arange(len(all_keys))\n", - " y = np.zeros(len(all_keys))\n", - " y = y*0\n", - " for k, key in enumerate(all_keys):\n", - " if (key in counts.keys()):\n", - " y[k] = counts[key]\n", - " n_sample = y.sum()\n", - " ax[i].barh(x + width - j*width, y/n_sample, width, label = prog+' (N=%i)'%(n_sample))\n", - " if (j == len(split_options)-1):\n", - " ax[i].set_yticks(x)\n", - " ax[i].set_yticklabels(all_keys)\n", - " ax[i].set_xlabel('frequency')\n", - "\n", - " ax[i].legend(bbox_to_anchor=(1, 1), loc='upper left', title=q_name[split_by])\n", - " if (len(vars) > 1):\n", - " fig.tight_layout()\n", - " \n", - "def plot_ranking(mydf, split_by, orders, include_all = True, transpose=False):\n", - " if (transpose and (len(orders)> 1)):\n", - " print(\"Can not use transpose when more than one ordering should be plotted\")\n", - " return\n", - " split_options = mydf.iloc[:,q_number[split_by]]\n", - " counts = split_options.value_counts()\n", - " split_options = counts.keys().tolist()\n", - " if (include_all):\n", - " split_options.insert(0,'all') \n", - " if (transpose):\n", - " plt.figure(figsize=(6*len(split_options),5))\n", - " for k, prog in enumerate(split_options):\n", - " if (not transpose):\n", - " plt.figure(figsize=(6*len(orders),5))\n", - " if (transpose):\n", - " plt.subplot(1,len(split_options),k+1)\n", - " for i, order in enumerate(orders):\n", - " if (not transpose):\n", - " plt.subplot(1,len(orders),i+1)\n", - " # Get the series for this ordering\n", - " if (prog == 'all'):\n", - " set = mydf.iloc[:,q_number[order]]\n", - " n_sample = mydf.iloc[:,q_number[split_by]].notna().sum()\n", - " else:\n", - " set = mydf.iloc[:,q_number[order]][df.iloc[:,q_number[split_by]] == prog]\n", - " n_sample = set.shape[0]\n", - " # Split into a series for each option\n", - " set = set.str.split(';',expand=True)\n", - " options = set.iloc[0,:]\n", - " # Drop the empty one (after the last semi colon)\n", - " options.drop(options.index[options==''], inplace=True)\n", - " # Sort so that we always have the same order\n", - " options.sort_index(inplace=True)\n", - "\n", - " for j, opt in enumerate(options):\n", - " set[opt] = 0\n", - " dummy = np.zeros(set.shape[0], dtype=np.int8)\n", - " for m in range(len(options)):\n", - " this_rank = (set.iloc[:,m] == options[j])\n", - " dummy[this_rank] = m+1\n", - " set.iloc[:, set.columns.get_loc(opt)] = dummy\n", - "\n", - " for j, opt in enumerate(options):\n", - " counts = ((set[options] == j+1).sum())/n_sample\n", - " counts.sort_index(inplace=True, ascending=False)\n", - " if (j == 0):\n", - " prev_counts = 0*counts\n", - " plt.barh(counts.keys(), counts.values, left = prev_counts.values, label=\"%i\"%(j+1))\n", - " prev_counts += counts\n", - " plt.legend(bbox_to_anchor=(1, 1), loc='upper left', title='Rank', labelspacing=0.3)\n", - " plt.xlim(0,1)\n", - " plt.xlabel('fraction of respondents')\n", - " plt.tight_layout()\n", - " plt.title('%s: %s'%(q_name[split_by],prog))\n", - " \n", - "df, q_number, q_name = combine_cols(df, 'year_MEE', 'year_MEE2', 'year_in_MEE_total', \n", - " q_number, q_name, 'Year in MEE')\n", - "df, q_number, q_name = combine_cols(df, 'attract_name_MEE', 'attract_name_MEE2', 'attract_name_MEE_total', \n", - " q_number, q_name, 'Attractive in name MEE')\n", - "df, q_number, q_name = combine_cols(df, 'miss_name_BSW', 'miss_name_BSW2', 'miss_name_BSW_total', \n", - " q_number, q_name, 'Missed in name BSW')\n", - "df, q_number, q_name = combine_cols(df, 'miss_name_MEE', 'miss_name_MEE2', 'miss_name_MEE_total', \n", - " q_number, q_name, 'Missed in name MEE')\n" + "df = pd.read_excel(fname)" ] }, { diff --git a/survey-helpers.py b/survey-helpers.py new file mode 100644 index 0000000..4a9c26c --- /dev/null +++ b/survey-helpers.py @@ -0,0 +1,194 @@ +q_number = {'curr_prog': 7, + 'did_BSW': 10, + 'attract_name_BSc': 13, + 'year_BSW': 16, + 'year_MEE': 19, + 'year_MEE2': 22, + 'attract_name_BSW': 28, + 'attract_name_MEE': 31, + 'attract_name_MEE2': 34, + 'miss_name_BSW': 37, + 'miss_name_BSW2': 40, + 'miss_name_MEE': 43, + 'miss_name_MEE2': 46, + 'public_words': 49, + 'recgnize_unique': 54, + 'broad_specific': 59, + 'order_synonym': 62, + 'order_environment': 65, + 'order_earth': 68, + 'which_order': 71, + 'prefix': 74, + 'comment_name': 77, + 'comment_survey': 80} +q_name = {'curr_prog': 'Programme', + 'did_BSW': 'Studied BSW before MEE', + 'attract_name_BSc': 'What attracted you in the name of your (non- BSW) BSc', + 'year_BSW': 'Year in BSW', + 'year_MEE': 'Year in MEE 1', + 'year_MEE2': 'Year in MEE 2', + 'attract_name_BSW': 'What attracted you in the name BSW', + 'attract_name_MEE': 'What attracted you in the name BSW 1', + 'attract_name_MEE2': 'What attracted you in the name BSW 2', + 'miss_name_BSW': 'What did you miss in the name BSW 1', + 'miss_name_BSW2': 'What did you miss in the name BSW 2', + 'miss_name_MEE': 'What did you miss in the name MEE 1', + 'miss_name_MEE2': 'What did you miss in the name MEE 2', + 'public_words': 'Which words should be included', + 'recgnize_unique': 'Recognizable or unique', + 'broad_specific': 'Broad or specific', + 'order_synonym': 'Order of synonyms', + 'order_environment': 'Order of names with environment', + 'order_earth': 'Order of names with Earth', + 'which_order': 'Which order preferred', + 'prefix': 'Which prefix', + 'comment_name': 'Comment programme name', + 'comment_survey': 'Comment survey'} + +# Combine some columns +def combine_cols(mydf, ser1_name, ser2_name, sernew_name, my_qnumber, my_qname, description): + # Add new column + mydf[sernew_name] = np.NaN + # Update info dictionaries + ncols = len(mydf.columns) - 1 # series was added to last column + q_number.update({sernew_name: ncols}) + q_name.update({sernew_name: description}) + + ser1 = mydf.iloc[:,q_number[ser1_name]] + ser2 = mydf.iloc[:,q_number[ser2_name]] + ser1d = ser1[ser1.notna()] + ser2d = ser2[ser2.notna()] + for i in ser1d.index.values: + mydf.iloc[i, q_number[sernew_name]] = ser1[i] + for i in ser2d.index.values: + mydf.iloc[i, q_number[sernew_name]] = ser2[i] + return mydf, my_qnumber, my_qname + +def plot_pies(vars): + fig, ax = plt.subplots(1, len(vars)) + for i, var in enumerate(vars): + num_sample = df.iloc[:,q_number[var]].count() + counts = df.iloc[:,q_number[var]].value_counts(sort=True) + counts.sort_index(inplace=True, ascending=False) + counts.plot(kind="pie", ax=ax[i], label="", figsize=(15,15)) + ax[i].set_title(q_name[var]+" (N=%i)"%(num_sample)) + +def plot_bar(mydf, split_by, vars, pref_middle=None, include_all = True): + split_options = mydf.iloc[:,q_number[split_by]] + counts = split_options.value_counts() + split_options = counts.keys().tolist() + if (include_all): + split_options.insert(0,'all') + + fig, ax = plt.subplots(1, len(vars),figsize=(5*len(vars),5)) + if (len(vars) == 1): + ax=[ax] + width = 1/(len(split_options)+1) + for i, var in enumerate(vars): + for j, prog in enumerate(split_options): + if (prog == 'all'): + series = mydf.iloc[:,q_number[var]] + n_sample = mydf.iloc[:,q_number[split_by]].notna().sum() + else: + series = mydf.iloc[:,q_number[var]][df.iloc[:,q_number[split_by]] == prog] + n_sample = len(series) + + counts = series.value_counts() + if (prog == 'all'): + counts.sort_index(inplace=True) + all_keys = counts.keys().tolist() + if (pref_middle): + # Where is the middle one now ? + if (len(all_keys) % 2 == 0): + print('There is no middle in a list with even number of items') + if (pref_middle in all_keys): + cur_loc = all_keys.index(pref_middle) + mid_loc = int(len(all_keys)/2-0.5) + tmp = all_keys[mid_loc] + all_keys[mid_loc] = pref_middle + all_keys[cur_loc] = tmp + else: + print('Requested %s not in keys'%(pref_middle)) + mylist = ['a', 'b', 'c', 'd', 'e'] + myorder = [3, 2, 0, 1, 4] + mylist = [mylist[i] for i in myorder] + x = np.arange(len(all_keys)) + y = np.zeros(len(all_keys)) + y = y*0 + for k, key in enumerate(all_keys): + if (key in counts.keys()): + y[k] = counts[key] + n_sample = y.sum() + ax[i].barh(x + width - j*width, y/n_sample, width, label = prog+' (N=%i)'%(n_sample)) + if (j == len(split_options)-1): + ax[i].set_yticks(x) + ax[i].set_yticklabels(all_keys) + ax[i].set_xlabel('frequency') + + ax[i].legend(bbox_to_anchor=(1, 1), loc='upper left', title=q_name[split_by]) + if (len(vars) > 1): + fig.tight_layout() + +def plot_ranking(mydf, split_by, orders, include_all = True, transpose=False): + if (transpose and (len(orders)> 1)): + print("Can not use transpose when more than one ordering should be plotted") + return + split_options = mydf.iloc[:,q_number[split_by]] + counts = split_options.value_counts() + split_options = counts.keys().tolist() + if (include_all): + split_options.insert(0,'all') + if (transpose): + plt.figure(figsize=(6*len(split_options),5)) + for k, prog in enumerate(split_options): + if (not transpose): + plt.figure(figsize=(6*len(orders),5)) + if (transpose): + plt.subplot(1,len(split_options),k+1) + for i, order in enumerate(orders): + if (not transpose): + plt.subplot(1,len(orders),i+1) + # Get the series for this ordering + if (prog == 'all'): + set = mydf.iloc[:,q_number[order]] + n_sample = mydf.iloc[:,q_number[split_by]].notna().sum() + else: + set = mydf.iloc[:,q_number[order]][df.iloc[:,q_number[split_by]] == prog] + n_sample = set.shape[0] + # Split into a series for each option + set = set.str.split(';',expand=True) + options = set.iloc[0,:] + # Drop the empty one (after the last semi colon) + options.drop(options.index[options==''], inplace=True) + # Sort so that we always have the same order + options.sort_index(inplace=True) + + for j, opt in enumerate(options): + set[opt] = 0 + dummy = np.zeros(set.shape[0], dtype=np.int8) + for m in range(len(options)): + this_rank = (set.iloc[:,m] == options[j]) + dummy[this_rank] = m+1 + set.iloc[:, set.columns.get_loc(opt)] = dummy + + for j, opt in enumerate(options): + counts = ((set[options] == j+1).sum())/n_sample + counts.sort_index(inplace=True, ascending=False) + if (j == 0): + prev_counts = 0*counts + plt.barh(counts.keys(), counts.values, left = prev_counts.values, label="%i"%(j+1)) + prev_counts += counts + plt.legend(bbox_to_anchor=(1, 1), loc='upper left', title='Rank', labelspacing=0.3) + plt.xlim(0,1) + plt.xlabel('fraction of respondents') + plt.tight_layout() + plt.title('%s: %s'%(q_name[split_by],prog)) + +df, q_number, q_name = combine_cols(df, 'year_MEE', 'year_MEE2', 'year_in_MEE_total', + q_number, q_name, 'Year in MEE') +df, q_number, q_name = combine_cols(df, 'attract_name_MEE', 'attract_name_MEE2', 'attract_name_MEE_total', + q_number, q_name, 'Attractive in name MEE') +df, q_number, q_name = combine_cols(df, 'miss_name_BSW', 'miss_name_BSW2', 'miss_name_BSW_total', + q_number, q_name, 'Missed in name BSW') +df, q_number, q_name = combine_cols(df, 'miss_name_MEE', 'miss_name_MEE2', 'miss_name_MEE_total', + q_number, q_name, 'Missed in name MEE') -- GitLab