From a828d9930f9be493a2c1f4bc76d7f2134ad9c0ad Mon Sep 17 00:00:00 2001
From: Moene <arnold.moene@wur.nl>
Date: Mon, 10 Jan 2022 23:47:38 +0100
Subject: [PATCH] added survey-helpers for the routines (and removed routines
 from notebook)

---
 programme_name_analysis.ipynb | 199 +---------------------------------
 survey-helpers.py             | 194 +++++++++++++++++++++++++++++++++
 2 files changed, 196 insertions(+), 197 deletions(-)
 create mode 100644 survey-helpers.py

diff --git a/programme_name_analysis.ipynb b/programme_name_analysis.ipynb
index 1e9b4d5..227bc0a 100644
--- a/programme_name_analysis.ipynb
+++ b/programme_name_analysis.ipynb
@@ -17,6 +17,7 @@
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
     "from toggle_cell import toggle_code as hide_code\n",
+    "import importlib; _sh = importlib.import_module('survey-helpers'); q_number, q_name, plot_pies, plot_bar, plot_ranking = _sh.q_number, _sh.q_name, _sh.plot_pies, _sh.plot_bar, _sh.plot_ranking  # a hyphenated file name is not a valid identifier in an import statement\n",
     "%matplotlib inline"
    ]
   },
@@ -28,203 +29,7 @@
    "source": [
     "# Get the data from file and define short-hand names for columns\n",
     "fname='Survey name of programmes BSW & MEE(1-178).xlsx'\n",
-    "df = pd.read_excel(fname)\n",
-    "\n",
-    "hide_code()\n",
-    "q_number = {'curr_prog': 7,\n",
-    " 'did_BSW': 10,\n",
-    " 'attract_name_BSc': 13,\n",
-    " 'year_BSW': 16,\n",
-    " 'year_MEE': 19,\n",
-    " 'year_MEE2': 22,\n",
-    " 'attract_name_BSW': 28,\n",
-    " 'attract_name_MEE': 31,\n",
-    " 'attract_name_MEE2': 34,\n",
-    " 'miss_name_BSW': 37,\n",
-    " 'miss_name_BSW2': 40,\n",
-    " 'miss_name_MEE': 43,\n",
-    " 'miss_name_MEE2': 46,\n",
-    " 'public_words': 49,\n",
-    " 'recgnize_unique': 54,\n",
-    " 'broad_specific': 59,\n",
-    " 'order_synonym': 62,\n",
-    " 'order_environment': 65,\n",
-    " 'order_earth': 68,\n",
-    " 'which_order': 71,\n",
-    " 'prefix': 74,\n",
-    " 'comment_name': 77,\n",
-    " 'comment_survey': 80}\n",
-    "q_name = {'curr_prog': 'Programme',\n",
-    " 'did_BSW': 'Studied BSW before MEE',\n",
-    " 'attract_name_BSc': 13,\n",
-    " 'year_BSW': 'Year in BSW',\n",
-    " 'year_MEE': 'Year in MEE',\n",
-    " 'year_MEE2': 'Year in MEE',\n",
-    " 'attract_name_BSW': 28,\n",
-    " 'attract_name_MEE': 31,\n",
-    " 'attract_name_MEE2': 34,\n",
-    " 'miss_name_BSW': 37,\n",
-    " 'miss_name_BSW2': 40,\n",
-    " 'miss_name_MEE': 43,\n",
-    " 'miss_name_MEE2': 46,\n",
-    " 'public_words': 49,\n",
-    " 'recgnize_unique': 'Recognizable or unique',\n",
-    " 'broad_specific': 'Broad or specific',\n",
-    " 'order_synonym': 62,\n",
-    " 'order_environment': 65,\n",
-    " 'order_earth': 68,\n",
-    " 'which_order': 'Which order preferred',\n",
-    " 'prefix': 'Which prefix',\n",
-    " 'comment_name': 77,\n",
-    " 'comment_survey': 80}\n",
-    "\n",
-    "# Combine some columns\n",
-    "def combine_cols(mydf, ser1_name, ser2_name, sernew_name, my_qnumber, my_qname, description):\n",
-    "    # Add new column\n",
-    "    mydf[sernew_name] = np.NaN\n",
-    "    # Update info dictionaries\n",
-    "    ncols = len(mydf.columns) - 1 # series was added to last column\n",
-    "    q_number.update({sernew_name: ncols})\n",
-    "    q_name.update({sernew_name: description})\n",
-    "    \n",
-    "    ser1 = mydf.iloc[:,q_number[ser1_name]]\n",
-    "    ser2 = mydf.iloc[:,q_number[ser2_name]]\n",
-    "    ser1d = ser1[ser1.notna()]\n",
-    "    ser2d = ser2[ser2.notna()]\n",
-    "    for i in ser1d.index.values:\n",
-    "        mydf.iloc[i, q_number[sernew_name]] = ser1[i]\n",
-    "    for i in ser2d.index.values:\n",
-    "        mydf.iloc[i, q_number[sernew_name]] = ser2[i]\n",
-    "    return mydf, my_qnumber, my_qname\n",
-    "\n",
-    "def plot_pies(vars):\n",
-    "    fig, ax = plt.subplots(1, len(vars))\n",
-    "    for i, var in enumerate(vars):\n",
-    "        num_sample = df.iloc[:,q_number[var]].count()\n",
-    "        counts = df.iloc[:,q_number[var]].value_counts(sort=True)\n",
-    "        counts.sort_index(inplace=True, ascending=False)\n",
-    "        counts.plot(kind=\"pie\", ax=ax[i], label=\"\", figsize=(15,15))\n",
-    "        ax[i].set_title(q_name[var]+\" (N=%i)\"%(num_sample))\n",
-    "    \n",
-    "def plot_bar(mydf, split_by, vars, pref_middle=None, include_all = True):\n",
-    "    split_options = mydf.iloc[:,q_number[split_by]]\n",
-    "    counts = split_options.value_counts()\n",
-    "    split_options = counts.keys().tolist()\n",
-    "    if (include_all):\n",
-    "        split_options.insert(0,'all')     \n",
-    "\n",
-    "    fig, ax = plt.subplots(1, len(vars),figsize=(5*len(vars),5))\n",
-    "    if (len(vars) == 1):\n",
-    "        ax=[ax]\n",
-    "    width = 1/(len(split_options)+1)\n",
-    "    for i, var in enumerate(vars):\n",
-    "        for j, prog in enumerate(split_options):\n",
-    "            if (prog == 'all'):\n",
-    "                series = mydf.iloc[:,q_number[var]]\n",
-    "                n_sample = mydf.iloc[:,q_number[split_by]].notna().sum()\n",
-    "            else:\n",
-    "                series = mydf.iloc[:,q_number[var]][df.iloc[:,q_number[split_by]] == prog]\n",
-    "                n_sample = len(series)\n",
-    "\n",
-    "            counts = series.value_counts()\n",
-    "            if (prog == 'all'):\n",
-    "                counts.sort_index(inplace=True)\n",
-    "                all_keys = counts.keys().tolist()\n",
-    "                if (pref_middle):\n",
-    "                    # Where is the middle one now ?\n",
-    "                    if (len(all_keys) % 2 == 0):\n",
-    "                        print('There is no middle in a list with even number of items')\n",
-    "                    if (pref_middle in all_keys):\n",
-    "                        cur_loc = all_keys.index(pref_middle)\n",
-    "                        mid_loc = int(len(all_keys)/2-0.5)\n",
-    "                        tmp = all_keys[mid_loc]\n",
-    "                        all_keys[mid_loc] = pref_middle\n",
-    "                        all_keys[cur_loc] = tmp\n",
-    "                    else:\n",
-    "                        print('Requested %s not in keys'%(pref_middle))\n",
-    "                    mylist = ['a', 'b', 'c', 'd', 'e']\n",
-    "                    myorder = [3, 2, 0, 1, 4]\n",
-    "                    mylist = [mylist[i] for i in myorder]\n",
-    "                x = np.arange(len(all_keys))\n",
-    "                y = np.zeros(len(all_keys))\n",
-    "            y = y*0\n",
-    "            for k, key in enumerate(all_keys):\n",
-    "                if (key in counts.keys()):\n",
-    "                    y[k] = counts[key]\n",
-    "            n_sample = y.sum()\n",
-    "            ax[i].barh(x + width - j*width, y/n_sample, width, label = prog+' (N=%i)'%(n_sample))\n",
-    "            if (j == len(split_options)-1):\n",
-    "                ax[i].set_yticks(x)\n",
-    "                ax[i].set_yticklabels(all_keys)\n",
-    "                ax[i].set_xlabel('frequency')\n",
-    "\n",
-    "    ax[i].legend(bbox_to_anchor=(1, 1), loc='upper left', title=q_name[split_by])\n",
-    "    if (len(vars) > 1):\n",
-    "        fig.tight_layout()\n",
-    "        \n",
-    "def plot_ranking(mydf, split_by, orders, include_all = True, transpose=False):\n",
-    "    if (transpose and (len(orders)> 1)):\n",
-    "        print(\"Can not use transpose when more than one ordering should be plotted\")\n",
-    "        return\n",
-    "    split_options = mydf.iloc[:,q_number[split_by]]\n",
-    "    counts = split_options.value_counts()\n",
-    "    split_options = counts.keys().tolist()\n",
-    "    if (include_all):\n",
-    "        split_options.insert(0,'all')     \n",
-    "    if (transpose):\n",
-    "            plt.figure(figsize=(6*len(split_options),5))\n",
-    "    for k, prog in enumerate(split_options):\n",
-    "        if (not transpose):\n",
-    "            plt.figure(figsize=(6*len(orders),5))\n",
-    "        if (transpose):\n",
-    "            plt.subplot(1,len(split_options),k+1)\n",
-    "        for i, order in enumerate(orders):\n",
-    "            if (not transpose):\n",
-    "                plt.subplot(1,len(orders),i+1)\n",
-    "            # Get the series for this ordering\n",
-    "            if (prog == 'all'):\n",
-    "                set = mydf.iloc[:,q_number[order]]\n",
-    "                n_sample = mydf.iloc[:,q_number[split_by]].notna().sum()\n",
-    "            else:\n",
-    "                set = mydf.iloc[:,q_number[order]][df.iloc[:,q_number[split_by]] == prog]\n",
-    "                n_sample = set.shape[0]\n",
-    "            # Split into a series for each option\n",
-    "            set = set.str.split(';',expand=True)\n",
-    "            options = set.iloc[0,:]\n",
-    "            # Drop the empty one (after the last semi colon)\n",
-    "            options.drop(options.index[options==''], inplace=True)\n",
-    "            # Sort so that we always have the same order\n",
-    "            options.sort_index(inplace=True)\n",
-    "\n",
-    "            for j, opt in enumerate(options):\n",
-    "                set[opt] = 0\n",
-    "                dummy = np.zeros(set.shape[0], dtype=np.int8)\n",
-    "                for m in range(len(options)):\n",
-    "                    this_rank = (set.iloc[:,m] == options[j])\n",
-    "                    dummy[this_rank] = m+1\n",
-    "                set.iloc[:, set.columns.get_loc(opt)] = dummy\n",
-    "\n",
-    "            for j, opt in enumerate(options):\n",
-    "                counts = ((set[options] == j+1).sum())/n_sample\n",
-    "                counts.sort_index(inplace=True, ascending=False)\n",
-    "                if (j == 0):\n",
-    "                    prev_counts = 0*counts\n",
-    "                plt.barh(counts.keys(), counts.values, left = prev_counts.values, label=\"%i\"%(j+1))\n",
-    "                prev_counts += counts\n",
-    "            plt.legend(bbox_to_anchor=(1, 1), loc='upper left', title='Rank', labelspacing=0.3)\n",
-    "            plt.xlim(0,1)\n",
-    "            plt.xlabel('fraction of respondents')\n",
-    "            plt.tight_layout()\n",
-    "            plt.title('%s: %s'%(q_name[split_by],prog))\n",
-    "    \n",
-    "df, q_number, q_name = combine_cols(df, 'year_MEE', 'year_MEE2', 'year_in_MEE_total', \n",
-    "                                    q_number, q_name, 'Year in MEE')\n",
-    "df, q_number, q_name = combine_cols(df, 'attract_name_MEE', 'attract_name_MEE2', 'attract_name_MEE_total', \n",
-    "                                    q_number, q_name, 'Attractive in name MEE')\n",
-    "df, q_number, q_name = combine_cols(df, 'miss_name_BSW', 'miss_name_BSW2', 'miss_name_BSW_total', \n",
-    "                                    q_number, q_name, 'Missed in name BSW')\n",
-    "df, q_number, q_name = combine_cols(df, 'miss_name_MEE', 'miss_name_MEE2', 'miss_name_MEE_total', \n",
-    "                                    q_number, q_name, 'Missed in name MEE')\n"
+    "df = pd.read_excel(fname)"
    ]
   },
   {
diff --git a/survey-helpers.py b/survey-helpers.py
new file mode 100644
index 0000000..4a9c26c
--- /dev/null
+++ b/survey-helpers.py
@@ -0,0 +1,194 @@
+q_number = {"curr_prog": 7,  # question key -> column position, used with DataFrame.iloc
+            "did_BSW": 10,
+            "attract_name_BSc": 13,
+            "year_BSW": 16,
+            "year_MEE": 19,
+            "year_MEE2": 22,
+            "attract_name_BSW": 28,
+            "attract_name_MEE": 31,
+            "attract_name_MEE2": 34,
+            "miss_name_BSW": 37,
+            "miss_name_BSW2": 40,
+            "miss_name_MEE": 43,
+            "miss_name_MEE2": 46,
+            "public_words": 49,
+            "recgnize_unique": 54,
+            "broad_specific": 59,
+            "order_synonym": 62,
+            "order_environment": 65,
+            "order_earth": 68,
+            "which_order": 71,
+            "prefix": 74,
+            "comment_name": 77,
+            "comment_survey": 80}
+q_name = {'curr_prog': 'Programme',  # question key -> label used in plot titles and legends
+ 'did_BSW': 'Studied BSW before MEE',
+ 'attract_name_BSc': 'What attracted you in the name of your (non- BSW) BSc',
+ 'year_BSW': 'Year in BSW',
+ 'year_MEE': 'Year in MEE 1',
+ 'year_MEE2': 'Year in MEE 2',
+ 'attract_name_BSW': 'What attracted you in the name BSW',
+ 'attract_name_MEE': 'What attracted you in the name MEE 1',  # MEE question, so the label names MEE
+ 'attract_name_MEE2': 'What attracted you in the name MEE 2',
+ 'miss_name_BSW': 'What did you miss in the name BSW 1',
+ 'miss_name_BSW2': 'What did you miss in the name BSW 2',
+ 'miss_name_MEE': 'What did you miss in the name MEE 1',
+ 'miss_name_MEE2': 'What did you miss in the name MEE 2',
+ 'public_words': 'Which words should be included',
+ 'recgnize_unique': 'Recognizable or unique',
+ 'broad_specific': 'Broad or specific',
+ 'order_synonym': 'Order of synonyms',
+ 'order_environment': 'Order of names with environment',
+ 'order_earth': 'Order of names with Earth',
+ 'which_order': 'Which order preferred',
+ 'prefix': 'Which prefix',
+ 'comment_name': 'Comment programme name',
+ 'comment_survey': 'Comment survey'}
+import numpy as np, matplotlib.pyplot as plt  # used by the helper functions below
+# Combine some columns
+def combine_cols(mydf, ser1_name, ser2_name, sernew_name, my_qnumber, my_qname, description):
+    """Append column `sernew_name` to `mydf`, merging two answer columns.
+
+    Values of column `ser2_name` win; where they are missing the value of
+    `ser1_name` is used. The position of the new column and `description` are
+    registered in `my_qnumber` / `my_qname`, which are updated in place.
+    Returns the tuple (mydf, my_qnumber, my_qname).
+    """
+    ser1 = mydf.iloc[:, my_qnumber[ser1_name]]
+    ser2 = mydf.iloc[:, my_qnumber[ser2_name]]
+    # combine_first aligns both series on the index, so rows are matched by
+    # label and the result does not depend on a default 0..N-1 index
+    mydf[sernew_name] = ser2.combine_first(ser1)
+    # Update the info dictionaries that were passed in (not the module globals)
+    my_qnumber.update({sernew_name: mydf.columns.get_loc(sernew_name)})
+    my_qname.update({sernew_name: description})
+    return mydf, my_qnumber, my_qname
+
+def plot_pies(vars, mydf=None):
+    """Draw one pie chart per question key in `vars`, from `mydf` (default: a module-level `df` set by the caller)."""
+    data = df if mydf is None else mydf
+    fig, ax = plt.subplots(1, len(vars), squeeze=False)  # squeeze=False: 2-D axes array, also for one pie
+    for i, var in enumerate(vars):
+        counts = data.iloc[:, q_number[var]].value_counts().sort_index(ascending=False)
+        counts.plot(kind="pie", ax=ax[0, i], label="", figsize=(15, 15))
+        ax[0, i].set_title(q_name[var] + " (N=%i)" % counts.sum())  # value_counts skips NaN, so the sum is the sample size
+    
+def plot_bar(mydf, split_by, vars, pref_middle=None, include_all = True):
+    """Plot grouped horizontal bars of answer frequencies, one panel per question.
+
+    Parameters
+    ----------
+    mydf : DataFrame holding the survey answers
+    split_by : key in q_number of the question used to split the respondents
+    vars : list of keys in q_number of the questions to plot
+    pref_middle : answer that should be placed in the middle of the axis
+    include_all : if True, also plot a bar for all respondents together
+
+    Frequencies are normalised by the number of answers given within each group.
+    """
+    split_col = mydf.iloc[:, q_number[split_by]]
+    split_options = split_col.value_counts().keys().tolist()
+    if include_all:
+        split_options.insert(0, 'all')
+
+    # squeeze=False gives an array of axes, also when there is only one panel
+    fig, ax = plt.subplots(1, len(vars), figsize=(5*len(vars), 5), squeeze=False)
+    ax = ax[0]
+    width = 1/(len(split_options)+1)
+    for i, var in enumerate(vars):
+        answers = mydf.iloc[:, q_number[var]]
+        # Take the answer categories from all respondents, so that they are
+        # also defined when include_all is False
+        all_keys = answers.value_counts().sort_index().keys().tolist()
+        if pref_middle:
+            if len(all_keys) % 2 == 0:
+                print('There is no middle in a list with even number of items')
+            if pref_middle in all_keys:
+                # Swap the preferred answer with the one that is in the middle now
+                cur_loc = all_keys.index(pref_middle)
+                mid_loc = int(len(all_keys)/2-0.5)
+                all_keys[mid_loc], all_keys[cur_loc] = all_keys[cur_loc], all_keys[mid_loc]
+            else:
+                print('Requested %s not in keys'%(pref_middle))
+        x = np.arange(len(all_keys))
+        for j, prog in enumerate(split_options):
+            # Select the group from mydf itself (not from a global data frame)
+            series = answers if prog == 'all' else answers[split_col == prog]
+            counts = series.value_counts()
+            y = np.array([counts.get(key, 0) for key in all_keys], dtype=float)
+            n_sample = y.sum()
+            # A group without any answer is plotted as zeros instead of 0/0
+            freq = y/n_sample if n_sample > 0 else y
+            ax[i].barh(x + width - j*width, freq, width, label='%s (N=%i)'%(prog, n_sample))
+        ax[i].set_yticks(x)
+        ax[i].set_yticklabels(all_keys)
+        ax[i].set_xlabel('frequency')
+
+    # A single legend, attached to the last panel
+    ax[-1].legend(bbox_to_anchor=(1, 1), loc='upper left', title=q_name[split_by])
+    if len(vars) > 1:
+        fig.tight_layout()
+        
+def plot_ranking(mydf, split_by, orders, include_all = True, transpose=False):
+    """Plot stacked horizontal bars showing how often each option received each rank.
+
+    Parameters
+    ----------
+    mydf : DataFrame holding the survey answers
+    split_by : key in q_number of the question used to split the respondents
+    orders : keys in q_number of ranking questions (answers are ';'-separated options)
+    include_all : if True, also make a plot for all respondents together
+    transpose : if True, plot the groups side by side (single ranking question only)
+    """
+    if (transpose and (len(orders)> 1)):
+        print("Can not use transpose when more than one ordering should be plotted")
+        return
+    split_col = mydf.iloc[:, q_number[split_by]]
+    split_options = split_col.value_counts().keys().tolist()
+    if include_all:
+        split_options.insert(0, 'all')
+    if transpose:
+        plt.figure(figsize=(6*len(split_options), 5))
+    for k, prog in enumerate(split_options):
+        if transpose:
+            plt.subplot(1, len(split_options), k+1)
+        else:
+            plt.figure(figsize=(6*len(orders), 5))
+        for i, order in enumerate(orders):
+            if not transpose:
+                plt.subplot(1, len(orders), i+1)
+            answers = mydf.iloc[:, q_number[order]]
+            if prog == 'all':
+                n_sample = split_col.notna().sum()
+            else:
+                # Select the group from mydf itself (not from a global data frame)
+                answers = answers[split_col == prog]
+                n_sample = answers.shape[0]
+            # One column per rank; unanswered rows hold no ranking and are left
+            # out so that the first row always provides the list of options
+            ranked = answers.dropna().str.split(';', expand=True)
+            if ranked.empty:
+                print('No answers for %s: %s'%(q_name[order], prog))
+                continue
+            # Options, without the empty one after the last semi colon
+            options = [opt for opt in ranked.iloc[0, :] if opt]
+            left = 0
+            for j in range(len(options)):
+                # Fraction of respondents that put each option at rank j+1, in a fixed option order
+                counts = (ranked.iloc[:, j].value_counts().reindex(options, fill_value=0)/n_sample).sort_index(ascending=False)
+                plt.barh(counts.keys(), counts.values, left=left, label="%i"%(j+1))
+                left = left + counts.values
+            plt.legend(bbox_to_anchor=(1, 1), loc='upper left', title='Rank', labelspacing=0.3)
+            plt.xlim(0,1)
+            plt.xlabel('fraction of respondents')
+            plt.tight_layout()
+            plt.title('%s: %s'%(q_name[split_by],prog))
+    
+def add_combined_columns(mydf):
+    """Add the merged '*_total' columns to `mydf` (in place) and return it; call once after reading the survey file."""
+    for first, second, new, label in (('year_MEE', 'year_MEE2', 'year_in_MEE_total', 'Year in MEE'),
+                                      ('attract_name_MEE', 'attract_name_MEE2', 'attract_name_MEE_total', 'Attractive in name MEE'),
+                                      ('miss_name_BSW', 'miss_name_BSW2', 'miss_name_BSW_total', 'Missed in name BSW'),
+                                      ('miss_name_MEE', 'miss_name_MEE2', 'miss_name_MEE_total', 'Missed in name MEE')):
+        combine_cols(mydf, first, second, new, q_number, q_name, label)  # no data frame exists at import time, hence a function
+    return mydf
-- 
GitLab