diff --git a/programme_name_analysis.ipynb b/programme_name_analysis.ipynb index 53582157a27cb02e9092f28f097f60962b21e5b1..68bd596e4f2dd97d63a639a1a30faff15b2eecc6 100644 --- a/programme_name_analysis.ipynb +++ b/programme_name_analysis.ipynb @@ -21,6 +21,13 @@ "%matplotlib inline" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read the data" + ] + }, { "cell_type": "code", "execution_count": null, @@ -30,10 +37,30 @@ "# Get the data from file\n", "fname='Survey name of programmes BSW & MEE(1-178).xlsx'\n", "df = pd.read_excel(fname)\n", - "df, q_number, q_name = extend_names(df, q_number, q_name, survey='students')\n", + "# Update/extend the data set by combining a number of columns\n", + "df, q_number, q_name = extend_names(df, q_number, q_name, survey='students')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To simplify the analysis we can access various columns in the dataset through a short-hand name. You can get the available names using `print(q_name.keys())`.\n", "\n", + "For plotting we have three routines:\n", + "* `plot_pies(df, vars)`: plot one or more pie charts; the data plotted are indicated with the variable names in the list `vars`\n", + "* `plot_bar(df, vars, split_by, pref_middle=None, include_all=True)`: plot one or more bar graphs; the variable(s) of which the bar graphs are plotted are given in list `vars`, the data are grouped by the variable `split_by`, you can choose to also include an unseparated bar graph (`include_all=True`), and you can select a specific option to be the central one (using `pref_middle=...`, e.g. `pref_middle='neutral'`).\n", + "* `plot_ranking(df, orders, split_by, include_all=True)`: plot the relative frequency of a certain ranking for one or more variables given in list `orders`, the data are grouped by the variable `split_by`, you can choose to also include an unseparated bar graph (`include_all=True`).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{}, + "outputs": [], + "source": [ "# To show the short-hand names for the various columns: in q_names\n", - "print(q_names.keys())" + "# print(q_name.keys())" ] }, { @@ -54,7 +81,7 @@ "outputs": [], "source": [ "vars = [\"curr_prog\",\"year_BSW\",\"year_in_MEE_total\", \"did_BSW\"] \n", - "plot_pies(vars)" + "plot_pies(df, vars)" ] }, { @@ -78,7 +105,7 @@ "# Which variables to look at \n", "vars = [\"recgnize_unique\",\"broad_specific\"]\n", "# Split data by current programma and include the total number (and put 'neutral' nicely in the middle)\n", - "plot_bar(df, 'curr_prog', vars, pref_middle='neutral')" + "plot_bar(df, vars, 'curr_prog', pref_middle='neutral')" ] }, { @@ -102,7 +129,7 @@ "# Which rankings to look at \n", "orders = ['order_synonym', 'order_environment', 'order_earth']\n", "# Split data by current programma and include the total number\n", - "plot_ranking(df, 'curr_prog', orders, include_all=True)" + "plot_ranking(df, orders, 'curr_prog', include_all=True)" ] }, { @@ -123,7 +150,7 @@ "# Which rankings to look at \n", "orders = ['which_order']\n", "# Split data by current programma, include the total number and put the graphs next to each other\n", - "plot_ranking(df, 'curr_prog', orders, include_all=True, transpose=True)" + "plot_ranking(df, orders, 'curr_prog', include_all=True, transpose=True)" ] }, { @@ -143,7 +170,7 @@ "# Which variables to look at \n", "vars = [\"prefix\"] \n", "# Split data by current programma, include the total numbe\n", - "plot_bar(df, 'curr_prog', vars, include_all=True)" + "plot_bar(df, vars, 'curr_prog', include_all=True)" ] }, { diff --git a/survey_helpers.py b/survey_helpers.py index 52ba6ab9159137aa28804acf6c2dc1fab00721db..7ca0dd737ea621027bae9f7a814c1fc3ab9a3a8f 100644 --- a/survey_helpers.py +++ b/survey_helpers.py @@ -66,16 +66,16 @@ def combine_cols(mydf, ser1_name, ser2_name, sernew_name, my_qnumber, my_qname, mydf.iloc[i, q_number[sernew_name]] = ser2[i] return mydf, my_qnumber, my_qname -def 
plot_pies(vars): +def plot_pies(mydf, vars): fig, ax = plt.subplots(1, len(vars)) for i, var in enumerate(vars): - num_sample = df.iloc[:,q_number[var]].count() - counts = df.iloc[:,q_number[var]].value_counts(sort=True) + num_sample = mydf.iloc[:,q_number[var]].count() + counts = mydf.iloc[:,q_number[var]].value_counts(sort=True) counts.sort_index(inplace=True, ascending=False) counts.plot(kind="pie", ax=ax[i], label="", figsize=(15,15)) ax[i].set_title(q_name[var]+" (N=%i)"%(num_sample)) -def plot_bar(mydf, split_by, vars, pref_middle=None, include_all = True): +def plot_bar(mydf, vars, split_by, pref_middle=None, include_all = True): split_options = mydf.iloc[:,q_number[split_by]] counts = split_options.value_counts() split_options = counts.keys().tolist() @@ -131,7 +131,7 @@ def plot_bar(mydf, split_by, vars, pref_middle=None, include_all = True): if (len(vars) > 1): fig.tight_layout() -def plot_ranking(mydf, split_by, orders, include_all = True, transpose=False): +def plot_ranking(mydf, orders, split_by, include_all = True, transpose=False): if (transpose and (len(orders)> 1)): print("Can not use transpose when more than one ordering should be plotted") return