{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "b9df5df0-154a-4802-80f8-101566dc579d", "metadata": { "tags": [ "remove-cell" ] }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "mtcars = pd.read_csv('https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv')" ] }, { "cell_type": "markdown", "id": "f7ef09fe-3841-46ea-900e-ef82a5663a8a", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "## Boolean operations with Pandas\n", "Like NumPy, you can subset and select values from a DataFrame using Boolean values - they work just the same as in NumPy by comparing values in your DataFrame to your chosen quantity." ] }, { "cell_type": "code", "execution_count": 2, "id": "34c59c95-2db6-4ae1-9b0a-97d64d6a8c44", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelmpgcyldisphpdratwtqsecvsamgearcarb
0Mazda RX421.06160.01103.902.62016.460144
1Mazda RX4 Wag21.06160.01103.902.87517.020144
2Datsun 71022.84108.0933.852.32018.611141
17Fiat 12832.4478.7664.082.20019.471141
18Honda Civic30.4475.7524.931.61518.521142
\n", "
" ], "text/plain": [ " model mpg cyl disp hp drat wt qsec vs am gear \\\n", "0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", "1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", "2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", "17 Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 \n", "18 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 \n", "\n", " carb \n", "0 4 \n", "1 4 \n", "2 1 \n", "17 1 \n", "18 2 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelmpgcyldisphpdratwtqsecvsamgearcarb
3Hornet 4 Drive21.46258.01103.083.21519.441031
4Hornet Sportabout18.78360.01753.153.44017.020032
5Valiant18.16225.01052.763.46020.221031
6Duster 36014.38360.02453.213.57015.840034
7Merc 240D24.44146.7623.693.19020.001042
\n", "
" ], "text/plain": [ " model mpg cyl disp hp drat wt qsec vs am gear \\\n", "3 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", "4 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", "5 Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 \n", "6 Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 \n", "7 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 \n", "\n", " carb \n", "3 1 \n", "4 2 \n", "5 1 \n", "6 4 \n", "7 2 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Demonstrate Boolean subsets. Split the data into two - 'am' specifies whether a car is automatic or manual\n", "auto = mtcars['am'] == 1\n", "\n", "display(mtcars[auto].head())\n", "display(mtcars[~auto].head())" ] }, { "cell_type": "code", "execution_count": 3, "id": "1787c4a6-ecd2-47b7-b2cf-fb519ffc80b3", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelmpgcyldisphpdratwtqsecvsamgearcarb
28Ford Pantera L15.88351.02644.223.1714.50154
30Maserati Bora15.08301.03353.543.5714.60158
\n", "
" ], "text/plain": [ " model mpg cyl disp hp drat wt qsec vs am gear \\\n", "28 Ford Pantera L 15.8 8 351.0 264 4.22 3.17 14.5 0 1 5 \n", "30 Maserati Bora 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 \n", "\n", " carb \n", "28 4 \n", "30 8 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Or MPG between certain values, only for cars with 5 gears!\n", "miles_per = ((mtcars['mpg'] < 17.0) | (mtcars['mpg'] > 32.0)) & (mtcars['gear'] == 5)\n", "\n", "display(mtcars[miles_per])" ] }, { "cell_type": "markdown", "id": "b52bca18-da44-4255-aa96-39f7eecdb7dd", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### Cleaning data in Pandas\n", "As with NumPy arrays, there are special ways to remove data with NaN's. Pandas has its own `.isnull()` method which can be chained with other methods to help clean up a dataset." ] }, { "cell_type": "code", "execution_count": 4, "id": "a799da32-edaa-47df-9b6d-5c4f40fd77c6", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelmpgcyldisphpdratwtqsecvsamgearcarb
0Mazda RX421.06.0160.0110.03.90NaNNaN0.01.04.04.0
1Mazda RX4 Wag21.06.0160.0110.03.902.87517.020.01.04.0NaN
2NaN22.84.0108.093.03.852.32018.61NaN1.04.01.0
3Hornet 4 Drive21.46.0258.0NaN3.083.21519.441.0NaN3.01.0
4Hornet Sportabout18.7NaN360.0175.03.153.44017.020.00.03.0NaN
\n", "
" ], "text/plain": [ " model mpg cyl disp hp drat wt qsec vs am \\\n", "0 Mazda RX4 21.0 6.0 160.0 110.0 3.90 NaN NaN 0.0 1.0 \n", "1 Mazda RX4 Wag 21.0 6.0 160.0 110.0 3.90 2.875 17.02 0.0 1.0 \n", "2 NaN 22.8 4.0 108.0 93.0 3.85 2.320 18.61 NaN 1.0 \n", "3 Hornet 4 Drive 21.4 6.0 258.0 NaN 3.08 3.215 19.44 1.0 NaN \n", "4 Hornet Sportabout 18.7 NaN 360.0 175.0 3.15 3.440 17.02 0.0 0.0 \n", "\n", " gear carb \n", "0 4.0 4.0 \n", "1 4.0 NaN \n", "2 4.0 1.0 \n", "3 3.0 1.0 \n", "4 3.0 NaN " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# First add NaN to rows\n", "inds = np.random.choice([True, False], p=[.1, .9], size=mtcars.shape)\n", "\n", "# Copy our original array so we don't overwrite it!\n", "nan_cars = mtcars.copy()\n", "\n", "# Add NaN\n", "nan_cars[inds] = np.NaN\n", "\n", "display(nan_cars.head())" ] }, { "cell_type": "code", "execution_count": 5, "id": "e7bc45ba-6c7a-4c69-899b-8d7557e58586", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelmpgcyldisphpdratwtqsecvsamgearcarb
6Duster 36014.38.0360.0245.03.213.5715.840.00.03.04.0
13Merc 450SLC15.28.0275.8180.03.073.7818.000.00.03.03.0
17Fiat 12832.44.078.766.04.082.2019.471.01.04.01.0
21Dodge Challenger15.58.0318.0150.02.763.5216.870.00.03.02.0
31Volvo 142E21.44.0121.0109.04.112.7818.601.01.04.02.0
\n", "
" ], "text/plain": [ " model mpg cyl disp hp drat wt qsec vs am \\\n", "6 Duster 360 14.3 8.0 360.0 245.0 3.21 3.57 15.84 0.0 0.0 \n", "13 Merc 450SLC 15.2 8.0 275.8 180.0 3.07 3.78 18.00 0.0 0.0 \n", "17 Fiat 128 32.4 4.0 78.7 66.0 4.08 2.20 19.47 1.0 1.0 \n", "21 Dodge Challenger 15.5 8.0 318.0 150.0 2.76 3.52 16.87 0.0 0.0 \n", "31 Volvo 142E 21.4 4.0 121.0 109.0 4.11 2.78 18.60 1.0 1.0 \n", "\n", " gear carb \n", "6 3.0 4.0 \n", "13 3.0 3.0 \n", "17 4.0 1.0 \n", "21 3.0 2.0 \n", "31 4.0 2.0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Use .isnull() to find problem participants with .any()\n", "problems = nan_cars.isnull().any(axis=1)\n", "\n", "# Subset!\n", "display(nan_cars[~problems])" ] }, { "cell_type": "markdown", "id": "d31cf973-a55e-4b79-949c-274077732c71", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### Imputing values and summary statistics\n", "Dropping data is great, but most of the time you don't want to do that. Psychological data can be hard to come by and you want to make the most of it. There is a solution - replacing missing values with some educated guesses. An example of this might be the mean or median.\n", "\n", "In NumPy, computing summary statistics across vectors with missing data (`np.NaN`) will always result in `nan` - the missing value will cause problems. But in Pandas, calling `.mean()`, `.median()`, or `.std()` will *ignore* missing values and compute the statistic on the valid data. \n", "\n", "And once you have those, you can replace the missing values with them. This is made very simple with Pandas, because of the `.fillna()` method. This method accepts either a constant value, or a vector of scores that match the axis of interest (e.g., same number of rows or columns) to fill the gaps with." ] }, { "cell_type": "code", "execution_count": 6, "id": "ca0c89ef-5c9e-4e5d-846d-30f4295186a8", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelmpgcyldisphpdratwtqsecvsamgearcarb
27Lotus Europa9999.09999.095.1113.03.771.51316.91.01.05.02.0
28Ford Pantera L15.88.0351.09999.04.223.17014.50.01.05.04.0
29Ferrari Dino19.76.0145.0175.03.622.7709999.00.09999.05.06.0
30Maserati Bora9999.08.0301.0335.03.543.57014.60.01.09999.08.0
31Volvo 142E21.44.0121.0109.04.112.78018.61.01.04.02.0
\n", "
" ], "text/plain": [ " model mpg cyl disp hp drat wt qsec vs \\\n", "27 Lotus Europa 9999.0 9999.0 95.1 113.0 3.77 1.513 16.9 1.0 \n", "28 Ford Pantera L 15.8 8.0 351.0 9999.0 4.22 3.170 14.5 0.0 \n", "29 Ferrari Dino 19.7 6.0 145.0 175.0 3.62 2.770 9999.0 0.0 \n", "30 Maserati Bora 9999.0 8.0 301.0 335.0 3.54 3.570 14.6 0.0 \n", "31 Volvo 142E 21.4 4.0 121.0 109.0 4.11 2.780 18.6 1.0 \n", "\n", " am gear carb \n", "27 1.0 5.0 2.0 \n", "28 1.0 5.0 4.0 \n", "29 9999.0 5.0 6.0 \n", "30 1.0 9999.0 8.0 \n", "31 1.0 4.0 2.0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Demonstrate fillna with a constant\n", "display(nan_cars.fillna(9999).tail())" ] }, { "cell_type": "code", "execution_count": 7, "id": "274c2736-25b1-45d3-9f8f-5c01b58d423e", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/mw/xt4ddf0j2n3dr4qcr__qhqlr0000gn/T/ipykernel_37590/463963669.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.\n", " mean_scores = nan_cars.mean(axis=0) # altrenatively use `axis='rows'`\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelmpgcyldisphpdratwtqsecvsamgearcarb
27Lotus Europa19.6785716.14814895.1113.0000003.771.51316.901.01.005.0000002.0
28Ford Pantera L15.8000008.000000351.0141.8965524.223.17014.500.01.005.0000004.0
29Ferrari Dino19.7000006.000000145.0175.0000003.622.77017.740.00.445.0000006.0
30Maserati Bora19.6785718.000000301.0335.0000003.543.57014.600.01.003.6071438.0
31Volvo 142E21.4000004.000000121.0109.0000004.112.78018.601.01.004.0000002.0
\n", "
" ], "text/plain": [ " model mpg cyl disp hp drat wt \\\n", "27 Lotus Europa 19.678571 6.148148 95.1 113.000000 3.77 1.513 \n", "28 Ford Pantera L 15.800000 8.000000 351.0 141.896552 4.22 3.170 \n", "29 Ferrari Dino 19.700000 6.000000 145.0 175.000000 3.62 2.770 \n", "30 Maserati Bora 19.678571 8.000000 301.0 335.000000 3.54 3.570 \n", "31 Volvo 142E 21.400000 4.000000 121.0 109.000000 4.11 2.780 \n", "\n", " qsec vs am gear carb \n", "27 16.90 1.0 1.00 5.000000 2.0 \n", "28 14.50 0.0 1.00 5.000000 4.0 \n", "29 17.74 0.0 0.44 5.000000 6.0 \n", "30 14.60 0.0 1.00 3.607143 8.0 \n", "31 18.60 1.0 1.00 4.000000 2.0 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "mpg 19.678571\n", "cyl 6.148148\n", "disp 230.721875\n", "hp 141.896552\n", "drat 3.593226\n", "wt 3.189536\n", "qsec 17.740000\n", "vs 0.413793\n", "am 0.440000\n", "gear 3.607143\n", "carb 2.807692\n", "dtype: float64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Compute means PER VARIABLE (across axis 0)\n", "mean_scores = nan_cars.mean(axis=0) # altrenatively use `axis='rows'`\n", "\n", "# Now fill\n", "fixed = nan_cars.fillna(mean_scores)\n", "\n", "display(fixed.tail())\n", "display(mean_scores.T)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" } }, "nbformat": 4, "nbformat_minor": 5 }