{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Creating Columns as Functions of Existing Columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting ready\n", "\n", "Import `pandas` and `numpy` and create a `DataFrame` to work with" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
2020-01-010.683402-0.5309332.211594
2020-01-020.4922850.4224950.047873
2020-01-03-0.287670-1.7125410.665600
2020-01-04-1.206087-0.4797460.112255
2020-01-050.3842780.314534-1.529952
\n", "
" ], "text/plain": [ " A B C\n", "2020-01-01 0.683402 -0.530933 2.211594\n", "2020-01-02 0.492285 0.422495 0.047873\n", "2020-01-03 -0.287670 -1.712541 0.665600\n", "2020-01-04 -1.206087 -0.479746 0.112255\n", "2020-01-05 0.384278 0.314534 -1.529952" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates = pd.date_range('1/1/2020', periods=8)\n", "data = np.random.randn(8, 3)\n", "df = pd.DataFrame(data,\n", " index=dates, columns=['A', 'B', 'C'])\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How to do it\n", "\n", "### Using vectorized operations\n", "\n", "1. Create a new column as a function of an existing column" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df['Squared_A'] = df['A']**2" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "df['CosA'] = np.cos(df['A'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "2. Create a new column as a function of two or more existing columns using vectorized operation such as product and sum" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCSquared_ACosASum_ABProd_AB
2020-01-010.683402-0.5309332.2115940.4670380.7754290.152469-0.362841
2020-01-020.4922850.4224950.0478730.2423440.8812550.9147790.207988
2020-01-03-0.287670-1.7125410.6656000.0827540.958908-2.0002100.492646
2020-01-04-1.206087-0.4797460.1122551.4546450.356678-1.6858320.578615
2020-01-050.3842780.314534-1.5299520.1476690.9270700.6988110.120868
2020-01-061.0646570.5914280.2505441.1334940.4848051.6560850.629668
2020-01-07-0.459051-0.2873551.9717430.2107280.896473-0.7464060.131911
2020-01-080.992138-0.5124990.7116910.9843380.5469010.479639-0.508470
\n", "
" ], "text/plain": [ " A B C Squared_A CosA Sum_AB \\\n", "2020-01-01 0.683402 -0.530933 2.211594 0.467038 0.775429 0.152469 \n", "2020-01-02 0.492285 0.422495 0.047873 0.242344 0.881255 0.914779 \n", "2020-01-03 -0.287670 -1.712541 0.665600 0.082754 0.958908 -2.000210 \n", "2020-01-04 -1.206087 -0.479746 0.112255 1.454645 0.356678 -1.685832 \n", "2020-01-05 0.384278 0.314534 -1.529952 0.147669 0.927070 0.698811 \n", "2020-01-06 1.064657 0.591428 0.250544 1.133494 0.484805 1.656085 \n", "2020-01-07 -0.459051 -0.287355 1.971743 0.210728 0.896473 -0.746406 \n", "2020-01-08 0.992138 -0.512499 0.711691 0.984338 0.546901 0.479639 \n", "\n", " Prod_AB \n", "2020-01-01 -0.362841 \n", "2020-01-02 0.207988 \n", "2020-01-03 0.492646 \n", "2020-01-04 0.578615 \n", "2020-01-05 0.120868 \n", "2020-01-06 0.629668 \n", "2020-01-07 0.131911 \n", "2020-01-08 -0.508470 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Sum_AB'] = df['A'] + df['B']\n", "df['Prod_AB'] = df['A'] * df['B']\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "### Using `apply`\n", "\n", "\n", "3. Create a new column evaluating an ad-hoc function on existing columns. To do this we need to \n", "\n", "- Define the function that we want to evaluate\n", "- Use the method `apply` combined with a `lambda` expression" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCSquared_ACosASum_ABProd_ABfunction_ABC
2020-01-010.683402-0.5309332.2115940.4670380.7754290.152469-0.3628411.039929
2020-01-020.4922850.4224950.0478730.2423440.8812550.9147790.2079882.792179
2020-01-03-0.287670-1.7125410.6656000.0827540.958908-2.0002100.4926461.604184
2020-01-04-1.206087-0.4797460.1122551.4546450.356678-1.6858320.5786152.237496
2020-01-050.3842780.314534-1.5299520.1476690.9270700.6988110.1208681.918843
2020-01-061.0646570.5914280.2505441.1334940.4848051.6560850.6296682.283728
2020-01-07-0.459051-0.2873551.9717430.2107280.896473-0.7464060.1319111.465180
2020-01-080.992138-0.5124990.7116910.9843380.5469010.479639-0.5084702.175681
\n", "
" ], "text/plain": [ " A B C Squared_A CosA Sum_AB \\\n", "2020-01-01 0.683402 -0.530933 2.211594 0.467038 0.775429 0.152469 \n", "2020-01-02 0.492285 0.422495 0.047873 0.242344 0.881255 0.914779 \n", "2020-01-03 -0.287670 -1.712541 0.665600 0.082754 0.958908 -2.000210 \n", "2020-01-04 -1.206087 -0.479746 0.112255 1.454645 0.356678 -1.685832 \n", "2020-01-05 0.384278 0.314534 -1.529952 0.147669 0.927070 0.698811 \n", "2020-01-06 1.064657 0.591428 0.250544 1.133494 0.484805 1.656085 \n", "2020-01-07 -0.459051 -0.287355 1.971743 0.210728 0.896473 -0.746406 \n", "2020-01-08 0.992138 -0.512499 0.711691 0.984338 0.546901 0.479639 \n", "\n", " Prod_AB function_ABC \n", "2020-01-01 -0.362841 1.039929 \n", "2020-01-02 0.207988 2.792179 \n", "2020-01-03 0.492646 1.604184 \n", "2020-01-04 0.578615 2.237496 \n", "2020-01-05 0.120868 1.918843 \n", "2020-01-06 0.629668 2.283728 \n", "2020-01-07 0.131911 1.465180 \n", "2020-01-08 -0.508470 2.175681 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from math import sin, cos\n", "\n", "def my_function(x, y, z):\n", " if x + y >3:\n", " return sin(x) + sin(y) + sin(z)\n", " else:\n", " return cos(x) + cos(y) + cos(z)\n", "\n", "\n", "df['function_ABC'] = df.apply(lambda x: my_function(x['A'], x['B'], x['C']), axis=1)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "4. Create a column which maps the values of a column to a set of colors depending on some boundaries. \n", "For example, we are going to map column `C` as follows:\n", "\n", "- 'black' for values < -3\n", "- 'yellow' for values in [-3, 3]\n", "- 'green' for values >3\n", " " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCSquared_ASum_ABProd_ABfunction_ABCcolor
2020-01-010.7089741.6538390.4603110.5026442.3628131.1725292.091836yellow
2020-01-02-0.2446300.0636130.3094030.059844-0.181016-0.0155620.125863yellow
2020-01-030.3238680.4387680.5700300.1048910.7626370.1421031.282718yellow
2020-01-04-0.440328-0.2766951.0789060.193889-0.7170240.1218370.182027yellow
2020-01-050.5042050.6004841.3128370.2542231.1046890.3027672.015066yellow
2020-01-06-1.1486531.1194761.3745911.319403-0.029177-1.2858890.968473yellow
2020-01-07-1.795673-1.0714520.6957523.224442-2.8671251.923978-1.211755yellow
2020-01-08-1.455186-0.705272-0.0828832.117567-2.1604591.026303-1.724354yellow
\n", "
" ], "text/plain": [ " A B C Squared_A Sum_AB Prod_AB \\\n", "2020-01-01 0.708974 1.653839 0.460311 0.502644 2.362813 1.172529 \n", "2020-01-02 -0.244630 0.063613 0.309403 0.059844 -0.181016 -0.015562 \n", "2020-01-03 0.323868 0.438768 0.570030 0.104891 0.762637 0.142103 \n", "2020-01-04 -0.440328 -0.276695 1.078906 0.193889 -0.717024 0.121837 \n", "2020-01-05 0.504205 0.600484 1.312837 0.254223 1.104689 0.302767 \n", "2020-01-06 -1.148653 1.119476 1.374591 1.319403 -0.029177 -1.285889 \n", "2020-01-07 -1.795673 -1.071452 0.695752 3.224442 -2.867125 1.923978 \n", "2020-01-08 -1.455186 -0.705272 -0.082883 2.117567 -2.160459 1.026303 \n", "\n", " function_ABC color \n", "2020-01-01 2.091836 yellow \n", "2020-01-02 0.125863 yellow \n", "2020-01-03 1.282718 yellow \n", "2020-01-04 0.182027 yellow \n", "2020-01-05 2.015066 yellow \n", "2020-01-06 0.968473 yellow \n", "2020-01-07 -1.211755 yellow \n", "2020-01-08 -1.724354 yellow " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def color_function(x):\n", " if x < -3:\n", " return 'black'\n", " elif x > 3:\n", " return 'green'\n", " else: \n", " return 'yellow'\n", "\n", "\n", "df['color'] = df.apply(lambda x: color_function(x['C']), axis=1)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using conditions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "5. Create a column using a condition. For example, checking of the value of a column is positive or negative." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCSquared_ASum_ABProd_ABfunction_ABCcolorFlag
2020-01-010.7089741.6538390.4603110.5026442.3628131.1725292.091836yellowTrue
2020-01-02-0.2446300.0636130.3094030.059844-0.181016-0.0155620.125863yellowFalse
2020-01-030.3238680.4387680.5700300.1048910.7626370.1421031.282718yellowTrue
2020-01-04-0.440328-0.2766951.0789060.193889-0.7170240.1218370.182027yellowFalse
2020-01-050.5042050.6004841.3128370.2542231.1046890.3027672.015066yellowTrue
2020-01-06-1.1486531.1194761.3745911.319403-0.029177-1.2858890.968473yellowFalse
2020-01-07-1.795673-1.0714520.6957523.224442-2.8671251.923978-1.211755yellowFalse
2020-01-08-1.455186-0.705272-0.0828832.117567-2.1604591.026303-1.724354yellowFalse
\n", "
" ], "text/plain": [ " A B C Squared_A Sum_AB Prod_AB \\\n", "2020-01-01 0.708974 1.653839 0.460311 0.502644 2.362813 1.172529 \n", "2020-01-02 -0.244630 0.063613 0.309403 0.059844 -0.181016 -0.015562 \n", "2020-01-03 0.323868 0.438768 0.570030 0.104891 0.762637 0.142103 \n", "2020-01-04 -0.440328 -0.276695 1.078906 0.193889 -0.717024 0.121837 \n", "2020-01-05 0.504205 0.600484 1.312837 0.254223 1.104689 0.302767 \n", "2020-01-06 -1.148653 1.119476 1.374591 1.319403 -0.029177 -1.285889 \n", "2020-01-07 -1.795673 -1.071452 0.695752 3.224442 -2.867125 1.923978 \n", "2020-01-08 -1.455186 -0.705272 -0.082883 2.117567 -2.160459 1.026303 \n", "\n", " function_ABC color Flag \n", "2020-01-01 2.091836 yellow True \n", "2020-01-02 0.125863 yellow False \n", "2020-01-03 1.282718 yellow True \n", "2020-01-04 0.182027 yellow False \n", "2020-01-05 2.015066 yellow True \n", "2020-01-06 0.968473 yellow False \n", "2020-01-07 -1.211755 yellow False \n", "2020-01-08 -1.724354 yellow False " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Flag'] = df['A'] >= 0 \n", "df " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using `map`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "6. Map a column with boolean values to color names in string format. For example, map `True` to color 'green' and `False` to color 'red'." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCSquared_ASum_ABProd_ABfunction_ABCcolorFlagFlag_to_Colors
2020-01-010.7089741.6538390.4603110.5026442.3628131.1725292.091836yellowTruegreen
2020-01-02-0.2446300.0636130.3094030.059844-0.181016-0.0155620.125863yellowFalsered
2020-01-030.3238680.4387680.5700300.1048910.7626370.1421031.282718yellowTruegreen
2020-01-04-0.440328-0.2766951.0789060.193889-0.7170240.1218370.182027yellowFalsered
2020-01-050.5042050.6004841.3128370.2542231.1046890.3027672.015066yellowTruegreen
2020-01-06-1.1486531.1194761.3745911.319403-0.029177-1.2858890.968473yellowFalsered
2020-01-07-1.795673-1.0714520.6957523.224442-2.8671251.923978-1.211755yellowFalsered
2020-01-08-1.455186-0.705272-0.0828832.117567-2.1604591.026303-1.724354yellowFalsered
\n", "
" ], "text/plain": [ " A B C Squared_A Sum_AB Prod_AB \\\n", "2020-01-01 0.708974 1.653839 0.460311 0.502644 2.362813 1.172529 \n", "2020-01-02 -0.244630 0.063613 0.309403 0.059844 -0.181016 -0.015562 \n", "2020-01-03 0.323868 0.438768 0.570030 0.104891 0.762637 0.142103 \n", "2020-01-04 -0.440328 -0.276695 1.078906 0.193889 -0.717024 0.121837 \n", "2020-01-05 0.504205 0.600484 1.312837 0.254223 1.104689 0.302767 \n", "2020-01-06 -1.148653 1.119476 1.374591 1.319403 -0.029177 -1.285889 \n", "2020-01-07 -1.795673 -1.071452 0.695752 3.224442 -2.867125 1.923978 \n", "2020-01-08 -1.455186 -0.705272 -0.082883 2.117567 -2.160459 1.026303 \n", "\n", " function_ABC color Flag Flag_to_Colors \n", "2020-01-01 2.091836 yellow True green \n", "2020-01-02 0.125863 yellow False red \n", "2020-01-03 1.282718 yellow True green \n", "2020-01-04 0.182027 yellow False red \n", "2020-01-05 2.015066 yellow True green \n", "2020-01-06 0.968473 yellow False red \n", "2020-01-07 -1.211755 yellow False red \n", "2020-01-08 -1.724354 yellow False red " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Flag_to_Colors'] = df['Flag'].map({True:'green', False:'red'}) \n", "df" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }