8.2.4 データのミューテーション
分析用のデータを準備する際によく行われる操作の1つが、データのミューテーション(変換)です。具体的には、データの書式を変更したり、既存の列から新しい列を導出してデータセットに追加したりします。
次の例では、データの書式設定および列の導出の方法を示します。
import pandas as pd
import oml
# Create a shopping cart data set: one row per item, holding its name,
# its category, the quantity bought and the price per unit.
shopping_cart = pd.DataFrame({
'Item_name': ['paper_towel', 'ground_pork', 'tofu', 'eggs',
'pork_loin', 'whole_milk', 'egg_custard'],
'Item_type': ['grocery', 'meat', 'grocery', 'dairy', 'meat',
'dairy', 'bakery'],
'Quantity': [1, 2.6, 4, 1, 1.9, 1, 1],
'Unit_price': [1.19, 2.79, 0.99, 2.49, 3.19, 2.5, 3.99]
})
# Hand the pandas DataFrame to oml.push; every later step works on the
# object it returns, not on the local pandas DataFrame.
oml_cart = oml.push(shopping_cart)
oml_cart
# Add a column 'Price' multiplying 'Quantity' with 'Unit_price',
# rounded to 2 decimal places.
# The product is element-wise (one value per row). `price` itself keeps
# the unrounded values; rounding is applied only when the column is
# attached below.
price = oml_cart['Quantity']*(oml_cart['Unit_price'])
type(price)
price
# Reassign oml_cart so that 'Price' stays available to the later steps.
oml_cart = oml_cart.concat({'Price': price.round(2)})
# Count the pattern 'egg' in the 'Item_name' column.
egg_pattern = oml_cart['Item_name'].count_pattern('egg')
type(egg_pattern)
# The concat result is only displayed, not assigned, so 'Egg_pattern'
# is not kept in oml_cart. The same holds for the concat calls below.
oml_cart.concat({'Egg_pattern': egg_pattern})
# Find the start index of substring 'pork' in the 'Item_name' column.
# Indices are 0-based; rows that do not contain the substring yield -1.
pork_startInd = oml_cart['Item_name'].find('pork')
type(pork_startInd)
oml_cart.concat({'Pork_startInd': pork_startInd})
# Check whether items are of grocery category.
# The comparison is evaluated per row and yields one True/False per item.
is_grocery=oml_cart['Item_type']=='grocery'
type(is_grocery)
oml_cart.concat({'Is_grocery': is_grocery})
# Calculate the length of item names.
name_length=oml_cart['Item_name'].len()
type(name_length)
oml_cart.concat({'Name_length': name_length})
# Get the ceiling, floor, exponential, logarithm and square root
# of the 'Price' column.
# Each call is applied element-wise to the rounded 'Price' values and is
# only displayed; log() is the natural logarithm.
oml_cart['Price'].ceil()
oml_cart['Price'].floor()
oml_cart['Price'].exp()
oml_cart['Price'].log()
oml_cart['Price'].sqrt()
この例のリスト
>>> import pandas as pd
>>> import oml
>>>
>>> # Create a shopping cart data set.
... shopping_cart = pd.DataFrame({
... 'Item_name': ['paper_towel', 'ground_pork', 'tofu', 'eggs',
... 'pork_loin', 'whole_milk', 'egg_custard'],
... 'Item_type': ['grocery', 'meat', 'grocery', 'dairy', 'meat',
... 'dairy', 'bakery'],
... 'Quantity': [1, 2.6, 4, 1, 1.9, 1, 1],
... 'Unit_price': [1.19, 2.79, 0.99, 2.49, 3.19, 2.5, 3.99]
... })
>>> oml_cart = oml.push(shopping_cart)
>>> oml_cart
Item_name Item_type Quantity Unit_price
0 paper_towel grocery 1.0 1.19
1 ground_pork meat 2.6 2.79
2 tofu grocery 4.0 0.99
3 eggs dairy 1.0 2.49
4 pork_loin meat 1.9 3.19
5 whole_milk dairy 1.0 2.50
6 egg_custard bakery 1.0 3.99
>>>
>>> # Add a column 'Price' multiplying 'Quantity' with 'Unit_price',
... # rounded to 2 decimal places.
... price = oml_cart['Quantity']*(oml_cart['Unit_price'])
>>> type(price)
<class 'oml.core.float.Float'>
>>> price
[1.19, 7.254, 3.96, 2.49, 6.061, 2.5, 3.99]
>>> oml_cart = oml_cart.concat({'Price': price.round(2)})
>>>
>>> # Count the pattern 'egg' in the 'Item_name' column.
... egg_pattern = oml_cart['Item_name'].count_pattern('egg')
>>> type(egg_pattern)
<class 'oml.core.float.Float'>
>>> oml_cart.concat({'Egg_pattern': egg_pattern})
Item_name Item_type Quantity Unit_price Price Egg_pattern
0 paper_towel grocery 1.0 1.19 1.19 0
1 ground_pork meat 2.6 2.79 7.25 0
2 tofu grocery 4.0 0.99 3.96 0
3 eggs dairy 1.0 2.49 2.49 1
4 pork_loin meat 1.9 3.19 6.06 0
5 whole_milk dairy 1.0 2.50 2.50 0
6 egg_custard bakery 1.0 3.99 3.99 1
>>>
>>> # Find the start index of substring 'pork' in the 'Item_name' column.
... pork_startInd = oml_cart['Item_name'].find('pork')
>>> type(pork_startInd)
<class 'oml.core.float.Float'>
>>> oml_cart.concat({'Pork_startInd': pork_startInd})
Item_name Item_type Quantity Unit_price Price Pork_startInd
0 paper_towel grocery 1.0 1.19 1.19 -1
1 ground_pork meat 2.6 2.79 7.25 7
2 tofu grocery 4.0 0.99 3.96 -1
3 eggs dairy 1.0 2.49 2.49 -1
4 pork_loin meat 1.9 3.19 6.06 0
5 whole_milk dairy 1.0 2.50 2.50 -1
6 egg_custard bakery 1.0 3.99 3.99 -1
>>>
>>> # Check whether items are of grocery category.
... is_grocery=oml_cart['Item_type']=='grocery'
>>> type(is_grocery)
<class 'oml.core.boolean.Boolean'>
>>> oml_cart.concat({'Is_grocery': is_grocery})
Item_name Item_type Quantity Unit_price Price Is_grocery
0 paper_towel grocery 1.0 1.19 1.19 True
1 ground_pork meat 2.6 2.79 7.25 False
2 tofu grocery 4.0 0.99 3.96 True
3 eggs dairy 1.0 2.49 2.49 False
4 pork_loin meat 1.9 3.19 6.06 False
5 whole_milk dairy 1.0 2.50 2.50 False
6 egg_custard bakery 1.0 3.99 3.99 False
>>>
>>> # Calculate the length of item names.
... name_length=oml_cart['Item_name'].len()
>>> type(name_length)
<class 'oml.core.float.Float'>
>>> oml_cart.concat({'Name_length': name_length})
Item_name Item_type Quantity Unit_price Price Name_length
0 paper_towel grocery 1.0 1.19 1.19 11
1 ground_pork meat 2.6 2.79 7.25 11
2 tofu grocery 4.0 0.99 3.96 4
3 eggs dairy 1.0 2.49 2.49 4
4 pork_loin meat 1.9 3.19 6.06 9
5 whole_milk dairy 1.0 2.50 2.50 10
6 egg_custard bakery 1.0 3.99 3.99 11
>>>
>>> # Get the ceiling, floor, exponential, logarithm and square root
... # of the 'Price' column.
... oml_cart['Price'].ceil()
[2, 8, 4, 3, 7, 3, 4]
>>> oml_cart['Price'].floor()
[1, 7, 3, 2, 6, 2, 3]
>>> oml_cart['Price'].exp()
[3.2870812073831184, 1408.1048482046956, 52.45732594909905, 12.061276120444719, 428.37543685928694, 12.182493960703473, 54.05488936332659]
>>> oml_cart['Price'].log()
[0.173953307123438, 1.9810014688665833, 1.3762440252663892, 0.9122827104766162, 1.801709800081223, 0.9162907318741551, 1.3837912309017721]
>>> oml_cart['Price'].sqrt()
[1.0908712114635715, 2.692582403567252, 1.98997487421324, 1.57797338380595, 2.4617067250182343, 1.5811388300841898, 1.997498435543818]
親トピック: データの探索