Source code for features.build_features

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# get the right column indices: safer than hard-coding indices 3, 4, 5, 6
def get_indices(filename='data/raw/housing.csv'):
    """Return the column positions of the four count columns used for ratios.

    Only the header of the CSV is parsed (``nrows=0``), so no data rows are
    loaded. The positions are looked up by column name rather than being
    hard-coded.

    Parameters
    ----------
    filename : str, optional
        Location of the CSV whose header is inspected. It is passed straight
        to ``pandas.read_csv``. Defaults to ``'data/raw/housing.csv'``.

    Returns
    -------
    list of int
        Zero-based positions of ``total_rooms``, ``total_bedrooms``,
        ``population`` and ``households``, in that order.

    Raises
    ------
    ValueError
        If one or more of the four columns is absent from the header.
    """
    wanted = ("total_rooms", "total_bedrooms", "population", "households")
    header = list(pd.read_csv(filename, nrows=0))

    # Report every missing column at once, together with the source, instead
    # of the bare "x is not in list" message that list.index would raise.
    missing = [name for name in wanted if name not in header]
    if missing:
        raise ValueError(
            "column(s) {} not found in the header of {!r}".format(
                missing, filename))

    return [header.index(name) for name in wanted]
def add_extra_features(X, add_bedrooms_per_room=True):
    """Append per-household (and optionally per-room) ratio columns to X.

    The columns holding rooms, bedrooms, population and households are
    located by calling ``get_indices()`` with its default argument, which
    reads the header of ``'data/raw/housing.csv'`` on every call. The column
    order of ``X`` is therefore assumed to match that file's header.

    Parameters
    ----------
    X : numpy.ndarray
        Two-dimensional array indexed as ``X[:, column]``.
    add_bedrooms_per_room : bool, optional
        When true (the default), a ``bedrooms_per_room`` column is appended
        after the two per-household columns.

    Returns
    -------
    numpy.ndarray
        ``X`` with ``rooms_per_household`` and ``population_per_household``
        appended as new trailing columns, followed by ``bedrooms_per_room``
        when requested.
    """
    rooms_ix, bedrooms_ix, population_ix, household_ix = get_indices()

    # Slice the columns used by more than one ratio only once.
    rooms = X[:, rooms_ix]
    households = X[:, household_ix]

    rooms_per_household = rooms / households
    population_per_household = X[:, population_ix] / households

    if not add_bedrooms_per_room:
        return np.c_[X, rooms_per_household, population_per_household]

    bedrooms_per_room = X[:, bedrooms_ix] / rooms
    return np.c_[X, rooms_per_household, population_per_household,
                 bedrooms_per_room]