Tuesday, January 24, 2017

LSTM, Baseline

I ran the code shared here, which does time series prediction: based on 49 values, it predicts the 50th. A simplified version is below. The MSE was 0.07.

I also created a baseline, a "predictor" that simply takes X_t to be X_{t-1}. So if energy use was 10 today, the prediction for the next day is also 10. That is the simplest predictor there is.

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Persistence baseline: predict X_t = X_{t-1} and report the mean squared error.
# '?' marks a missing reading in the file; parse it as NaN and drop those rows
# before shifting, so each prediction uses the previous valid reading.
df = pd.read_csv('household_power_consumption.txt', sep=';',
                 usecols=['Global_active_power'], na_values='?')
df = df.dropna().astype(float)
df['G2'] = df['Global_active_power'].shift(1)
df['err'] = np.power(df['G2'] - df['Global_active_power'], 2)
# The first row has no predecessor, so its error is NaN. mean() skips it and
# divides by the number of actual predictions instead of by len(df).
print(df['err'].mean())

I also get an MSE of 0.07 from this. In ML it always helps to compare a model to a baseline. This is not to say the LSTM code is doing nothing; perhaps the model can be improved, etc.

import matplotlib.pyplot as plt
import numpy as np, time, csv
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
np.random.seed(1234)  # fixed seed so the training-set shuffle is reproducible

seq = 50  # window length: seq - 1 input values followed by the value to predict


def data_power_consumption(path_to_dataset, sequence_length=seq, ratio=1.0,
                           total_values=2049280):
    """Read the power series and cut it into mean-centred train/test windows.

    The file is parsed as ';'-separated text and column index 2 is taken as
    the series. Rows whose value cannot be parsed as a float (for example a
    header line or a '?' placeholder) and rows with fewer than three fields
    are skipped.

    Args:
        path_to_dataset: path of the ';'-separated data file.
        sequence_length: length of each sliding window; the first
            ``sequence_length - 1`` values are the input, the last one is
            the target.
        ratio: fraction of ``total_values`` to read before stopping.
        total_values: number of values that ``ratio`` is applied to.

    Returns:
        ``[X_train, y_train, X_test, y_test]``. The first 90% of the windows
        (shuffled in place) form the training set and the rest the test set.
        ``X_*`` has shape ``(n, sequence_length - 1, 1)`` and ``y_*`` has
        shape ``(n,)``. The mean over all windows is subtracted from every
        value.

    Raises:
        ValueError: if too few values were read to build a single window.
    """
    max_values = ratio * total_values
    power = []
    with open(path_to_dataset) as f:
        for line in csv.reader(f, delimiter=";"):
            try:
                power.append(float(line[2]))
            except (ValueError, IndexError):
                # Deliberate best-effort: non-numeric or short rows are skipped.
                pass
            if len(power) >= max_values:
                break

    print("Data loaded from csv. Formatting...")

    n_windows = len(power) - sequence_length
    if n_windows <= 0:
        raise ValueError(
            "need more than %d values to build a window, got %d"
            % (sequence_length, len(power)))

    # One row per window start; shape (n_windows, sequence_length).
    result = np.array([power[start:start + sequence_length]
                       for start in range(n_windows)])

    result_mean = result.mean()
    result -= result_mean
    # Formatted so the output is the same under Python 2 and Python 3.
    print("Shift :  %s" % result_mean)
    print("Data  :  %s" % (result.shape,))

    # round() returns a float on Python 2, which is not a valid slice index.
    row = int(round(0.9 * result.shape[0]))
    train = result[:row, :]
    np.random.shuffle(train)  # shuffles the training rows only, in place
    X_train = train[:, :-1]
    y_train = train[:, -1]
    X_test = result[row:, :-1]
    y_test = result[row:, -1]

    # Add a trailing feature axis of size 1.
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    return [X_train, y_train, X_test, y_test]


# Module-level load: ratio is passed to data_power_consumption, which uses it
# to cap how many values are read from the file.
# NOTE(review): run_network() below calls data_power_consumption() again with
# the same path, window length and ratio and works on its own local results,
# so within this file these module-level arrays look unused. Confirm before
# removing this duplicate load.
ratio = 0.5
path_to_dataset = 'household_power_consumption.txt'
X_train, y_train, X_test, y_test = data_power_consumption(path_to_dataset, seq, ratio)

def build_model():
    """Assemble and compile the stacked-LSTM regression network.

    The stack is: an LSTM with ``seq`` units that returns the full sequence,
    dropout of 0.2, an LSTM with 100 units that returns only its last output,
    dropout of 0.2, then a single-unit dense layer with a linear activation.
    The model is compiled with MSE loss and the rmsprop optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    stack = [
        LSTM(input_dim=1, output_dim=seq, return_sequences=True),
        Dropout(0.2),
        LSTM(100, return_sequences=False),
        Dropout(0.2),
        Dense(output_dim=1),
        Activation("linear"),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.compile(loss="mse", optimizer="rmsprop")
    return net

def run_network(model=None, data=None):
    """Train the network for one epoch, predict on the test set and plot.

    Args:
        model: an already built model to train. When ``None`` a new one is
            created with ``build_model()``.
        data: a ``(X_train, y_train, X_test, y_test)`` sequence. When
            ``None`` the data is loaded from
            ``'household_power_consumption.txt'`` with window length ``seq``
            and ratio 0.5.

    Returns:
        ``(model, y_test, predicted)`` where ``predicted`` is the model
        output on ``X_test`` flattened to one dimension.

    Side effects:
        Writes the first 100 test targets and predictions to
        ``'test_01.png'``.
    """
    print('Loading data... ')
    if data is None:
        X_train, y_train, X_test, y_test = data_power_consumption(
            'household_power_consumption.txt', seq, 0.5)
    else:
        # Honour caller-supplied data instead of silently reloading the file.
        X_train, y_train, X_test, y_test = data
    # Formatted so the output is the same under Python 2 and Python 3.
    print("%s %s" % (X_train.shape, X_test.shape))

    if model is None:
        model = build_model()
    model.fit(X_train, y_train, batch_size=512, nb_epoch=1, validation_split=0.05)

    predicted = model.predict(X_test)
    predicted = np.reshape(predicted, (predicted.size,))

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(y_test[:100])
    ax.plot(predicted[:100])
    fig.savefig('test_01.png')
    plt.close(fig)  # release the figure so repeated calls do not accumulate
    return model, y_test, predicted


# Script entry point: loads the data, trains a fresh model for one epoch and
# writes the comparison plot to test_01.png. The returned values are discarded.
run_network()