See More Code at my Github:

______________________________________________________________________
Below is a brief demonstration of how I implemented Python, Matlab, and R on three large data projects ranging from 200 observations to half a million. The first is an automated process that scans Department of Energy Green Button energy files. The second implemented in Matlab models 200 electric vehicles on the island of Lanai. The last in R shows my methodology for a targeted marketing statistical model that identifies the statistically significant parameter in our target market for electric vehicles.
______________________________________________________________________

Users submitted zipped and unzipped energy and gas 15 minute, hourly, and monthly data. I want just the energy data and not the gas. I also want to take the 15 minute data that exists and resample to hourly data using Python. The modules that I use to extract this XML data and place the results into one spreadsheet include Numpy for array handling, Pandas for dataframe structures and resampling, etree for xml reading, regex for regular expressions, and many others.
__author__ = 'john.horn'
from GreenButtonGrap import *
FolderWithZips = FolderContainingZippedGreenButtonFiles
OutputPathUnzip = FolderContainingUnzippedFiles
XMLSummaryDataPath = FolderPathToContainSummerizedGreenButtonData
XMLT_Location = FilePathContainingGreenButtonXLSTFile
# Unzip the files and move all unzipped and apriori unzipped xmls to the destination folder
#Also rename the xml unzipped files so that it contains the applicant ID #
DeleteContentsOfOutputFolder(OutputPathUnzip)
UnzipFilesInFolder(FolderWithZips,OutputPathUnzip)
MoveXMLInFolder(FolderWithZips,OutputPathUnzip)
#Loop through each xml file and find the interval data we are interested in
GreenButtonXML(OutputPathUnzip,XMLSummaryDataPath,XMLT_Location)
# INDIVIDUAL MODULES TO PROCESS ZIPPED GREENBUTTON FILES
#Overwrite Previous Results
def DeleteContentsOfOutputFolder(OutputFolderContainUnzip):
import os
for the_file in os.listdir(OutputFolderContainUnzip):
file_path = os.path.join(OutputFolderContainUnzip, the_file)
try:
if os.path.isfile(file_path):
os.unlink(file_path)
except Exception, e:
print e
#Unzip XML Files
def UnzipFilesInFolder(FolderContainZips, OutputFolderContainUnzip):
import zipfile
import os
import re
RenameList = {}
if not os.path.exists(OutputFolderContainUnzip):
os.makedirs(OutputFolderContainUnzip)
filename = os.listdir(FolderContainZips)
for file in filename:
fileName, fileExtension = os.path.splitext(file)
if str(fileExtension) == '.zip':
CompleteFile = FolderContainZips + str(file)
fh = open(CompleteFile, 'rb')
z = zipfile.ZipFile(fh)
SurveyId = re.search(r'(\w+\-\w+\-)',str(file))
SurveyId = SurveyId.group(1)
#print SurveyId
for name in z.namelist():
newName = SurveyId + name
RenameList[name] = newName
#print name
#print newName
outpath = OutputFolderContainUnzip
z.extract(name, outpath)
fh.close()
UnzipFiles = os.listdir(OutputFolderContainUnzip)
os.chdir(str(OutputFolderContainUnzip))
for file in UnzipFiles:
if file in RenameList:
MyRename = str(RenameList[file])
OldFile = str(file)
OldFile = os.path.join(OutputFolderContainUnzip, OldFile)
MyRename = os.path.join(OutputFolderContainUnzip, MyRename)
os.rename(OldFile,MyRename)
#Move XML Files
def MoveXMLInFolder(FolderContainsXML,OutputFolderContainXML):
import os
from shutil import copyfile
if not os.path.exists(OutputFolderContainXML):
os.makedirs(OutputFolderContainXML)
filename = os.listdir(FolderContainsXML)
for file in filename:
fileName, fileExtension = os.path.splitext(file)
if str(fileExtension) == '.xml':
CompleteFile = FolderContainsXML + str(file)
copyfile(CompleteFile, OutputFolderContainXML + str(file))
#Extract the XML using etree
def transform(xmlPath, xslPath):
from lxml import etree
import re
# read xsl file
xslRoot = etree.fromstring(open(xslPath).read())
transform = etree.XSLT(xslRoot)
# read xml
xmlRoot = etree.fromstring(open(xmlPath).read())
# transform xml with xslt
transRoot = transform(xmlRoot)
# return transformation result
SearchText = str(etree.tostring(transRoot))
StartTime = re.findall(r'
<td>(\w+-\w+-\w+\s\w+:\w+)',SearchText)
EndTime = re.findall(r'</span>(\w+-\w+-\w+\s\w+:\w+)',SearchText)
value = re.findall(r'"right">([+-]?\d+)</td>
',SearchText)
return StartTime, value
#if __name__ == '__main__':
# print(transform('./s0101m.mul0.xml', './tipitaka-latn.xsl'))
#Main Module to sort the Gas and Energy Green Button files and extract data from XML
def GreenButtonXML(InputFolderWithXML,OutputFolderWithSummaryData,XMLT_Location):
import os
import sys
import re
import numpy as np
from pandas import *
#import pandas as pd
from datetime import datetime
import collections
if not os.path.exists(OutputFolderWithSummaryData):
os.makedirs(OutputFolderWithSummaryData)
MyXmlFiles = os.listdir(InputFolderWithXML)
Counter = 0
a = {}
#print len(MyXmlFiles)
Denom = len(MyXmlFiles)
for xmlFile in MyXmlFiles:
Counter += 1
print str(100*Counter/Denom) + "% Complete"
#print Counter
print xmlFile
if (re.search('Gas', xmlFile)) or (os.path.getsize(InputFolderWithXML + str(xmlFile)) == 0):
GasRecord = True
else:
GasRecord = False
if GasRecord == False:
if xmlFile.endswith('.xml'):
try:
NewOutput = transform(xmlFile, XMLT_Location)
StartTime = NewOutput[0]
WattHours = NewOutput[1]
ClientID = re.search(r'(\d+)-', xmlFile)
deleteList = []
for dup in sorted(list_duplicates(StartTime)):
deleteList.append(dup[1][0])
#Ideally you would take the 15 min interval over the 1 hour time step
#Need to find duplicates in the StartTime
#Take the smaller of the Watthours as this is probably the 15 min interval
deleteList = [int(i) for i in deleteList]
deleteList.sort(reverse=True)
for deleteCount in deleteList:
del StartTime[deleteCount]
del WattHours[deleteCount]
print(WattHours)
print(StartTime)
a[str(ClientID.group(1))] = Series(WattHours, index=StartTime)
df = DataFrame(a)
except AssertionError:
print "error at ID: " + ClientID.group(1)
df.to_csv(OutputFolderWithSummaryData + 'GBSummarized.csv')
from collections import defaultdict
def list_duplicates(seq):
tally = defaultdict(list)
for i,item in enumerate(seq):
tally[item].append(i)
return ((key,locs) for key,locs in tally.items()
if len(locs)>1)
______________________________________________________________________

The goal of this project was to model 200 electric vehicles on the island of Lanai, Hawaii and optimize the solar panel capacity, # chargers, and battery storage to support the vehicles. Ridership was randomized and an ensemble or random models was created to arrive at average conditions over many runs.
VehiclesInTrip = 4;
NumberOfDays = 1;
CityBatteryCapacity = 0;
ResortBatteryCapacity = 0;
NumChargersResort = 40;
NumChargersCity = 40;
SolarCapacity = 175;
%RELEASE THE CARS!
for Count3 = 1:length(Junction_Table(:,3))
%Release Resort Cars
if Junction_Table(Count3,3) == 1
for Quantity1 = 1:Junction_Table(Count3,2)
% Ask if there is an available car in the Resort if
% not skip that for loop
if length(Lanai_Resort(:,1)) == 0
Overbooked_Resort(Quarter) = Overbooked_Resort(Quarter) + 1;
continue
end
Lanai_Resort(length(Lanai_Resort(:,1)),2) = Lanai_Resort(length(Lanai_Resort(:,1)),2) - Junction_Table(Count3,4);
Lanai_Resort(length(Lanai_Resort(:,1)),4) = Lanai_Resort(length(Lanai_Resort(:,1)),4) + Junction_Table(Count3,6);
%Switching Locations
if Junction_Table(Count3,5) == 1
Lanai_Resort(length(Lanai_Resort(:,1)),3) = 0;
else
Lanai_Resort(length(Lanai_Resort(:,1)),3) = 1;
end
CarsDriving = [CarsDriving; Lanai_Resort(length(Lanai_Resort(:,1)),:)];
Lanai_Resort(length(Lanai_Resort(:,1)),:) = [];
%Possible skip location
end
else
%Release City Cars
for Quantity2 = 1:Junction_Table(Count3,2)
% Ask if there is an available car in the Resort if
% not skip that for loop
if length(Lanai_City(:,1)) == 0
Overbooked_City(Quarter) = Overbooked_City(Quarter) + 1;
continue
end
Lanai_City(length(Lanai_City(:,1)),2) = Lanai_City(length(Lanai_City(:,1)),2) - Junction_Table(Count3,4);
Lanai_City(length(Lanai_City(:,1)),4) = Lanai_City(length(Lanai_City(:,1)),4) + Junction_Table(Count3,6);
%Switching Locations
if Junction_Table(Count3,5) == 1
Lanai_City(length(Lanai_City(:,1)),3) = 1;
else
Lanai_City(length(Lanai_City(:,1)),3) = 0;
end
CarsDriving = [CarsDriving; Lanai_City(length(Lanai_City(:,1)),:)];
Lanai_City(length(Lanai_City(:,1)),:) = [];
%Possible skip location
end
end
end
%RELEASE THE CARS!
end
%RECEIVE THE CARS!
CarsAtResort(Quarter) = length(Lanai_Resort(:,1));
CarsAtCity(Quarter) = length(Lanai_City(:,1));
CarsOutDriving(Quarter) = length(CarsDriving(:,1));
%for ReceiveSkim = 1:length(CarsDriving(:,1))
CurrentRow = length(CarsDriving(:,1)) + 1;
while CurrentRow ~= 1
CurrentRow = CurrentRow - 1;
if Quarter == CarsDriving(CurrentRow,4)
%fprintf('the quarter %d \n', Quarter)
if CarsDriving(CurrentRow,3) == 0
City_Charging(Quarter) = City_Charging(Quarter) + (CarBatteryCapacity - CarsDriving(CurrentRow,2));
%ASSUMPTION INSTANT CHARGING
% CarsDriving(CurrentRow,2) = 24;
Lanai_City = [CarsDriving(CurrentRow,:); Lanai_City];
CarsDriving(CurrentRow,:) = [];
else
Resort_Charging(Quarter) = Resort_Charging(Quarter) + (CarBatteryCapacity - CarsDriving(CurrentRow,2));
%ASSUMPTION INSTANT CHARGING
% CarsDriving(CurrentRow,2) = 24;
Lanai_Resort = [CarsDriving(CurrentRow,:); Lanai_Resort];
CarsDriving(CurrentRow,:) = [];
end
end
end
%RECEIVE THE CARS!
%SORT THE VEHICLES (WE WANT TO SORT THE VEHICLES EVERY 15 MINUTES
Lanai_Resort = sortrows(Lanai_Resort,2);
Lanai_City = sortrows(Lanai_City,2);
%SORT THE VEHICLES
%CHARGE THE VEHICLES
ChargerResortEffective = NumChargersResort;
ChargerCityEffective = NumChargersCity;
%Charge Resort Vehicles
if length(Lanai_Resort(:,2)) < NumChargersResort
ChargerResortEffective = length(Lanai_Resort(:,2));
end
ChargingQuantResort = Insol_kWh2(Quarter)/ChargerResortEffective;
if ChargingQuantResort > 1.875
ChargingQuantResort = 1.875;
%fprintf('going over resort max at Q: %f \n', Quarter)
end
for CounterChargeResort = 1:ChargerResortEffective
ChargeResortDeficit = CarBatteryCapacity - Lanai_Resort(CounterChargeResort,2);
if ChargingQuantResort > ChargeResortDeficit
WastedEResortSupplyDemand(Quarter) = WastedEResortSupplyDemand(Quarter)+ (ChargingQuantResort - ChargeResortDeficit);
EnergyDelivered = ChargeResortDeficit;
%disp('testForErrorHereResort1')
testForErrorHereResort1 = 1;
else
EnergyDelivered = ChargingQuantResort;
end
Used_Energy_Resort(Quarter) = Used_Energy_Resort(Quarter) + EnergyDelivered;
LeftOverEResort(Quarter) = LeftOverEResort(Quarter) - EnergyDelivered;
Lanai_Resort(CounterChargeResort,2) = Lanai_Resort(CounterChargeResort,2) + ChargingLoss*EnergyDelivered;
end
%Charge City Vehicles
if length(Lanai_City(:,2)) < NumChargersCity
ChargerCityEffective = length(Lanai_City(:,2));
end
ChargingQuantCity = Insol_kWh2(Quarter)/ChargerCityEffective;
if ChargingQuantCity > 1.875
ChargingQuantCity = 1.875;
%fprintf('going over city max at Q: %f \n', Quarter)
end
for CounterChargeCity = 1:ChargerCityEffective
ChargeCityDeficit = CarBatteryCapacity - Lanai_City(CounterChargeCity,2);
if ChargingQuantCity > ChargeCityDeficit
WastedECitySupplyDemand(Quarter) = WastedECitySupplyDemand(Quarter) + (ChargingQuantCity - ChargeCityDeficit);
EnergyDelivered = ChargeCityDeficit;
%disp('testForErrorHereCity1')
testForErrorHereCity1 = 1;
else
EnergyDelivered = ChargingQuantCity;
end
Used_Energy_City(Quarter) = Used_Energy_City(Quarter) + EnergyDelivered;
LeftOverECity(Quarter) = LeftOverECity(Quarter) - EnergyDelivered;
Lanai_City(CounterChargeCity,2) = Lanai_City(CounterChargeCity,2) + ChargingLoss*EnergyDelivered;
end
%CHARGE THE Local Batteries
if Quarter > 1
CityBatteryLevel(Quarter) = CityBatteryLevel(Quarter - 1) + StationaryChargingLoss*LeftOverECity(Quarter);
end
if CityBatteryLevel(Quarter) > CityBatteryCapacity
CityBatteryLevel(Quarter) = CityBatteryCapacity;
end
if (CityBatteryCapacity - CityBatteryLevel(Quarter)) < LeftOverECity(Quarter)
LeftOverECity(Quarter) = LeftOverECity(Quarter) - (CityBatteryCapacity - CityBatteryLevel(Quarter));
else
LeftOverECity(Quarter) = 0;
end
if Quarter > 1
ResortBatteryLevel(Quarter) = ResortBatteryLevel(Quarter - 1) + StationaryChargingLoss*LeftOverEResort(Quarter);
end
if ResortBatteryLevel(Quarter) > ResortBatteryCapacity
ResortBatteryLevel(Quarter) = ResortBatteryCapacity;
end
if (ResortBatteryCapacity - ResortBatteryLevel(Quarter)) < LeftOverEResort(Quarter)
LeftOverEResort(Quarter) = LeftOverEResort(Quarter) - (ResortBatteryCapacity - ResortBatteryLevel(Quarter));
else
LeftOverEResort(Quarter) = 0;
end
%CHARGE VEHICLES AT NIGHT
DayScaler = floor(Quarter/97);
QuarterTest = Quarter - (96*DayScaler);
%ONLY AT NIGHT
if 84 < (QuarterTest) | (QuarterTest) < 24
ChargerResortEffective = NumChargersResort;
ChargerCityEffective = NumChargersCity;
%Charge City Vehicles
if length(Lanai_City(:,2)) < NumChargersCity
ChargerCityEffective = length(Lanai_City(:,2));
end
ChargingQuantCity = CityBatteryLevel(Quarter)/ChargerCityEffective;
if ChargingQuantCity > 1.875
ChargingQuantCity = 1.875;
end
for CounterChargeCity = 1:ChargerCityEffective
ChargeCityDeficit = CarBatteryCapacity - Lanai_City(CounterChargeCity,2);
if ChargingQuantCity > ChargeCityDeficit
EnergyDelivered = ChargeCityDeficit;
%fprintf('if %f \n', EnergyDelivered)
else
EnergyDelivered = ChargingQuantCity;
%fprintf('else %f \n', EnergyDelivered)
end
Used_Energy_City(Quarter) = Used_Energy_City(Quarter) + EnergyDelivered;
CityBatteryLevel(Quarter) = CityBatteryLevel(Quarter) - EnergyDelivered;
Lanai_City(CounterChargeCity,2) = Lanai_City(CounterChargeCity,2) + ChargingLoss*EnergyDelivered;
end
%disp(EnergyDelivered)
%fprintf('energy delivered city: %f \n', EnergyDelivered)
%fprintf('charger quant city and resort: %f & %f \n', ChargingQuantCity, ChargingQuantResort)
%Charge Resort Vehicles
if length(Lanai_Resort(:,2)) < NumChargersResort
ChargerResortEffective = length(Lanai_Resort(:,2));
end
ChargingQuantResort = ResortBatteryLevel(Quarter)/ChargerResortEffective;
if ChargingQuantResort > 1.875
ChargingQuantResort = 1.875;
end
for CounterChargeResort = 1:ChargerResortEffective
ChargeResortDeficit = CarBatteryCapacity - Lanai_Resort(CounterChargeResort,2);
if ChargingQuantResort > ChargeResortDeficit
EnergyDelivered = ChargeResortDeficit;
else
EnergyDelivered = ChargingQuantResort;
end
Used_Energy_Resort(Quarter) = Used_Energy_Resort(Quarter) + EnergyDelivered;
ResortBatteryLevel(Quarter) = ResortBatteryLevel(Quarter) - EnergyDelivered;
Lanai_Resort(CounterChargeResort,2) = Lanai_Resort(CounterChargeResort,2) + ChargingLoss*EnergyDelivered;
end
end
end
TotalEUsed(Repeat) = (sum(Used_Energy_Resort) + sum(Used_Energy_City));
TotalEWastedFromVehicleSat = (sum(LeftOverECity) + sum(LeftOverEResort) - sum(WastedEResortSupplyDemand) - sum(WastedECitySupplyDemand));
TotalEWastedFromChargingSat = sum(WastedEResortSupplyDemand) + sum(WastedECitySupplyDemand);
TotalEWasted(Repeat) = TotalEWastedFromVehicleSat + TotalEWastedFromChargingSat;
LeftOverECityFormatted(Repeat) = sum(LeftOverECity) - sum(WastedECitySupplyDemand);
LeftOverEResortFormatted(Repeat) = sum(LeftOverEResort) - sum(WastedEResortSupplyDemand);
WastedECityChargerRun(Repeat) = sum(WastedECitySupplyDemand);
WastedEResortChargerRun(Repeat) = sum(WastedEResortSupplyDemand);
TripCountLog(Repeat) = Counterooni;
end
figure;
x = linspace(1, NumberOfDays*NumOfQuarters/4,length(Resort_Charging));
plot(x,Resort_Charging, 'LineWidth', 2, 'Color', 'green');
title('Resort Charging Demand');
xlabel('Time(hour)');
ylabel('kwh demand');
hold on
title(sprintf('EV Load versus Time; %d Vehicles/Hour; Two %d kW Panels',4*VehiclesInTrip,SolarCapacity));
xlabel('Time(hour)');
ylabel('kWh Demand');
plot(x,City_Charging, 'LineWidth', 2, 'Color', 'blue')
plot(Insol_kWh_Display, 'LineWidth', 2, 'Color', 'red')
h1eg1 = legend('Resort', 'City', 'Ave. Insolation', 'Location', 'NorthEast');
h_xlabel = get(gca,'XLabel');
set(h_xlabel,'FontSize',14);
h_ylabel = get(gca,'YLabel');
set(h_ylabel,'FontSize',14);
h_Title = get(gca,'Title');
set(h_Title,'FontSize',14);
hold off
figure(2);
x = linspace(1, NumberOfDays*NumOfQuarters/4,length(Resort_Charging));
plot(x, CarsAtResort, 'LineWidth', 2, 'Color', 'green')
hold on
plot(x,CarsAtCity, 'LineWidth', 2, 'Color', 'blue')
plot(x,CarsOutDriving, 'LineWidth', 2, 'Color', 'black')
%plot(x,Overbooked_City, 'LineWidth', 2, 'Color', 'red')
%plot(x,Overbooked_Resort, 'LineWidth', 2, 'Color', 'magenta')
title(sprintf('Vehicle Tracking; %d Vehicles/Hour; Two %d kW Panels',4*VehiclesInTrip, SolarCapacity));
xlabel('Time(hour)');
ylabel('# Vehicles');
h1eg1 = legend('Resort', 'City','Driving','Location', 'NorthEast');
%h1eg1 = legend('Resort', 'City','Driving','Resort Overbooked', 'City Overbooked', 'Location', 'NorthEast');
h_xlabel = get(gca,'XLabel');
set(h_xlabel,'FontSize',14);
h_ylabel = get(gca,'YLabel');
set(h_ylabel,'FontSize',14);
h_Title = get(gca,'Title');
set(h_Title,'FontSize',14);
hold off
figure(3);
x = linspace(1, NumberOfDays*NumOfQuarters/4,length(Resort_Charging));
plot(x,Used_Energy_City, 'LineWidth', 2, 'Color', 'blue')
hold on
plot(x,Used_Energy_Resort, 'LineWidth', 2, 'Color', 'green')
title(sprintf('Solar & Battery Energy Used to Charge Vehicles(kWh); %d Vehicles/Hour; Two %d kW Panels',4*VehiclesInTrip, SolarCapacity));
xlabel('Time(hour)');
ylabel('Energy (kWh)');
h1eg1 = legend('City', 'Resort', 'Location', 'NorthEast');
h_xlabel = get(gca,'XLabel');
set(h_xlabel,'FontSize',14);
h_ylabel = get(gca,'YLabel');
set(h_ylabel,'FontSize',14);
h_Title = get(gca,'Title');
set(h_Title,'FontSize',14);
LeftOverECityFormatted = LeftOverECity - WastedECitySupplyDemand;
LeftOverEResortFormatted = LeftOverEResort - WastedEResortSupplyDemand;
figure(4)
hold off
plot(x,LeftOverECityFormatted, 'LineWidth', 2, 'Color', 'blue');
title(sprintf('Energy Not Captured by Vehicles and Batteries(kWh); %d Vehicles/Hour, Two %d kW Panels',4*VehiclesInTrip, SolarCapacity));
xlabel('Time(hour)');
ylabel('Energy (kWh)');
hold on
plot(x,LeftOverEResortFormatted, 'LineWidth', 2, 'Color', 'green');
plot(x,WastedECitySupplyDemand, 'LineWidth', 2, 'Color', [0 0 0.5]);
plot(x,WastedEResortSupplyDemand, 'LineWidth', 2, 'Color', [0 0.7 0]);
h1eg1 = legend('City Charger Saturation', 'Resort Charger Saturation', 'City Vehicle Saturation', 'Resort Vehicle Saturation', 'Location', 'NorthWest');
h_xlabel = get(gca,'XLabel');
set(h_xlabel,'FontSize',14);
h_ylabel = get(gca,'YLabel');
set(h_ylabel,'FontSize',14);
h_Title = get(gca,'Title');
set(h_Title,'FontSize',14);
% figure(5)
% plot(x, round(ResortBatteryLevel), 'LineWidth', 2, 'Color', 'green');
% hold on
% title(sprintf('Stationary Battery Levels; %d Vehicles/Hour, Two %d kW Panels',4*VehiclesInTrip, SolarCapacity));
% xlabel('Time(hour)');
% ylabel('Energy Stored (kWh)');
% plot(x, round(CityBatteryLevel), 'LineWidth', 2, 'Color', 'blue')
% h1eg1 = legend('Resort', 'City', 'Location', 'NorthEast');
%
% hold off
%
% %CityMean(Repeat) = mean(Lanai_City(:,2));
% %ReportMean(Repeat) = mean(Lanai_Resort(:,2));
%
%
% %mean(CityMean)
% %mean(ReportMean)
fprintf('Two %dkW Systems \n',SolarCapacity)
fprintf('%d Charging Stations at each site \n',NumChargersResort)
fprintf('Average Day %d Run over %d cycles \n', NumberOfDays,CycleRepeats)
fprintf('Mean Battery Level at City: %f (kWh) \n',mean(Lanai_City(:,2)))
fprintf('Mean Battery Level at Resort: %f (kWh) \n',mean(Lanai_Resort(:,2)))
% % % % Resort_Fit = polyfit(x,Resort_Charging,1);
% % % % City_Fit = polyfit(x,City_Charging,1);
% % % % fprintf('Slope Resort: %f \n', Resort_Fit(1))
% % % % fprintf('Slope City: %f \n', City_Fit(1))
fprintf('total Solar Energy / day: %f \n',(2*sum(Insol_kWh2))/NumberOfDays)
fprintf('total energy used / day: %f \n',mean(TotalEUsed)/NumberOfDays)
fprintf('total energy wasted / day: %f \n',mean(TotalEWasted)/NumberOfDays)
fprintf('Lossed %f kWh/day due to Charging Station saturation \n', mean((LeftOverECityFormatted + LeftOverEResortFormatted)/NumberOfDays))
fprintf('Lossed %f kWh/day due to Vehicle Battery saturation \n', mean((WastedEResortChargerRun + WastedECityChargerRun)/NumberOfDays))
fprintf('Mean Trip Count/day: %f +/- %f \n', mean(TripCountLog/NumberOfDays), std(TripCountLog/NumberOfDays))
%LeftOverECityFormatted
%LeftOverEResortFormatted
%WastedEResortChargerRun
%WastedECityChargerRun
% fprintf('total energy wasted + used: %f \n', TotalEUsed + TotalEWasted)
% fprintf('total energy in Resort battery: %f, \n', sum(ResortBatteryLevel))
% fprintf('total energy in City battery: %f, \n', sum(CityBatteryLevel))
%
% if (round(sum(TotalEWasted + TotalEUsed)) == round(2*sum(Insol_kWh2)))
% disp('Match')
% else
% disp('No Match')
% end
%
% fprintf('Difference is: %f , LeftOverEResort is: %f, WastedEResortSupplyDemand is: %f \n,',TotalEUsed + TotalEWasted - (2*sum(Insol_kWh2)), sum(LeftOverEResort), sum(WastedEResortSupplyDemand))
% fprintf('Difference is: %f , LeftOverECity is: %f, WastedECitySupplyDemand is: %f \n,',TotalEUsed + TotalEWasted - (2*sum(Insol_kWh2)), sum(LeftOverECity), sum(WastedECitySupplyDemand))
%
% if testForErrorHereResort1 == 1
% disp('testForErrorHereResort1 = TRUE')
% else
% disp('testForErrorHereResort1 = FALSE')
% end
% if testForErrorHereCity1 == 1
% disp('testForErrorHereCity1 = TRUE')
% else
% disp('testForErrorHereCity1 = FALSE')
% end
______________________________________________________________________

This R code uses consumer demographics to predict which part of the population to target based on features such as political affiliation, income, residence features, and many others. The point of this code is to demonstrate how I begin with raw data in R and clean the data, look for interesting correlations, conduct model selection, normalize the coefficients, and conduct a multivariate regression in R.
library(lattice)
library(leaps)
#Load Data
#-----------------------------------------------------------------------------------
MyData <- read.csv("ForR_Analysis.csv")
summary(MyData)
#Filter Data Ranges
#-----------------------------------------------------------------------------------
FiltData <- MyData
colnames(FiltData) <- c("ID", "CVRP","APN","YEAR","SQFT", "LTV","BEDS","INCOME","OCC","BATHS","GARAGE","POOL","POLIT","PARTY")
#YearBuilt make a low threshold
FiltData <- FiltData[- which(FiltData$YEAR < 1850),]
#bldgliving make a low thresholed
FiltData <- FiltData[- which(FiltData$SQFT <= 300),]
#No. of bedroom make a max threshold
FiltData <- FiltData[- which(FiltData$BEDS > 10),]
#LTV make a max threshold
FiltData <- FiltData[- which(FiltData$LTV > 400),]
#Median income make a low threshold
FiltData <- FiltData[- which(FiltData$INCOME == 0),]
#Remove NA's
#-----------------------------------------------------------------------------------
FiltData <- FiltData[complete.cases(FiltData),]
summary(FiltData)
#Cross Correlation Plot
#-----------------------------------------------------------------------------------
#We need to install the levelplot package
#Baths modify values
CROSSCOR <- FiltData[,c("CVRP","YEAR","SQFT","BEDS","LTV","INCOME","OCC","GARAGE","POOL","POLIT")]
summary(CROSSCOR)
cor(CROSSCOR)
levelplot(cor(CROSSCOR), scales=list(x=list(rot=90)))
summary(FiltData)
#Variable Center and Scale
#-------------------------------------------------------------------
X <- data.frame(matrix(ncol = 0, nrow = nrow(FiltData)))
X$SQFT <- (FiltData$SQFT - mean(FiltData$SQFT))/(2*sd(FiltData$SQFT))
X$YEAR <- (FiltData$YEAR - mean(FiltData$YEAR))/(2*sd(FiltData$YEAR))
X$BEDS <- (FiltData$BEDS - mean(FiltData$BEDS))/(2*sd(FiltData$BEDS))
X$LTV <- (FiltData$LTV - mean(FiltData$LTV))/(2*sd(FiltData$LTV))
X$INCOME <- (FiltData$INCOME - mean(FiltData$INCOME))/(2*sd(FiltData$INCOME))
X$POLIT <- (FiltData$POLIT - mean(FiltData$POLIT))/(2*sd(FiltData$POLIT))
X$GARAGE <- (FiltData$GARAGE - mean(FiltData$GARAGE))
X$POOL <- (FiltData$POOL - mean(FiltData$POOL))
X$OCC <- (FiltData$OCC - mean(FiltData$OCC))
#Linear Regression
mylogit <- glm(FiltData$CVRP ~ YEAR + SQFT + BEDS + LTV + INCOME + OCC + GARAGE + POOL + POLIT, data = X, family = "binomial")
summary(mylogit)
#--------------------------------------------------------------------------
null=lm(FiltData$CVRP~1, data=FiltData)
full=lm(FiltData$CVRP ~ YEAR + SQFT + BEDS + LTV + INCOME + OCC + GARAGE + POOL + POLIT, data = FiltData)
step(null, scope=list(lower=null, upper=full), direction="forward")
#Backward elimination
step(full, data=FiltData, direction="backward")
#Stepwise Regression
step(null, scope = list(upper=full), data=FiltData, direction="both")
#Model Selection
#BIC
# The leaps package in R contains the regsubsets() function
# for model selection. This function handles best subset
# as well as stepwise (forward and backward) regression. We
# begin with best subset regression.
# The regsubsets() function will run best subset regression
# by default. It will only save information on the single best
# model for each subset size.
# install.packages("leaps") # run to install first
###subsets <- regsubsets(lpsa ~ .,data=train)
###summary(subsets)
# To have regsubsets save information on every possible model,
# use the optional argument nbest and set really.big = TRUE. The
# nbest refers to the maximum number of subsets of each size to
# save. The maximum number of subsets for any given size is
# choose(p,floor(p/2)), where p = total number of predictors.
# Note that choose(n,k) returns the binomial coefficent for
# n choose k, and floor() is the floor function.
choose(9,0:9) # total number of models for each subset size.
# Syntax: notice that we computed 9 binomial coefficents at once.
choose(9,floor(9/2)) # 8 choose 4
#12
#subsets <- regsubsets(MyData$PART~ X$CVRP + X$BEDROOMS + X$POOL + X$ELEC + X$GAS + X$POLIT + X$BUILT + X$LTV + X$ANNCOR + X$MEDINCOMEACS + X$SEEDSPV + X$EIGTH, data=MyData,nbest=924, nvmax = 7,really.big=TRUE)
subsets <- regsubsets(FiltData$CVRP ~ YEAR + SQFT + BEDS + LTV + INCOME + OCC + GARAGE + POOL + POLIT, data = FiltData, nbest=126, nvmax = 9,really.big=TRUE)
# The summary(subsets) object has a component called bic, which
# contains the BIC for every model saved in the subsets object.
bic <- summary(subsets)$bic
summary(subsets)$which[which(bic == min(bic)),]
########## Mallow's Cp
# The summary(subsets) object has a component called cp, which
# contains the Mallow's Cp statistic for every model saved in
# the subsets object.
cp <- summary(subsets)$cp
# To determine the model which minimizes Mallow's Cp, find the
# index of the model which minimizes the Cp and extract the model
# from the summary(subsets) object.
summary(subsets)$which[which(cp == min(cp)),]


