See More Code at my Github:
______________________________________________________________________
Below is a brief demonstration of how I implemented Python, Matlab, and R on three large data projects ranging from 200 observations to half a million. The first is an automated process that scans Department of Energy Green Button energy files. The second implemented in Matlab models 200 electric vehicles on the island of Lanai. The last in R shows my methodology for a targeted marketing statistical model that identifies the statistically significant parameter in our target market for electric vehicles.
______________________________________________________________________
Users submitted zipped and unzipped energy and gas 15 minute, hourly, and monthly data. I want just the energy data and not the gas. I also want to take the 15 minute data that exists and resample to hourly data using Python. The modules that I use to extract this XML data and place the results into one spreadsheet include Numpy for array handling, Pandas for dataframe structures and resampling, etree for xml reading, regex for regular expressions, and many others.
__author__ = 'john.horn' from GreenButtonGrap import * FolderWithZips = FolderContainingZippedGreenButtonFiles OutputPathUnzip = FolderContainingUnzippedFiles XMLSummaryDataPath = FolderPathToContainSummerizedGreenButtonData XMLT_Location = FilePathContainingGreenButtonXLSTFile # Unzip the files and move all unzipped and apriori unzipped xmls to the destination folder #Also rename the xml unzipped files so that it contains the applicant ID # DeleteContentsOfOutputFolder(OutputPathUnzip) UnzipFilesInFolder(FolderWithZips,OutputPathUnzip) MoveXMLInFolder(FolderWithZips,OutputPathUnzip) #Loop through each xml file and find the interval data we are interested in GreenButtonXML(OutputPathUnzip,XMLSummaryDataPath,XMLT_Location) # INDIVIDUAL MODULES TO PROCESS ZIPPED GREENBUTTON FILES #Overwrite Previous Results def DeleteContentsOfOutputFolder(OutputFolderContainUnzip): import os for the_file in os.listdir(OutputFolderContainUnzip): file_path = os.path.join(OutputFolderContainUnzip, the_file) try: if os.path.isfile(file_path): os.unlink(file_path) except Exception, e: print e #Unzip XML Files def UnzipFilesInFolder(FolderContainZips, OutputFolderContainUnzip): import zipfile import os import re RenameList = {} if not os.path.exists(OutputFolderContainUnzip): os.makedirs(OutputFolderContainUnzip) filename = os.listdir(FolderContainZips) for file in filename: fileName, fileExtension = os.path.splitext(file) if str(fileExtension) == '.zip': CompleteFile = FolderContainZips + str(file) fh = open(CompleteFile, 'rb') z = zipfile.ZipFile(fh) SurveyId = re.search(r'(\w+\-\w+\-)',str(file)) SurveyId = SurveyId.group(1) #print SurveyId for name in z.namelist(): newName = SurveyId + name RenameList[name] = newName #print name #print newName outpath = OutputFolderContainUnzip z.extract(name, outpath) fh.close() UnzipFiles = os.listdir(OutputFolderContainUnzip) os.chdir(str(OutputFolderContainUnzip)) for file in UnzipFiles: if file in RenameList: MyRename = str(RenameList[file]) OldFile = str(file) OldFile = os.path.join(OutputFolderContainUnzip, OldFile) MyRename = os.path.join(OutputFolderContainUnzip, MyRename) os.rename(OldFile,MyRename) #Move XML Files def MoveXMLInFolder(FolderContainsXML,OutputFolderContainXML): import os from shutil import copyfile if not os.path.exists(OutputFolderContainXML): os.makedirs(OutputFolderContainXML) filename = os.listdir(FolderContainsXML) for file in filename: fileName, fileExtension = os.path.splitext(file) if str(fileExtension) == '.xml': CompleteFile = FolderContainsXML + str(file) copyfile(CompleteFile, OutputFolderContainXML + str(file)) #Extract the XML using etree def transform(xmlPath, xslPath): from lxml import etree import re # read xsl file xslRoot = etree.fromstring(open(xslPath).read()) transform = etree.XSLT(xslRoot) # read xml xmlRoot = etree.fromstring(open(xmlPath).read()) # transform xml with xslt transRoot = transform(xmlRoot) # return transformation result SearchText = str(etree.tostring(transRoot)) StartTime = re.findall(r' <td>(\w+-\w+-\w+\s\w+:\w+)',SearchText) EndTime = re.findall(r'</span>(\w+-\w+-\w+\s\w+:\w+)',SearchText) value = re.findall(r'"right">([+-]?\d+)</td> ',SearchText) return StartTime, value #if __name__ == '__main__': # print(transform('./s0101m.mul0.xml', './tipitaka-latn.xsl')) #Main Module to sort the Gas and Energy Green Button files and extract data from XML def GreenButtonXML(InputFolderWithXML,OutputFolderWithSummaryData,XMLT_Location): import os import sys import re import numpy as np from pandas import * #import pandas as pd from datetime import datetime import collections if not os.path.exists(OutputFolderWithSummaryData): os.makedirs(OutputFolderWithSummaryData) MyXmlFiles = os.listdir(InputFolderWithXML) Counter = 0 a = {} #print len(MyXmlFiles) Denom = len(MyXmlFiles) for xmlFile in MyXmlFiles: Counter += 1 print str(100*Counter/Denom) + "% Complete" #print Counter print xmlFile if (re.search('Gas', xmlFile)) or (os.path.getsize(InputFolderWithXML + str(xmlFile)) == 0): GasRecord = True else: GasRecord = False if GasRecord == False: if xmlFile.endswith('.xml'): try: NewOutput = transform(xmlFile, XMLT_Location) StartTime = NewOutput[0] WattHours = NewOutput[1] ClientID = re.search(r'(\d+)-', xmlFile) deleteList = [] for dup in sorted(list_duplicates(StartTime)): deleteList.append(dup[1][0]) #Ideally you would take the 15 min interval over the 1 hour time step #Need to find duplicates in the StartTime #Take the smaller of the Watthours as this is probably the 15 min interval deleteList = [int(i) for i in deleteList] deleteList.sort(reverse=True) for deleteCount in deleteList: del StartTime[deleteCount] del WattHours[deleteCount] print(WattHours) print(StartTime) a[str(ClientID.group(1))] = Series(WattHours, index=StartTime) df = DataFrame(a) except AssertionError: print "error at ID: " + ClientID.group(1) df.to_csv(OutputFolderWithSummaryData + 'GBSummarized.csv') from collections import defaultdict def list_duplicates(seq): tally = defaultdict(list) for i,item in enumerate(seq): tally[item].append(i) return ((key,locs) for key,locs in tally.items() if len(locs)>1)
______________________________________________________________________
The goal of this project was to model 200 electric vehicles on the island of Lanai, Hawaii and optimize the solar panel capacity, # chargers, and battery storage to support the vehicles. Ridership was randomized and an ensemble or random models was created to arrive at average conditions over many runs.
VehiclesInTrip = 4; NumberOfDays = 1; CityBatteryCapacity = 0; ResortBatteryCapacity = 0; NumChargersResort = 40; NumChargersCity = 40; SolarCapacity = 175; %RELEASE THE CARS! for Count3 = 1:length(Junction_Table(:,3)) %Release Resort Cars if Junction_Table(Count3,3) == 1 for Quantity1 = 1:Junction_Table(Count3,2) % Ask if there is an available car in the Resort if % not skip that for loop if length(Lanai_Resort(:,1)) == 0 Overbooked_Resort(Quarter) = Overbooked_Resort(Quarter) + 1; continue end Lanai_Resort(length(Lanai_Resort(:,1)),2) = Lanai_Resort(length(Lanai_Resort(:,1)),2) - Junction_Table(Count3,4); Lanai_Resort(length(Lanai_Resort(:,1)),4) = Lanai_Resort(length(Lanai_Resort(:,1)),4) + Junction_Table(Count3,6); %Switching Locations if Junction_Table(Count3,5) == 1 Lanai_Resort(length(Lanai_Resort(:,1)),3) = 0; else Lanai_Resort(length(Lanai_Resort(:,1)),3) = 1; end CarsDriving = [CarsDriving; Lanai_Resort(length(Lanai_Resort(:,1)),:)]; Lanai_Resort(length(Lanai_Resort(:,1)),:) = []; %Possible skip location end else %Release City Cars for Quantity2 = 1:Junction_Table(Count3,2) % Ask if there is an available car in the Resort if % not skip that for loop if length(Lanai_City(:,1)) == 0 Overbooked_City(Quarter) = Overbooked_City(Quarter) + 1; continue end Lanai_City(length(Lanai_City(:,1)),2) = Lanai_City(length(Lanai_City(:,1)),2) - Junction_Table(Count3,4); Lanai_City(length(Lanai_City(:,1)),4) = Lanai_City(length(Lanai_City(:,1)),4) + Junction_Table(Count3,6); %Switching Locations if Junction_Table(Count3,5) == 1 Lanai_City(length(Lanai_City(:,1)),3) = 1; else Lanai_City(length(Lanai_City(:,1)),3) = 0; end CarsDriving = [CarsDriving; Lanai_City(length(Lanai_City(:,1)),:)]; Lanai_City(length(Lanai_City(:,1)),:) = []; %Possible skip location end end end %RELEASE THE CARS! end %RECEIVE THE CARS! CarsAtResort(Quarter) = length(Lanai_Resort(:,1)); CarsAtCity(Quarter) = length(Lanai_City(:,1)); CarsOutDriving(Quarter) = length(CarsDriving(:,1)); %for ReceiveSkim = 1:length(CarsDriving(:,1)) CurrentRow = length(CarsDriving(:,1)) + 1; while CurrentRow ~= 1 CurrentRow = CurrentRow - 1; if Quarter == CarsDriving(CurrentRow,4) %fprintf('the quarter %d \n', Quarter) if CarsDriving(CurrentRow,3) == 0 City_Charging(Quarter) = City_Charging(Quarter) + (CarBatteryCapacity - CarsDriving(CurrentRow,2)); %ASSUMPTION INSTANT CHARGING % CarsDriving(CurrentRow,2) = 24; Lanai_City = [CarsDriving(CurrentRow,:); Lanai_City]; CarsDriving(CurrentRow,:) = []; else Resort_Charging(Quarter) = Resort_Charging(Quarter) + (CarBatteryCapacity - CarsDriving(CurrentRow,2)); %ASSUMPTION INSTANT CHARGING % CarsDriving(CurrentRow,2) = 24; Lanai_Resort = [CarsDriving(CurrentRow,:); Lanai_Resort]; CarsDriving(CurrentRow,:) = []; end end end %RECEIVE THE CARS! %SORT THE VEHICLES (WE WANT TO SORT THE VEHICLES EVERY 15 MINUTES Lanai_Resort = sortrows(Lanai_Resort,2); Lanai_City = sortrows(Lanai_City,2); %SORT THE VEHICLES %CHARGE THE VEHICLES ChargerResortEffective = NumChargersResort; ChargerCityEffective = NumChargersCity; %Charge Resort Vehicles if length(Lanai_Resort(:,2)) < NumChargersResort ChargerResortEffective = length(Lanai_Resort(:,2)); end ChargingQuantResort = Insol_kWh2(Quarter)/ChargerResortEffective; if ChargingQuantResort > 1.875 ChargingQuantResort = 1.875; %fprintf('going over resort max at Q: %f \n', Quarter) end for CounterChargeResort = 1:ChargerResortEffective ChargeResortDeficit = CarBatteryCapacity - Lanai_Resort(CounterChargeResort,2); if ChargingQuantResort > ChargeResortDeficit WastedEResortSupplyDemand(Quarter) = WastedEResortSupplyDemand(Quarter)+ (ChargingQuantResort - ChargeResortDeficit); EnergyDelivered = ChargeResortDeficit; %disp('testForErrorHereResort1') testForErrorHereResort1 = 1; else EnergyDelivered = ChargingQuantResort; end Used_Energy_Resort(Quarter) = Used_Energy_Resort(Quarter) + EnergyDelivered; LeftOverEResort(Quarter) = LeftOverEResort(Quarter) - EnergyDelivered; Lanai_Resort(CounterChargeResort,2) = Lanai_Resort(CounterChargeResort,2) + ChargingLoss*EnergyDelivered; end %Charge City Vehicles if length(Lanai_City(:,2)) < NumChargersCity ChargerCityEffective = length(Lanai_City(:,2)); end ChargingQuantCity = Insol_kWh2(Quarter)/ChargerCityEffective; if ChargingQuantCity > 1.875 ChargingQuantCity = 1.875; %fprintf('going over city max at Q: %f \n', Quarter) end for CounterChargeCity = 1:ChargerCityEffective ChargeCityDeficit = CarBatteryCapacity - Lanai_City(CounterChargeCity,2); if ChargingQuantCity > ChargeCityDeficit WastedECitySupplyDemand(Quarter) = WastedECitySupplyDemand(Quarter) + (ChargingQuantCity - ChargeCityDeficit); EnergyDelivered = ChargeCityDeficit; %disp('testForErrorHereCity1') testForErrorHereCity1 = 1; else EnergyDelivered = ChargingQuantCity; end Used_Energy_City(Quarter) = Used_Energy_City(Quarter) + EnergyDelivered; LeftOverECity(Quarter) = LeftOverECity(Quarter) - EnergyDelivered; Lanai_City(CounterChargeCity,2) = Lanai_City(CounterChargeCity,2) + ChargingLoss*EnergyDelivered; end %CHARGE THE Local Batteries if Quarter > 1 CityBatteryLevel(Quarter) = CityBatteryLevel(Quarter - 1) + StationaryChargingLoss*LeftOverECity(Quarter); end if CityBatteryLevel(Quarter) > CityBatteryCapacity CityBatteryLevel(Quarter) = CityBatteryCapacity; end if (CityBatteryCapacity - CityBatteryLevel(Quarter)) < LeftOverECity(Quarter) LeftOverECity(Quarter) = LeftOverECity(Quarter) - (CityBatteryCapacity - CityBatteryLevel(Quarter)); else LeftOverECity(Quarter) = 0; end if Quarter > 1 ResortBatteryLevel(Quarter) = ResortBatteryLevel(Quarter - 1) + StationaryChargingLoss*LeftOverEResort(Quarter); end if ResortBatteryLevel(Quarter) > ResortBatteryCapacity ResortBatteryLevel(Quarter) = ResortBatteryCapacity; end if (ResortBatteryCapacity - ResortBatteryLevel(Quarter)) < LeftOverEResort(Quarter) LeftOverEResort(Quarter) = LeftOverEResort(Quarter) - (ResortBatteryCapacity - ResortBatteryLevel(Quarter)); else LeftOverEResort(Quarter) = 0; end %CHARGE VEHICLES AT NIGHT DayScaler = floor(Quarter/97); QuarterTest = Quarter - (96*DayScaler); %ONLY AT NIGHT if 84 < (QuarterTest) | (QuarterTest) < 24 ChargerResortEffective = NumChargersResort; ChargerCityEffective = NumChargersCity; %Charge City Vehicles if length(Lanai_City(:,2)) < NumChargersCity ChargerCityEffective = length(Lanai_City(:,2)); end ChargingQuantCity = CityBatteryLevel(Quarter)/ChargerCityEffective; if ChargingQuantCity > 1.875 ChargingQuantCity = 1.875; end for CounterChargeCity = 1:ChargerCityEffective ChargeCityDeficit = CarBatteryCapacity - Lanai_City(CounterChargeCity,2); if ChargingQuantCity > ChargeCityDeficit EnergyDelivered = ChargeCityDeficit; %fprintf('if %f \n', EnergyDelivered) else EnergyDelivered = ChargingQuantCity; %fprintf('else %f \n', EnergyDelivered) end Used_Energy_City(Quarter) = Used_Energy_City(Quarter) + EnergyDelivered; CityBatteryLevel(Quarter) = CityBatteryLevel(Quarter) - EnergyDelivered; Lanai_City(CounterChargeCity,2) = Lanai_City(CounterChargeCity,2) + ChargingLoss*EnergyDelivered; end %disp(EnergyDelivered) %fprintf('energy delivered city: %f \n', EnergyDelivered) %fprintf('charger quant city and resort: %f & %f \n', ChargingQuantCity, ChargingQuantResort) %Charge Resort Vehicles if length(Lanai_Resort(:,2)) < NumChargersResort ChargerResortEffective = length(Lanai_Resort(:,2)); end ChargingQuantResort = ResortBatteryLevel(Quarter)/ChargerResortEffective; if ChargingQuantResort > 1.875 ChargingQuantResort = 1.875; end for CounterChargeResort = 1:ChargerResortEffective ChargeResortDeficit = CarBatteryCapacity - Lanai_Resort(CounterChargeResort,2); if ChargingQuantResort > ChargeResortDeficit EnergyDelivered = ChargeResortDeficit; else EnergyDelivered = ChargingQuantResort; end Used_Energy_Resort(Quarter) = Used_Energy_Resort(Quarter) + EnergyDelivered; ResortBatteryLevel(Quarter) = ResortBatteryLevel(Quarter) - EnergyDelivered; Lanai_Resort(CounterChargeResort,2) = Lanai_Resort(CounterChargeResort,2) + ChargingLoss*EnergyDelivered; end end end TotalEUsed(Repeat) = (sum(Used_Energy_Resort) + sum(Used_Energy_City)); TotalEWastedFromVehicleSat = (sum(LeftOverECity) + sum(LeftOverEResort) - sum(WastedEResortSupplyDemand) - sum(WastedECitySupplyDemand)); TotalEWastedFromChargingSat = sum(WastedEResortSupplyDemand) + sum(WastedECitySupplyDemand); TotalEWasted(Repeat) = TotalEWastedFromVehicleSat + TotalEWastedFromChargingSat; LeftOverECityFormatted(Repeat) = sum(LeftOverECity) - sum(WastedECitySupplyDemand); LeftOverEResortFormatted(Repeat) = sum(LeftOverEResort) - sum(WastedEResortSupplyDemand); WastedECityChargerRun(Repeat) = sum(WastedECitySupplyDemand); WastedEResortChargerRun(Repeat) = sum(WastedEResortSupplyDemand); TripCountLog(Repeat) = Counterooni; end figure; x = linspace(1, NumberOfDays*NumOfQuarters/4,length(Resort_Charging)); plot(x,Resort_Charging, 'LineWidth', 2, 'Color', 'green'); title('Resort Charging Demand'); xlabel('Time(hour)'); ylabel('kwh demand'); hold on title(sprintf('EV Load versus Time; %d Vehicles/Hour; Two %d kW Panels',4*VehiclesInTrip,SolarCapacity)); xlabel('Time(hour)'); ylabel('kWh Demand'); plot(x,City_Charging, 'LineWidth', 2, 'Color', 'blue') plot(Insol_kWh_Display, 'LineWidth', 2, 'Color', 'red') h1eg1 = legend('Resort', 'City', 'Ave. Insolation', 'Location', 'NorthEast'); h_xlabel = get(gca,'XLabel'); set(h_xlabel,'FontSize',14); h_ylabel = get(gca,'YLabel'); set(h_ylabel,'FontSize',14); h_Title = get(gca,'Title'); set(h_Title,'FontSize',14); hold off figure(2); x = linspace(1, NumberOfDays*NumOfQuarters/4,length(Resort_Charging)); plot(x, CarsAtResort, 'LineWidth', 2, 'Color', 'green') hold on plot(x,CarsAtCity, 'LineWidth', 2, 'Color', 'blue') plot(x,CarsOutDriving, 'LineWidth', 2, 'Color', 'black') %plot(x,Overbooked_City, 'LineWidth', 2, 'Color', 'red') %plot(x,Overbooked_Resort, 'LineWidth', 2, 'Color', 'magenta') title(sprintf('Vehicle Tracking; %d Vehicles/Hour; Two %d kW Panels',4*VehiclesInTrip, SolarCapacity)); xlabel('Time(hour)'); ylabel('# Vehicles'); h1eg1 = legend('Resort', 'City','Driving','Location', 'NorthEast'); %h1eg1 = legend('Resort', 'City','Driving','Resort Overbooked', 'City Overbooked', 'Location', 'NorthEast'); h_xlabel = get(gca,'XLabel'); set(h_xlabel,'FontSize',14); h_ylabel = get(gca,'YLabel'); set(h_ylabel,'FontSize',14); h_Title = get(gca,'Title'); set(h_Title,'FontSize',14); hold off figure(3); x = linspace(1, NumberOfDays*NumOfQuarters/4,length(Resort_Charging)); plot(x,Used_Energy_City, 'LineWidth', 2, 'Color', 'blue') hold on plot(x,Used_Energy_Resort, 'LineWidth', 2, 'Color', 'green') title(sprintf('Solar & Battery Energy Used to Charge Vehicles(kWh); %d Vehicles/Hour; Two %d kW Panels',4*VehiclesInTrip, SolarCapacity)); xlabel('Time(hour)'); ylabel('Energy (kWh)'); h1eg1 = legend('City', 'Resort', 'Location', 'NorthEast'); h_xlabel = get(gca,'XLabel'); set(h_xlabel,'FontSize',14); h_ylabel = get(gca,'YLabel'); set(h_ylabel,'FontSize',14); h_Title = get(gca,'Title'); set(h_Title,'FontSize',14); LeftOverECityFormatted = LeftOverECity - WastedECitySupplyDemand; LeftOverEResortFormatted = LeftOverEResort - WastedEResortSupplyDemand; figure(4) hold off plot(x,LeftOverECityFormatted, 'LineWidth', 2, 'Color', 'blue'); title(sprintf('Energy Not Captured by Vehicles and Batteries(kWh); %d Vehicles/Hour, Two %d kW Panels',4*VehiclesInTrip, SolarCapacity)); xlabel('Time(hour)'); ylabel('Energy (kWh)'); hold on plot(x,LeftOverEResortFormatted, 'LineWidth', 2, 'Color', 'green'); plot(x,WastedECitySupplyDemand, 'LineWidth', 2, 'Color', [0 0 0.5]); plot(x,WastedEResortSupplyDemand, 'LineWidth', 2, 'Color', [0 0.7 0]); h1eg1 = legend('City Charger Saturation', 'Resort Charger Saturation', 'City Vehicle Saturation', 'Resort Vehicle Saturation', 'Location', 'NorthWest'); h_xlabel = get(gca,'XLabel'); set(h_xlabel,'FontSize',14); h_ylabel = get(gca,'YLabel'); set(h_ylabel,'FontSize',14); h_Title = get(gca,'Title'); set(h_Title,'FontSize',14); % figure(5) % plot(x, round(ResortBatteryLevel), 'LineWidth', 2, 'Color', 'green'); % hold on % title(sprintf('Stationary Battery Levels; %d Vehicles/Hour, Two %d kW Panels',4*VehiclesInTrip, SolarCapacity)); % xlabel('Time(hour)'); % ylabel('Energy Stored (kWh)'); % plot(x, round(CityBatteryLevel), 'LineWidth', 2, 'Color', 'blue') % h1eg1 = legend('Resort', 'City', 'Location', 'NorthEast'); % % hold off % % %CityMean(Repeat) = mean(Lanai_City(:,2)); % %ReportMean(Repeat) = mean(Lanai_Resort(:,2)); % % % %mean(CityMean) % %mean(ReportMean) fprintf('Two %dkW Systems \n',SolarCapacity) fprintf('%d Charging Stations at each site \n',NumChargersResort) fprintf('Average Day %d Run over %d cycles \n', NumberOfDays,CycleRepeats) fprintf('Mean Battery Level at City: %f (kWh) \n',mean(Lanai_City(:,2))) fprintf('Mean Battery Level at Resort: %f (kWh) \n',mean(Lanai_Resort(:,2))) % % % % Resort_Fit = polyfit(x,Resort_Charging,1); % % % % City_Fit = polyfit(x,City_Charging,1); % % % % fprintf('Slope Resort: %f \n', Resort_Fit(1)) % % % % fprintf('Slope City: %f \n', City_Fit(1)) fprintf('total Solar Energy / day: %f \n',(2*sum(Insol_kWh2))/NumberOfDays) fprintf('total energy used / day: %f \n',mean(TotalEUsed)/NumberOfDays) fprintf('total energy wasted / day: %f \n',mean(TotalEWasted)/NumberOfDays) fprintf('Lossed %f kWh/day due to Charging Station saturation \n', mean((LeftOverECityFormatted + LeftOverEResortFormatted)/NumberOfDays)) fprintf('Lossed %f kWh/day due to Vehicle Battery saturation \n', mean((WastedEResortChargerRun + WastedECityChargerRun)/NumberOfDays)) fprintf('Mean Trip Count/day: %f +/- %f \n', mean(TripCountLog/NumberOfDays), std(TripCountLog/NumberOfDays)) %LeftOverECityFormatted %LeftOverEResortFormatted %WastedEResortChargerRun %WastedECityChargerRun % fprintf('total energy wasted + used: %f \n', TotalEUsed + TotalEWasted) % fprintf('total energy in Resort battery: %f, \n', sum(ResortBatteryLevel)) % fprintf('total energy in City battery: %f, \n', sum(CityBatteryLevel)) % % if (round(sum(TotalEWasted + TotalEUsed)) == round(2*sum(Insol_kWh2))) % disp('Match') % else % disp('No Match') % end % % fprintf('Difference is: %f , LeftOverEResort is: %f, WastedEResortSupplyDemand is: %f \n,',TotalEUsed + TotalEWasted - (2*sum(Insol_kWh2)), sum(LeftOverEResort), sum(WastedEResortSupplyDemand)) % fprintf('Difference is: %f , LeftOverECity is: %f, WastedECitySupplyDemand is: %f \n,',TotalEUsed + TotalEWasted - (2*sum(Insol_kWh2)), sum(LeftOverECity), sum(WastedECitySupplyDemand)) % % if testForErrorHereResort1 == 1 % disp('testForErrorHereResort1 = TRUE') % else % disp('testForErrorHereResort1 = FALSE') % end % if testForErrorHereCity1 == 1 % disp('testForErrorHereCity1 = TRUE') % else % disp('testForErrorHereCity1 = FALSE') % end
______________________________________________________________________
This R code uses consumer demographics to predict which part of the population to target based on features such as political affiliation, income, residence features, and many others. The point of this code is to demonstrate how I begin with raw data in R and clean the data, look for interesting correlations, conduct model selection, normalize the coefficients, and conduct a multivariate regression in R.
library(lattice) library(leaps) #Load Data #----------------------------------------------------------------------------------- MyData <- read.csv("ForR_Analysis.csv") summary(MyData) #Filter Data Ranges #----------------------------------------------------------------------------------- FiltData <- MyData colnames(FiltData) <- c("ID", "CVRP","APN","YEAR","SQFT", "LTV","BEDS","INCOME","OCC","BATHS","GARAGE","POOL","POLIT","PARTY") #YearBuilt make a low threshold FiltData <- FiltData[- which(FiltData$YEAR < 1850),] #bldgliving make a low thresholed FiltData <- FiltData[- which(FiltData$SQFT <= 300),] #No. of bedroom make a max threshold FiltData <- FiltData[- which(FiltData$BEDS > 10),] #LTV make a max threshold FiltData <- FiltData[- which(FiltData$LTV > 400),] #Median income make a low threshold FiltData <- FiltData[- which(FiltData$INCOME == 0),] #Remove NA's #----------------------------------------------------------------------------------- FiltData <- FiltData[complete.cases(FiltData),] summary(FiltData) #Cross Correlation Plot #----------------------------------------------------------------------------------- #We need to install the levelplot package #Baths modify values CROSSCOR <- FiltData[,c("CVRP","YEAR","SQFT","BEDS","LTV","INCOME","OCC","GARAGE","POOL","POLIT")] summary(CROSSCOR) cor(CROSSCOR) levelplot(cor(CROSSCOR), scales=list(x=list(rot=90))) summary(FiltData) #Variable Center and Scale #------------------------------------------------------------------- X <- data.frame(matrix(ncol = 0, nrow = nrow(FiltData))) X$SQFT <- (FiltData$SQFT - mean(FiltData$SQFT))/(2*sd(FiltData$SQFT)) X$YEAR <- (FiltData$YEAR - mean(FiltData$YEAR))/(2*sd(FiltData$YEAR)) X$BEDS <- (FiltData$BEDS - mean(FiltData$BEDS))/(2*sd(FiltData$BEDS)) X$LTV <- (FiltData$LTV - mean(FiltData$LTV))/(2*sd(FiltData$LTV)) X$INCOME <- (FiltData$INCOME - mean(FiltData$INCOME))/(2*sd(FiltData$INCOME)) X$POLIT <- (FiltData$POLIT - mean(FiltData$POLIT))/(2*sd(FiltData$POLIT)) X$GARAGE <- (FiltData$GARAGE - mean(FiltData$GARAGE)) X$POOL <- (FiltData$POOL - mean(FiltData$POOL)) X$OCC <- (FiltData$OCC - mean(FiltData$OCC)) #Linear Regression mylogit <- glm(FiltData$CVRP ~ YEAR + SQFT + BEDS + LTV + INCOME + OCC + GARAGE + POOL + POLIT, data = X, family = "binomial") summary(mylogit) #-------------------------------------------------------------------------- null=lm(FiltData$CVRP~1, data=FiltData) full=lm(FiltData$CVRP ~ YEAR + SQFT + BEDS + LTV + INCOME + OCC + GARAGE + POOL + POLIT, data = FiltData) step(null, scope=list(lower=null, upper=full), direction="forward") #Backward elimination step(full, data=FiltData, direction="backward") #Stepwise Regression step(null, scope = list(upper=full), data=FiltData, direction="both") #Model Selection #BIC # The leaps package in R contains the regsubsets() function # for model selection. This function handles best subset # as well as stepwise (forward and backward) regression. We # begin with best subset regression. # The regsubsets() function will run best subset regression # by default. It will only save information on the single best # model for each subset size. # install.packages("leaps") # run to install first ###subsets <- regsubsets(lpsa ~ .,data=train) ###summary(subsets) # To have regsubsets save information on every possible model, # use the optional argument nbest and set really.big = TRUE. The # nbest refers to the maximum number of subsets of each size to # save. The maximum number of subsets for any given size is # choose(p,floor(p/2)), where p = total number of predictors. # Note that choose(n,k) returns the binomial coefficent for # n choose k, and floor() is the floor function. choose(9,0:9) # total number of models for each subset size. # Syntax: notice that we computed 9 binomial coefficents at once. choose(9,floor(9/2)) # 8 choose 4 #12 #subsets <- regsubsets(MyData$PART~ X$CVRP + X$BEDROOMS + X$POOL + X$ELEC + X$GAS + X$POLIT + X$BUILT + X$LTV + X$ANNCOR + X$MEDINCOMEACS + X$SEEDSPV + X$EIGTH, data=MyData,nbest=924, nvmax = 7,really.big=TRUE) subsets <- regsubsets(FiltData$CVRP ~ YEAR + SQFT + BEDS + LTV + INCOME + OCC + GARAGE + POOL + POLIT, data = FiltData, nbest=126, nvmax = 9,really.big=TRUE) # The summary(subsets) object has a component called bic, which # contains the BIC for every model saved in the subsets object. bic <- summary(subsets)$bic summary(subsets)$which[which(bic == min(bic)),] ########## Mallow's Cp # The summary(subsets) object has a component called cp, which # contains the Mallow's Cp statistic for every model saved in # the subsets object. cp <- summary(subsets)$cp # To determine the model which minimizes Mallow's Cp, find the # index of the model which minimizes the Cp and extract the model # from the summary(subsets) object. summary(subsets)$which[which(cp == min(cp)),]