====== Fuel ======
===== Reading the data =====
I downloaded (November 2012) the [[http://www.fueleconomy.gov/feg/epadata/vehicles.csv.zip|data]] from the [[http://www.fueleconomy.gov/feg/download.shtml|fuelEconomy]] page. Unzipping the archive yields the file ''vehicles.csv''.
> setwd("C:/Users/Batagelj/data/fuelEconomy") # directory holding the unzipped vehicles.csv
> a <- read.csv("./vehicles.csv",header=TRUE,sep=",",dec=".",quote='"',
+ na.strings="",stringsAsFactors=FALSE) # empty fields become NA; text columns stay character, not factor
> dim(a)
[1] 33057 71
> names(a)
[1] "barrels08" "barrelsA08" "charge120" "charge240" "city08"
[6] "city08U" "cityA08" "cityA08U" "cityCD" "cityE"
[11] "cityUF" "co2" "co2A" "co2TailpipeAGpm" "co2TailpipeGpm"
[16] "comb08" "comb08U" "combA08" "combA08U" "combE"
[21] "combinedCD" "combinedUF" "cylinders" "displ" "drive"
[26] "engId" "eng_dscr" "feScore" "fuelCost08" "fuelCostA08"
[31] "fuelType" "fuelType1" "ghgScore" "ghgScoreA" "highway08"
[36] "highway08U" "highwayA08" "highwayA08U" "highwayCD" "highwayE"
[41] "highwayUF" "hlv" "hpv" "id" "lv2"
[46] "lv4" "make" "model" "mpgData" "phevBlended"
[51] "pv2" "pv4" "rangeCityA" "rangeHwyA" "trany"
[56] "UCity" "UCityA" "UHighway" "UHighwayA" "VClass"
[61] "year" "youSaveSpend" "guzzler" "trans_dscr" "tCharger"
[66] "sCharger" "atvType" "fuelType2" "rangeA" "evMotor"
[71] "mfrCode"
The data describe 33057 car types with 71 variables. To get a first impression of the data we list the first six descriptions.
> head(a)
barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 cityA08U cityCD cityE cityUF
1 15.68944 0 0 0 19 0 0 0 0 0 0
2 29.95056 0 0 0 9 0 0 0 0 0 0
3 12.19557 0 0 0 23 0 0 0 0 0 0
4 29.95056 0 0 0 10 0 0 0 0 0 0
5 17.33749 0 0 0 17 0 0 0 0 0 0
6 14.96429 0 0 0 21 0 0 0 0 0 0
co2 co2A co2TailpipeAGpm co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD
1 -1 -1 0 423.1905 21 0 0 0 0 0
2 -1 -1 0 807.9091 11 0 0 0 0 0
3 -1 -1 0 329.1481 27 0 0 0 0 0
4 -1 -1 0 807.9091 11 0 0 0 0 0
5 -1 -1 0 467.7368 19 0 0 0 0 0
6 -1 -1 0 403.9545 22 0 0 0 0 0
combinedUF cylinders displ drive engId eng_dscr feScore fuelCost08
1 0 4 2.0 Rear-Wheel Drive 9011 (FFS) -1 2500
2 0 12 4.9 Rear-Wheel Drive 22020 (GUZZLER) -1 4750
3 0 4 2.2 Front-Wheel Drive 2100 (FFS) -1 1950
4 0 8 5.2 Rear-Wheel Drive 2850 -1 4750
5 0 4 2.2 4-Wheel or All-Wheel Drive 66031 (FFS,TRBO) -1 3000
6 0 4 1.8 Front-Wheel Drive 66020 (FFS) -1 2400
fuelCostA08 fuelType fuelType1 ghgScore ghgScoreA highway08 highway08U highwayA08
1 0 Regular Regular Gasoline -1 -1 25 0 0
2 0 Regular Regular Gasoline -1 -1 14 0 0
3 0 Regular Regular Gasoline -1 -1 33 0 0
4 0 Regular Regular Gasoline -1 -1 12 0 0
5 0 Premium Premium Gasoline -1 -1 23 0 0
6 0 Regular Regular Gasoline -1 -1 24 0 0
highwayA08U highwayCD highwayE highwayUF hlv hpv id lv2 lv4 make
1 0 0 0 0 0 0 1 0 0 Alfa Romeo
2 0 0 0 0 0 0 10 0 0 Ferrari
3 0 0 0 0 19 77 100 0 0 Dodge
4 0 0 0 0 0 0 1000 0 0 Dodge
5 0 0 0 0 0 0 10000 0 14 Subaru
6 0 0 0 0 0 0 10001 0 15 Subaru
model mpgData phevBlended pv2 pv4 rangeCityA rangeHwyA trany UCity
1 Spider Veloce 2000 Y false 0 0 0 0 Manual 5-spd 23.3333
2 Testarossa N false 0 0 0 0 Manual 5-spd 11.0000
3 Charger Y false 0 0 0 0 Manual 5-spd 29.0000
4 B150/B250 Wagon 2WD N false 0 0 0 0 Automatic 3-spd 12.2222
5 Legacy AWD Turbo N false 0 90 0 0 Manual 5-spd 21.0000
6 Loyale N false 0 88 0 0 Automatic 3-spd 27.0000
UCityA UHighway UHighwayA VClass year youSaveSpend guzzler trans_dscr tCharger
1 0 35.0000 0 Two Seaters 1985 -1000 NA
2 0 19.0000 0 Two Seaters 1985 -12250 T NA
3 0 47.0000 0 Subcompact Cars 1985 1750 SIL NA
4 0 16.6667 0 Vans 1985 -12250 NA
5 0 32.0000 0 Compact Cars 1993 -3500 TRUE
6 0 33.0000 0 Compact Cars 1993 -500 NA
sCharger atvType fuelType2 rangeA evMotor mfrCode
1
2
3
4
5
6
===== Selection of variables =====
We first list the basic info about all variables:
> # Profile every column of a: storage type, summary(), number of NAs, and
> # either the count of nonzero values (double/integer columns) or the number
> # of distinct values (character columns; listed when there are fewer than 50).
> Na <- names(a)
> # Iterate over the names vector itself; the bound must not depend on an
> # unrelated workspace object.
> for(i in seq_along(Na)) {
+ v <- Na[i]; V <- a[[v]]; tv <- typeof(V)
+ cat('\n',i,v,':',tv,'\n')
+ print(summary(V))
+ cat(' NA =',sum(is.na(V)))
+ # na.rm=TRUE keeps the nonzero count defined when a numeric column has NAs
+ if(tv %in% c("double","integer")) cat(' Nonzero =',sum(V!=0,na.rm=TRUE),'\n')
+ # lev (rather than F) so the built-in shorthand for FALSE is not masked
+ if(tv=="character") { lev <- levels(factor(V)); L <- length(lev)
+ cat(" Levels =",L,'\n'); if(L<50) print(lev) }
+ }
1 barrels08 : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.05989 14.96000 17.34000 17.90000 20.59000 47.07000
NA = 0 Nonzero = 33057
2 barrelsA08 : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.1823 0.0000 8.3200
NA = 0 Nonzero = 1000
3 charge120 : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 0 0 0
NA = 0 Nonzero = 0
4 charge240 : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000000 0.000000 0.000000 0.000711 0.000000 6.000000
NA = 0 Nonzero = 7
5 city08 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
6.0 15.0 17.0 17.4 20.0 138.0
NA = 0 Nonzero = 33057
6 city08U : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 1.433 0.000 138.300
NA = 0 Nonzero = 2415
7 cityA08 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.3484 0.0000 108.0000
NA = 0 Nonzero = 1000
8 cityA08U : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.1245 0.0000 108.0000
NA = 0 Nonzero = 327
9 cityCD : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00e+00 0.00e+00 0.00e+00 2.12e-05 0.00e+00 3.50e-01
NA = 0 Nonzero = 2
10 cityE : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.06942 0.00000 122.00000
NA = 0 Nonzero = 48
11 cityUF : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000000 0.0000000 0.0000000 0.0001124 0.0000000 0.6800000
NA = 0 Nonzero = 7
12 co2 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.00 -1.00 -1.00 12.72 -1.00 847.00
NA = 0 Nonzero = 33016
13 co2A : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.000 -1.000 -1.000 1.322 -1.000 719.000
NA = 0 Nonzero = 33057
14 co2TailpipeAGpm : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 0.00 15.54 0.00 719.00
NA = 0 Nonzero = 993
15 co2TailpipeGpm : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 404.0 467.7 483.4 555.4 1270.0
NA = 0 Nonzero = 33016
16 comb08 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
7.0 16.0 19.0 19.6 22.0 121.0
NA = 0 Nonzero = 33057
17 comb08U : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 1.623 0.000 120.900
NA = 0 Nonzero = 2415
18 combA08 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.3991 0.0000 100.0000
NA = 0 Nonzero = 1000
19 combA08U : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.1417 0.0000 100.0000
NA = 0 Nonzero = 327
20 combE : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.07321 0.00000 121.00000
NA = 0 Nonzero = 48
21 combinedCD : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000e+00 0.000e+00 0.000e+00 1.154e-05 0.000e+00 1.907e-01
NA = 0 Nonzero = 2
22 combinedUF : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000000 0.0000000 0.0000000 0.0001094 0.0000000 0.6600000
NA = 0 Nonzero = 7
23 cylinders : character
Length Class Mode
33057 character character
NA = 9 Levels = 11
[1] "-" "10" "12" "16" "2" "3" "4" "5" "6" "8" "NA"
24 displ : character
Length Class Mode
33057 character character
NA = 9 Levels = 68
25 drive : character
Length Class Mode
33057 character character
NA = 2067 Levels = 7
[1] "2-Wheel Drive" "4-Wheel Drive" "4-Wheel or All-Wheel Drive"
[4] "All-Wheel Drive" "Front-Wheel Drive" "Part-time 4-Wheel Drive"
[7] "Rear-Wheel Drive"
26 engId : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 522 10110 4845 69100
NA = 0 Nonzero = 20459
27 eng_dscr : character
Length Class Mode
33057 character character
NA = 13717 Levels = 516
28 feScore : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.0000 -1.0000 -1.0000 -0.7957 -1.0000 10.0000
NA = 0 Nonzero = 33057
29 fuelCost08 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
500 2400 2850 2914 3250 8150
NA = 0 Nonzero = 33057
30 fuelCostA08 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 0.0 0.0 118.9 0.0 5400.0
NA = 0 Nonzero = 997
31 fuelType : character
Length Class Mode
33057 character character
NA = 0 Levels = 12
[1] "CNG" "Diesel" "Electricity"
[4] "Gasoline or E85" "Gasoline or natural gas" "Gasoline or propane"
[7] "Midgrade" "Premium" "Premium Gas or Electricity"
[10] "Premium or E85" "Regular" "Regular Gas and Electricity"
32 fuelType1 : character
Length Class Mode
33057 character character
NA = 0 Levels = 6
[1] "Diesel" "Electricity" "Midgrade Gasoline" "Natural Gas"
[5] "Premium Gasoline" "Regular Gasoline"
33 ghgScore : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.0000 -1.0000 -1.0000 -0.7963 -1.0000 10.0000
NA = 0 Nonzero = 33057
34 ghgScoreA : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.0000 -1.0000 -1.0000 -0.9775 -1.0000 8.0000
NA = 0 Nonzero = 33055
35 highway08 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
9.0 19.0 23.0 23.4 27.0 105.0
NA = 0 Nonzero = 33057
36 highway08U : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 1.954 0.000 104.800
NA = 0 Nonzero = 2415
37 highwayA08 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.4846 0.0000 102.0000
NA = 0 Nonzero = 1000
38 highwayA08U : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.1725 0.0000 102.5000
NA = 0 Nonzero = 327
39 highwayCD : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 0 0 0
NA = 0 Nonzero = 0
40 highwayE : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.07794 0.00000 120.00000
NA = 0 Nonzero = 48
41 highwayUF : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000000 0.000000 0.000000 0.000105 0.000000 0.650000
NA = 0 Nonzero = 7
42 hlv : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 2.076 0.000 49.000
NA = 0 Nonzero = 4143
43 hpv : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 0.00 10.66 0.00 195.00
NA = 0 Nonzero = 4142
44 id : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
1 8265 16530 16590 24970 33340
NA = 0 Nonzero = 33057
45 lv2 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 1.901 0.000 41.000
NA = 0 Nonzero = 5509
46 lv4 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 6.242 13.000 55.000
NA = 0 Nonzero = 11637
47 make : character
Length Class Mode
33057 character character
NA = 0 Levels = 128
48 model : character
Length Class Mode
33057 character character
NA = 0 Levels = 3020
49 mpgData : character
Length Class Mode
33057 character character
NA = 0 Levels = 2
[1] "N" "Y"
50 phevBlended : character
Length Class Mode
33057 character character
NA = 0 Levels = 2
[1] "false" "true"
51 pv2 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 0.00 13.83 0.00 194.00
NA = 0 Nonzero = 5498
52 pv4 : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 0.00 33.43 90.00 192.00
NA = 0 Nonzero = 11637
53 rangeCityA : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.00581 0.00000 39.92000
NA = 0 Nonzero = 7
54 rangeHwyA : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.00536 0.00000 36.30000
NA = 0 Nonzero = 7
55 trany : character
Length Class Mode
33057 character character
NA = 14 Levels = 42
[1] "Auto (AV-S6)" "Auto (AV-S8)"
[3] "Auto (AV)" "Auto(A1)"
[5] "Auto(A8)" "Auto(AM-S6)"
[7] "Auto(AM-S7)" "Auto(AM5)"
[9] "Auto(AM6)" "Auto(AM7)"
[11] "Auto(AV-S6)" "Auto(AV-S7)"
[13] "Auto(AV-S8)" "Auto(L3)"
[15] "Auto(L4)" "Automatic (A1)"
[17] "Automatic (A6)" "Automatic (AM-S6)"
[19] "Automatic (AM-S7)" "Automatic (AM5)"
[21] "Automatic (AM6)" "Automatic (AV-S6)"
[23] "Automatic (AV)" "Automatic (S4)"
[25] "Automatic (S5)" "Automatic (S6)"
[27] "Automatic (S7)" "Automatic (S8)"
[29] "Automatic (variable gear ratios)" "Automatic 3-spd"
[31] "Automatic 4-spd" "Automatic 5-spd"
[33] "Automatic 6-spd" "Automatic 7-spd"
[35] "Automatic 8-spd" "Manual 3-spd"
[37] "Manual 4-spd" "Manual 4-spd Doubled"
[39] "Manual 5-spd" "Manual 5 spd"
[41] "Manual 6-spd" "Manual 7-spd"
56 UCity : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 18.00 21.00 21.83 24.56 197.60
NA = 0 Nonzero = 33032
57 UCityA : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.4161 0.0000 154.3000
NA = 0 Nonzero = 954
58 UHighway : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 26.88 32.00 32.58 37.10 149.70
NA = 0 Nonzero = 33032
59 UHighwayA : double
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.6402 0.0000 146.4000
NA = 0 Nonzero = 954
60 VClass : character
Length Class Mode
33057 character character
NA = 0 Levels = 30
[1] "Compact Cars" "Large Cars"
[3] "Midsize-Large Station Wagons" "Midsize Cars"
[5] "Midsize Station Wagons" "Minicompact Cars"
[7] "Minivan - 2WD" "Minivan - 4WD"
[9] "Small Pickup Trucks" "Small Pickup Trucks 2WD"
[11] "Small Pickup Trucks 4WD" "Small Station Wagons"
[13] "Special Purpose Vehicle" "Special Purpose Vehicle 2WD"
[15] "Special Purpose Vehicle 4WD" "Special Purpose Vehicles"
[17] "Special Purpose Vehicles/2wd" "Special Purpose Vehicles/4wd"
[19] "Sport Utility Vehicle - 2WD" "Sport Utility Vehicle - 4WD"
[21] "Standard Pickup Trucks" "Standard Pickup Trucks 2WD"
[23] "Standard Pickup Trucks 4WD" "Standard Pickup Trucks/2wd"
[25] "Subcompact Cars" "Two Seaters"
[27] "Vans" "Vans Passenger"
[29] "Vans, Cargo Type" "Vans, Passenger Type"
61 year : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
1984 1989 1998 1998 2006 2013
NA = 0 Nonzero = 33057
62 youSaveSpend : integer
Min. 1st Qu. Median Mean 3rd Qu. Max.
-29250 -4750 -2750 -3072 -500 9000
NA = 0 Nonzero = 31588
63 guzzler : character
Length Class Mode
33057 character character
NA = 31103 Levels = 3
[1] "G" "S" "T"
64 trans_dscr : character
Length Class Mode
33057 character character
NA = 18011 Levels = 52
65 tCharger : logical
Mode TRUE NA's
logical 3385 29672
NA = 29672
66 sCharger : character
Length Class Mode
33057 character character
NA = 32623 Levels = 1
[1] "S"
67 atvType : character
Length Class Mode
33057 character character
NA = 30397 Levels = 9
[1] "avail" "Bifuel (CNG)" "Bifuel (LPG)" "CNG" "Diesel"
[6] "EV" "FFV" "Hybrid" "Plug-in Hybrid"
68 fuelType2 : character
Length Class Mode
33057 character character
NA = 32057 Levels = 4
[1] "E85" "Electricity" "Natural Gas" "Propane"
69 rangeA : character
Length Class Mode
33057 character character
NA = 32062 Levels = 89
70 evMotor : character
Length Class Mode
33057 character character
NA = 32803 Levels = 46
[1] "100 kW DCPM" "101V Ni-MH" "102kW AC Induction"
[4] "107 kW AC Induction" "111 kW" "115 kW AC Induction"
[7] "115V Li-Ion" "125 kW AC Induction" "126V Li-Ion"
[10] "144V Li-Ion" "144V Ni-MH" "150 kW"
[13] "158V Ni-MH" "18 kW" "2 @ 150 kw (300 kw)"
[16] "202V Ni-MH" "24 KW AC Synchronous" "245V Ni-MH"
[19] "266V Li-Ion" "27 KW AC Induction" "270V Li-Ion"
[22] "275V Ni-MH" "288V Ni-MH" "30 kW DCPM"
[25] "300V Ni-MH" "312V Ni-MH" "330V Ni-MH"
[28] "346V Li-Ion" "36V Ni-MH" "374V Li-Ion"
[31] "49 kW DC Brushless" "49kW DC Brushless" "50 KW DC"
[34] "52 kW AC Induction" "55 kW DCPM" "56kW AC Induction"
[37] "62 KW AC Induction" "66 kW DCPM" "67 KW AC"
[40] "67 KW AC Induction" "67 KW AC Induction" "68 kW"
[43] "80 kW DCPM" "83 kWh" "85 kW AC Induction"
[46] "92 kW DC Brushless"
71 mfrCode : character
Length Class Mode
33057 character character
NA = 30820 Levels = 37
[1] "ADX" "ASX" "AZD" "BEX" "BGT" "BMX" "CDA" "CRX" "DSX" "FEX" "FJX" "FMX" "FSK" "GMX" "HNX"
[16] "HYX" "JCX" "KMX" "LRX" "LTX" "MAX" "MBX" "MLN" "MTX" "NLX" "NSX" "PRX" "RII" "RRG" "SAX"
[31] "SKX" "TKX" "TSL" "TVP" "TYX" "VVX" "VWX"
>
Simona's proposal:
1 barrels08 annual petroleum consumption in barrels for fuelType1 letna poraba goriva
5 city08 city MPG for fuelType1 mestna poraba
15 co2TailpipeGpm tailpipe CO2 in grams/mile for fuelType1 izpuh CO2
16 comb08 combined MPG for fuelType1 skupna poraba MPG
(1985-2007 ocenjen)
23 cylinders engine cylinders število valjev
24 displ engine displacement in liters prostornina motorja
25 drive drive axle type pogon
26 engId EPA model type index indeks izračunan po EPA
32 fuelType1 For single fuel vehicles, this will be the only fuel. (običajen) tip goriva
For dual fuel vehicles, this will be the conventional
35 highway08 highway MPG for fuelType1 poraba na avtocesti
47 make manufacturer (division) proizvajalec
48 model model name (carline) model avta
55 trany transmission menjalnik
60 VClass EPA vehicle size class oznaka EPA velikosti vozila
EPA = Environmental Protection Agency; MPG = mile/gallon
The variable 44 id contains a unique id of the car type.
Maybe variable 27 (eng_dscr) can also be converted into something useful.
> V <- a[[27]]
> t <- table(V)
> length(t)
[1] 516
> sort(t,decreasing=TRUE)[1:20]
V
(FFS) SIDI (FFS) CA model
8828 1020 926
(FFS) (MPFI) (FFS,TRBO) FFV
734 666 454
(350 V8) (FFS) (GUZZLER) (FFS) SOHC
411 366 354
(NO-CAT) FLEX-FUEL GUZZLER
238 198 195
(FFS) (SPFI) (GUZZLER) (FFS) (MPFI) (350 V8)
194 122 120
CA model (350 V8) (FFS) (MPFI) (GM-CHEV)
113 106 102
DOHC (FFS) (DIESEL)
96 95
> U <- factor(V)
> levels(U)
For example, it could be replaced by logical variables FFS, MPFI, GUZZLER, TRBO, SOHC, DOHC, DIESEL, ...
> V <- a[[29]]
> plot(sort(V))
> table(a[[49]])
N Y
22368 10689
> table(a[[50]])
false true
33054 3
>
Variables 29 fuelCost08, 31 fuelType, 49 mpgData, 56 UCity, 58 UHighway, 61 year, 62 youSaveSpend also seem OK. Instead of variable 48 model the variable 47 make seems better???
As we see, some variables have only a few "nonzero" values. Maybe some of them can be combined into a single variable??? Can the variables 51 pv2 and 52 pv4 be merged? What about variables 45 lv2, 46 lv4, 42 hlv, 43 hpv ???
hlv - hatchback luggage volume (cubic feet)
hpv - hatchback passenger volume (cubic feet)
lv2 - 2 door luggage volume (cubic feet)
lv4 - 4 door luggage volume (cubic feet)
pv2 - 2-door passenger volume (cubic feet)
pv4 - 4-door passenger volume (cubic feet)
Transform into 3 variables: luggage volume, passenger volume, type (2 door, 4 door, hatchback) ???
> x <- a[[42]]
> y <- a[[45]]
> z <- a[[46]]
> w <- x+y+z
> sum(w>0)
[1] 17741
> t <- (x>0)+(y>0)*2+(z>0)*4
> table(t)
t
0 1 2 3 4 5 6 7
15316 2596 3204 304 8684 952 1710 291
> w <- x+pmax(y,z)
> plot(sort(w))
The variables 47 make and 48 model (and if necessary, 61 year or 60 VClass) can be
combined into a unique car name ???
===== Names =====
How can we construct a unique car type label? I tried the combinations suggested on [[http://en.wikipedia.org/wiki/Car_model|Wikipedia]]:
* make:model
* make:model-year
> b <- paste(a[[47]],":",a[[48]],sep="")
> head(b)
[1] "Alfa Romeo:Spider Veloce 2000" "Ferrari:Testarossa"
[3] "Dodge:Charger" "Dodge:B150/B250 Wagon 2WD"
[5] "Subaru:Legacy AWD Turbo" "Subaru:Loyale"
> d <- duplicated(b)
> sum(d)
[1] 29975
> b <- paste(a[[47]],":",a[[48]],"-",a[[61]],sep="")
> d <- duplicated(b)
> sum(d)
[1] 19162
> head(b)
[1] "Alfa Romeo:Spider Veloce 2000-1985" "Ferrari:Testarossa-1985"
[3] "Dodge:Charger-1985" "Dodge:B150/B250 Wagon 2WD-1985"
[5] "Subaru:Legacy AWD Turbo-1993" "Subaru:Loyale-1993"
> id <- a[[44]]
> d <- duplicated(id)
> sum(d)
[1] 0
> b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],sep="")
> head(b)
[1] "Alfa Romeo:Spider Veloce 2000-1985/Manual 5-spd"
[2] "Ferrari:Testarossa-1985/Manual 5-spd"
[3] "Dodge:Charger-1985/Manual 5-spd"
[4] "Dodge:B150/B250 Wagon 2WD-1985/Automatic 3-spd"
[5] "Subaru:Legacy AWD Turbo-1993/Manual 5-spd"
[6] "Subaru:Loyale-1993/Automatic 3-spd"
> d <- duplicated(b)
> sum(d)
[1] 11012
> b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],"/",a[[49]],sep="")
> sum(duplicated(b))
[1] 8729
> b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],"/",a[[31]],sep="")
> sum(duplicated(b))
[1] 8936
> b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],"/",a[[25]],sep="")
> sum(duplicated(b))
[1] 10799
> b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],"/",a[[23]],sep="")
> sum(duplicated(b))
[1] 7587
> b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],"/",a[[27]],sep="")
> sum(duplicated(b))
[1] 6172
> head(b)
[1] "Alfa Romeo:Spider Veloce 2000-1985/Manual 5-spd/(FFS)"
[2] "Ferrari:Testarossa-1985/Manual 5-spd/(GUZZLER)"
[3] "Dodge:Charger-1985/Manual 5-spd/(FFS)"
[4] "Dodge:B150/B250 Wagon 2WD-1985/Automatic 3-spd/NA"
[5] "Subaru:Legacy AWD Turbo-1993/Manual 5-spd/(FFS,TRBO)"
[6] "Subaru:Loyale-1993/Automatic 3-spd/(FFS)"
>
For the labels make+model+year we get the following maximal numbers of duplicates
> b <- paste(a[[47]],":",a[[48]],"-",a[[61]],sep="")
> t <- table(b)
> length(t)
[1] 13895
> sort(t,decreasing=TRUE)[1:20]
b
Jeep:Cherokee/Wagoneer-1985 Chevrolet:C10 Pickup 2WD-1984
24 19
Ford:F150 Pickup 2WD-1984 GMC:C15 Pickup 2WD-1984
19 19
Chevrolet:C10 Pickup 2WD-1985 Chevrolet:S10 Pickup 2WD-1984
18 18
GMC:C15 Pickup 2WD-1985 GMC:S15 Pickup 2WD-1984
18 18
Chevrolet:C10 Pickup 2WD-1986 Dodge:Ram 50 Pickup 2WD-1984
17 17
Ford:Ranger Pickup 2WD-1984 GMC:C15 Pickup 2WD-1986
17 17
Mitsubishi:Truck 2WD-1984 Chevrolet:G10/20 Van 2WD-1984
17 16
Ford:Escort-1984 Ford:Escort-1985
16 16
Ford:F150 Pickup 2WD-1985 GMC:Vandura G15/25 2WD-1984
16 16
Mercury:Lynx-1984 Volkswagen:Rabbit-1984
16 16
and for the labels make+model
> b <- paste(a[[47]],":",a[[48]],sep="")
> t <- table(b)
> length(t)
[1] 3082
> sort(t,decreasing=TRUE)[1:20]
b
Ford:F150 Pickup 2WD Ford:F150 Pickup 4WD Ford:Ranger Pickup 2WD
197 175 169
Ford:Mustang Volkswagen:Jetta GMC:Sierra 1500 2WD
160 157 149
GMC:Sierra 1500 4WD Honda:Civic Chevrolet:Camaro
149 141 136
Mitsubishi:Eclipse Honda:Accord Chevrolet:S10 Pickup 2WD
133 123 118
Ford:Ranger Pickup 4WD Toyota:Camry Dodge:D100/D150 Pickup 2WD
113 113 112
Dodge:Dakota Pickup 2WD Toyota:Corolla Chevrolet:C1500 Pickup 2WD
109 108 106
Ford:F250 Pickup 2WD Nissan:Sentra
106 106
===== Creating the data frame of selected variables =====
In the values of variable 27 eng_dscr (stored as engDscr in the new data frame) we collapse each run of whitespace into a single space.
> mmy <- paste(a[[47]],":",a[[48]],":",a[[61]],sep="")
> head(mmy)
[1] "Alfa Romeo:Spider Veloce 2000:1985" "Ferrari:Testarossa:1985"
[3] "Dodge:Charger:1985" "Dodge:B150/B250 Wagon 2WD:1985"
[5] "Subaru:Legacy AWD Turbo:1993" "Subaru:Loyale:1993"
> mm <- paste(a[[47]],":",a[[48]],sep="")
> head(mm)
[1] "Alfa Romeo:Spider Veloce 2000" "Ferrari:Testarossa"
[3] "Dodge:Charger" "Dodge:B150/B250 Wagon 2WD"
[5] "Subaru:Legacy AWD Turbo" "Subaru:Loyale"
> # Collapse every run of whitespace in the engine description to one space.
> ed <- gsub('[[:space:]]+', ' ',a[["eng_dscr"]])
> # Assemble the selected variables; columns of a are picked by name, and
> # mm / mmy are the make:model and make:model:year labels built above.
> b <- data.frame(
+ id=a[["id"]],name1=mm,name2=mmy,make=a[["make"]],year=a[["year"]],
+ barrels08=a[["barrels08"]],city08=a[["city08"]],
+ co2TailpipeGpm=a[["co2TailpipeGpm"]],comb08=a[["comb08"]],
+ cylinders=a[["cylinders"]],displ=a[["displ"]],drive=a[["drive"]],
+ engId=a[["engId"]],engDscr=ed,
+ fuelCost08=a[["fuelCost08"]],fuelType=a[["fuelType"]],fuelType1=a[["fuelType1"]],
+ highway08=a[["highway08"]],mpgData=a[["mpgData"]],trany=a[["trany"]],
+ UCity=a[["UCity"]],UHighway=a[["UHighway"]],VClass=a[["VClass"]],
+ youSaveSpend=a[["youSaveSpend"]])
> head(b)
id name1 name2 make year
1 1 Alfa Romeo:Spider Veloce 2000 Alfa Romeo:Spider Veloce 2000:1985 Alfa Romeo 1985
2 10 Ferrari:Testarossa Ferrari:Testarossa:1985 Ferrari 1985
3 100 Dodge:Charger Dodge:Charger:1985 Dodge 1985
4 1000 Dodge:B150/B250 Wagon 2WD Dodge:B150/B250 Wagon 2WD:1985 Dodge 1985
5 10000 Subaru:Legacy AWD Turbo Subaru:Legacy AWD Turbo:1993 Subaru 1993
6 10001 Subaru:Loyale Subaru:Loyale:1993 Subaru 1993
barrels08 city08 co2TailpipeGpm comb08 cylinders displ drive engId
1 15.68944 19 423.1905 21 4 2.0 Rear-Wheel Drive 9011
2 29.95056 9 807.9091 11 12 4.9 Rear-Wheel Drive 22020
3 12.19557 23 329.1481 27 4 2.2 Front-Wheel Drive 2100
4 29.95056 10 807.9091 11 8 5.2 Rear-Wheel Drive 2850
5 17.33749 17 467.7368 19 4 2.2 4-Wheel or All-Wheel Drive 66031
6 14.96429 21 403.9545 22 4 1.8 Front-Wheel Drive 66020
engDscr fuelCost08 fuelType fuelType1 highway08 mpgData trany UCity
1 (FFS) 2500 Regular Regular Gasoline 25 Y Manual 5-spd 23.3333
2 (GUZZLER) 4750 Regular Regular Gasoline 14 N Manual 5-spd 11.0000
3 (FFS) 1950 Regular Regular Gasoline 33 Y Manual 5-spd 29.0000
4 4750 Regular Regular Gasoline 12 N Automatic 3-spd 12.2222
5 (FFS,TRBO) 3000 Premium Premium Gasoline 23 N Manual 5-spd 21.0000
6 (FFS) 2400 Regular Regular Gasoline 24 N Automatic 3-spd 27.0000
UHighway VClass youSaveSpend
1 35.0000 Two Seaters -1000
2 19.0000 Two Seaters -12250
3 47.0000 Subcompact Cars 1750
4 16.6667 Vans -12250
5 32.0000 Compact Cars -3500
6 33.0000 Compact Cars -500
> write.csv(b,file="fuelData.csv")