====== Fuel ====== ===== Reading the data ===== I downloaded (November 2012) the [[http://www.fueleconomy.gov/feg/epadata/vehicles.csv.zip|data]] from the [[http://www.fueleconomy.gov/feg/download.shtml|fuelEconomy]] page. Unzipping the archive yields the file ''vehicles.csv''. > setwd("C:/Users/Batagelj/data/fuelEconomy") > a <- read.csv("./vehicles.csv",header=TRUE,sep=",",dec=".",quote='"', + na.strings="",stringsAsFactors=FALSE) > dim(a) [1] 33057 71 > names(a) [1] "barrels08" "barrelsA08" "charge120" "charge240" "city08" [6] "city08U" "cityA08" "cityA08U" "cityCD" "cityE" [11] "cityUF" "co2" "co2A" "co2TailpipeAGpm" "co2TailpipeGpm" [16] "comb08" "comb08U" "combA08" "combA08U" "combE" [21] "combinedCD" "combinedUF" "cylinders" "displ" "drive" [26] "engId" "eng_dscr" "feScore" "fuelCost08" "fuelCostA08" [31] "fuelType" "fuelType1" "ghgScore" "ghgScoreA" "highway08" [36] "highway08U" "highwayA08" "highwayA08U" "highwayCD" "highwayE" [41] "highwayUF" "hlv" "hpv" "id" "lv2" [46] "lv4" "make" "model" "mpgData" "phevBlended" [51] "pv2" "pv4" "rangeCityA" "rangeHwyA" "trany" [56] "UCity" "UCityA" "UHighway" "UHighwayA" "VClass" [61] "year" "youSaveSpend" "guzzler" "trans_dscr" "tCharger" [66] "sCharger" "atvType" "fuelType2" "rangeA" "evMotor" [71] "mfrCode" The data set describes 33057 car types with 71 variables. To get a first impression of the data we list the first six descriptions. 
> head(a) barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 cityA08U cityCD cityE cityUF 1 15.68944 0 0 0 19 0 0 0 0 0 0 2 29.95056 0 0 0 9 0 0 0 0 0 0 3 12.19557 0 0 0 23 0 0 0 0 0 0 4 29.95056 0 0 0 10 0 0 0 0 0 0 5 17.33749 0 0 0 17 0 0 0 0 0 0 6 14.96429 0 0 0 21 0 0 0 0 0 0 co2 co2A co2TailpipeAGpm co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD 1 -1 -1 0 423.1905 21 0 0 0 0 0 2 -1 -1 0 807.9091 11 0 0 0 0 0 3 -1 -1 0 329.1481 27 0 0 0 0 0 4 -1 -1 0 807.9091 11 0 0 0 0 0 5 -1 -1 0 467.7368 19 0 0 0 0 0 6 -1 -1 0 403.9545 22 0 0 0 0 0 combinedUF cylinders displ drive engId eng_dscr feScore fuelCost08 1 0 4 2.0 Rear-Wheel Drive 9011 (FFS) -1 2500 2 0 12 4.9 Rear-Wheel Drive 22020 (GUZZLER) -1 4750 3 0 4 2.2 Front-Wheel Drive 2100 (FFS) -1 1950 4 0 8 5.2 Rear-Wheel Drive 2850 -1 4750 5 0 4 2.2 4-Wheel or All-Wheel Drive 66031 (FFS,TRBO) -1 3000 6 0 4 1.8 Front-Wheel Drive 66020 (FFS) -1 2400 fuelCostA08 fuelType fuelType1 ghgScore ghgScoreA highway08 highway08U highwayA08 1 0 Regular Regular Gasoline -1 -1 25 0 0 2 0 Regular Regular Gasoline -1 -1 14 0 0 3 0 Regular Regular Gasoline -1 -1 33 0 0 4 0 Regular Regular Gasoline -1 -1 12 0 0 5 0 Premium Premium Gasoline -1 -1 23 0 0 6 0 Regular Regular Gasoline -1 -1 24 0 0 highwayA08U highwayCD highwayE highwayUF hlv hpv id lv2 lv4 make 1 0 0 0 0 0 0 1 0 0 Alfa Romeo 2 0 0 0 0 0 0 10 0 0 Ferrari 3 0 0 0 0 19 77 100 0 0 Dodge 4 0 0 0 0 0 0 1000 0 0 Dodge 5 0 0 0 0 0 0 10000 0 14 Subaru 6 0 0 0 0 0 0 10001 0 15 Subaru model mpgData phevBlended pv2 pv4 rangeCityA rangeHwyA trany UCity 1 Spider Veloce 2000 Y false 0 0 0 0 Manual 5-spd 23.3333 2 Testarossa N false 0 0 0 0 Manual 5-spd 11.0000 3 Charger Y false 0 0 0 0 Manual 5-spd 29.0000 4 B150/B250 Wagon 2WD N false 0 0 0 0 Automatic 3-spd 12.2222 5 Legacy AWD Turbo N false 0 90 0 0 Manual 5-spd 21.0000 6 Loyale N false 0 88 0 0 Automatic 3-spd 27.0000 UCityA UHighway UHighwayA VClass year youSaveSpend guzzler trans_dscr tCharger 1 0 
35.0000 0 Two Seaters 1985 -1000 NA 2 0 19.0000 0 Two Seaters 1985 -12250 T NA 3 0 47.0000 0 Subcompact Cars 1985 1750 SIL NA 4 0 16.6667 0 Vans 1985 -12250 NA 5 0 32.0000 0 Compact Cars 1993 -3500 TRUE 6 0 33.0000 0 Compact Cars 1993 -500 NA sCharger atvType fuelType2 rangeA evMotor mfrCode 1 2 3 4 5 6 ===== Selection of variables ===== We first list the basic info about all variables: > Na <- names(a) > for(i in 1:length(t)) { + v <- Na[i]; V <- a[[v]]; tv <- typeof(V) + cat('\n',i,v,':',tv,'\n') + print(summary(V)) + cat(' NA =',sum(is.na(V))) + if((tv=="double")||(tv=="integer")) cat(' Nonzero =',sum(V!=0),'\n') + if(tv=="character") { F <- factor(V); L <- length(levels(F)) + cat(" Levels =",L,'\n'); if(L<50) print(levels(F)) } + } 1 barrels08 : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.05989 14.96000 17.34000 17.90000 20.59000 47.07000 NA = 0 Nonzero = 33057 2 barrelsA08 : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 0.0000 0.1823 0.0000 8.3200 NA = 0 Nonzero = 1000 3 charge120 : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0 0 0 0 0 0 NA = 0 Nonzero = 0 4 charge240 : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000000 0.000000 0.000000 0.000711 0.000000 6.000000 NA = 0 Nonzero = 7 5 city08 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 6.0 15.0 17.0 17.4 20.0 138.0 NA = 0 Nonzero = 33057 6 city08U : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000 0.000 0.000 1.433 0.000 138.300 NA = 0 Nonzero = 2415 7 cityA08 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 0.0000 0.3484 0.0000 108.0000 NA = 0 Nonzero = 1000 8 cityA08U : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 0.0000 0.1245 0.0000 108.0000 NA = 0 Nonzero = 327 9 cityCD : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00e+00 0.00e+00 0.00e+00 2.12e-05 0.00e+00 3.50e-01 NA = 0 Nonzero = 2 10 cityE : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00000 0.00000 0.00000 0.06942 0.00000 122.00000 NA = 0 Nonzero = 48 11 cityUF : double Min. 1st Qu. 
Median Mean 3rd Qu. Max. 0.0000000 0.0000000 0.0000000 0.0001124 0.0000000 0.6800000 NA = 0 Nonzero = 7 12 co2 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. -1.00 -1.00 -1.00 12.72 -1.00 847.00 NA = 0 Nonzero = 33016 13 co2A : integer Min. 1st Qu. Median Mean 3rd Qu. Max. -1.000 -1.000 -1.000 1.322 -1.000 719.000 NA = 0 Nonzero = 33057 14 co2TailpipeAGpm : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00 0.00 0.00 15.54 0.00 719.00 NA = 0 Nonzero = 993 15 co2TailpipeGpm : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0 404.0 467.7 483.4 555.4 1270.0 NA = 0 Nonzero = 33016 16 comb08 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 7.0 16.0 19.0 19.6 22.0 121.0 NA = 0 Nonzero = 33057 17 comb08U : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000 0.000 0.000 1.623 0.000 120.900 NA = 0 Nonzero = 2415 18 combA08 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 0.0000 0.3991 0.0000 100.0000 NA = 0 Nonzero = 1000 19 combA08U : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 0.0000 0.1417 0.0000 100.0000 NA = 0 Nonzero = 327 20 combE : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00000 0.00000 0.00000 0.07321 0.00000 121.00000 NA = 0 Nonzero = 48 21 combinedCD : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000e+00 0.000e+00 0.000e+00 1.154e-05 0.000e+00 1.907e-01 NA = 0 Nonzero = 2 22 combinedUF : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000000 0.0000000 0.0000000 0.0001094 0.0000000 0.6600000 NA = 0 Nonzero = 7 23 cylinders : character Length Class Mode 33057 character character NA = 9 Levels = 11 [1] "-" "10" "12" "16" "2" "3" "4" "5" "6" "8" "NA" 24 displ : character Length Class Mode 33057 character character NA = 9 Levels = 68 25 drive : character Length Class Mode 33057 character character NA = 2067 Levels = 7 [1] "2-Wheel Drive" "4-Wheel Drive" "4-Wheel or All-Wheel Drive" [4] "All-Wheel Drive" "Front-Wheel Drive" "Part-time 4-Wheel Drive" [7] "Rear-Wheel Drive" 26 engId : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 
0 0 522 10110 4845 69100 NA = 0 Nonzero = 20459 27 eng_dscr : character Length Class Mode 33057 character character NA = 13717 Levels = 516 28 feScore : integer Min. 1st Qu. Median Mean 3rd Qu. Max. -1.0000 -1.0000 -1.0000 -0.7957 -1.0000 10.0000 NA = 0 Nonzero = 33057 29 fuelCost08 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 500 2400 2850 2914 3250 8150 NA = 0 Nonzero = 33057 30 fuelCostA08 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0 0.0 0.0 118.9 0.0 5400.0 NA = 0 Nonzero = 997 31 fuelType : character Length Class Mode 33057 character character NA = 0 Levels = 12 [1] "CNG" "Diesel" "Electricity" [4] "Gasoline or E85" "Gasoline or natural gas" "Gasoline or propane" [7] "Midgrade" "Premium" "Premium Gas or Electricity" [10] "Premium or E85" "Regular" "Regular Gas and Electricity" 32 fuelType1 : character Length Class Mode 33057 character character NA = 0 Levels = 6 [1] "Diesel" "Electricity" "Midgrade Gasoline" "Natural Gas" [5] "Premium Gasoline" "Regular Gasoline" 33 ghgScore : integer Min. 1st Qu. Median Mean 3rd Qu. Max. -1.0000 -1.0000 -1.0000 -0.7963 -1.0000 10.0000 NA = 0 Nonzero = 33057 34 ghgScoreA : integer Min. 1st Qu. Median Mean 3rd Qu. Max. -1.0000 -1.0000 -1.0000 -0.9775 -1.0000 8.0000 NA = 0 Nonzero = 33055 35 highway08 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 9.0 19.0 23.0 23.4 27.0 105.0 NA = 0 Nonzero = 33057 36 highway08U : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000 0.000 0.000 1.954 0.000 104.800 NA = 0 Nonzero = 2415 37 highwayA08 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 0.0000 0.4846 0.0000 102.0000 NA = 0 Nonzero = 1000 38 highwayA08U : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 0.0000 0.1725 0.0000 102.5000 NA = 0 Nonzero = 327 39 highwayCD : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0 0 0 0 0 0 NA = 0 Nonzero = 0 40 highwayE : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00000 0.00000 0.00000 0.07794 0.00000 120.00000 NA = 0 Nonzero = 48 41 highwayUF : double Min. 1st Qu. 
Median Mean 3rd Qu. Max. 0.000000 0.000000 0.000000 0.000105 0.000000 0.650000 NA = 0 Nonzero = 7 42 hlv : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000 0.000 0.000 2.076 0.000 49.000 NA = 0 Nonzero = 4143 43 hpv : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00 0.00 0.00 10.66 0.00 195.00 NA = 0 Nonzero = 4142 44 id : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 1 8265 16530 16590 24970 33340 NA = 0 Nonzero = 33057 45 lv2 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000 0.000 0.000 1.901 0.000 41.000 NA = 0 Nonzero = 5509 46 lv4 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000 0.000 0.000 6.242 13.000 55.000 NA = 0 Nonzero = 11637 47 make : character Length Class Mode 33057 character character NA = 0 Levels = 128 48 model : character Length Class Mode 33057 character character NA = 0 Levels = 3020 49 mpgData : character Length Class Mode 33057 character character NA = 0 Levels = 2 [1] "N" "Y" 50 phevBlended : character Length Class Mode 33057 character character NA = 0 Levels = 2 [1] "false" "true" 51 pv2 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00 0.00 0.00 13.83 0.00 194.00 NA = 0 Nonzero = 5498 52 pv4 : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00 0.00 0.00 33.43 90.00 192.00 NA = 0 Nonzero = 11637 53 rangeCityA : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00000 0.00000 0.00000 0.00581 0.00000 39.92000 NA = 0 Nonzero = 7 54 rangeHwyA : double Min. 1st Qu. Median Mean 3rd Qu. Max. 
0.00000 0.00000 0.00000 0.00536 0.00000 36.30000 NA = 0 Nonzero = 7 55 trany : character Length Class Mode 33057 character character NA = 14 Levels = 42 [1] "Auto (AV-S6)" "Auto (AV-S8)" [3] "Auto (AV)" "Auto(A1)" [5] "Auto(A8)" "Auto(AM-S6)" [7] "Auto(AM-S7)" "Auto(AM5)" [9] "Auto(AM6)" "Auto(AM7)" [11] "Auto(AV-S6)" "Auto(AV-S7)" [13] "Auto(AV-S8)" "Auto(L3)" [15] "Auto(L4)" "Automatic (A1)" [17] "Automatic (A6)" "Automatic (AM-S6)" [19] "Automatic (AM-S7)" "Automatic (AM5)" [21] "Automatic (AM6)" "Automatic (AV-S6)" [23] "Automatic (AV)" "Automatic (S4)" [25] "Automatic (S5)" "Automatic (S6)" [27] "Automatic (S7)" "Automatic (S8)" [29] "Automatic (variable gear ratios)" "Automatic 3-spd" [31] "Automatic 4-spd" "Automatic 5-spd" [33] "Automatic 6-spd" "Automatic 7-spd" [35] "Automatic 8-spd" "Manual 3-spd" [37] "Manual 4-spd" "Manual 4-spd Doubled" [39] "Manual 5-spd" "Manual 5 spd" [41] "Manual 6-spd" "Manual 7-spd" 56 UCity : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00 18.00 21.00 21.83 24.56 197.60 NA = 0 Nonzero = 33032 57 UCityA : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 0.0000 0.4161 0.0000 154.3000 NA = 0 Nonzero = 954 58 UHighway : double Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00 26.88 32.00 32.58 37.10 149.70 NA = 0 Nonzero = 33032 59 UHighwayA : double Min. 1st Qu. Median Mean 3rd Qu. Max. 
0.0000 0.0000 0.0000 0.6402 0.0000 146.4000 NA = 0 Nonzero = 954 60 VClass : character Length Class Mode 33057 character character NA = 0 Levels = 30 [1] "Compact Cars" "Large Cars" [3] "Midsize-Large Station Wagons" "Midsize Cars" [5] "Midsize Station Wagons" "Minicompact Cars" [7] "Minivan - 2WD" "Minivan - 4WD" [9] "Small Pickup Trucks" "Small Pickup Trucks 2WD" [11] "Small Pickup Trucks 4WD" "Small Station Wagons" [13] "Special Purpose Vehicle" "Special Purpose Vehicle 2WD" [15] "Special Purpose Vehicle 4WD" "Special Purpose Vehicles" [17] "Special Purpose Vehicles/2wd" "Special Purpose Vehicles/4wd" [19] "Sport Utility Vehicle - 2WD" "Sport Utility Vehicle - 4WD" [21] "Standard Pickup Trucks" "Standard Pickup Trucks 2WD" [23] "Standard Pickup Trucks 4WD" "Standard Pickup Trucks/2wd" [25] "Subcompact Cars" "Two Seaters" [27] "Vans" "Vans Passenger" [29] "Vans, Cargo Type" "Vans, Passenger Type" 61 year : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 1984 1989 1998 1998 2006 2013 NA = 0 Nonzero = 33057 62 youSaveSpend : integer Min. 1st Qu. Median Mean 3rd Qu. Max. 
-29250 -4750 -2750 -3072 -500 9000 NA = 0 Nonzero = 31588 63 guzzler : character Length Class Mode 33057 character character NA = 31103 Levels = 3 [1] "G" "S" "T" 64 trans_dscr : character Length Class Mode 33057 character character NA = 18011 Levels = 52 65 tCharger : logical Mode TRUE NA's logical 3385 29672 NA = 29672 66 sCharger : character Length Class Mode 33057 character character NA = 32623 Levels = 1 [1] "S" 67 atvType : character Length Class Mode 33057 character character NA = 30397 Levels = 9 [1] "avail" "Bifuel (CNG)" "Bifuel (LPG)" "CNG" "Diesel" [6] "EV" "FFV" "Hybrid" "Plug-in Hybrid" 68 fuelType2 : character Length Class Mode 33057 character character NA = 32057 Levels = 4 [1] "E85" "Electricity" "Natural Gas" "Propane" 69 rangeA : character Length Class Mode 33057 character character NA = 32062 Levels = 89 70 evMotor : character Length Class Mode 33057 character character NA = 32803 Levels = 46 [1] "100 kW DCPM" "101V Ni-MH" "102kW AC Induction" [4] "107 kW AC Induction" "111 kW" "115 kW AC Induction" [7] "115V Li-Ion" "125 kW AC Induction" "126V Li-Ion" [10] "144V Li-Ion" "144V Ni-MH" "150 kW" [13] "158V Ni-MH" "18 kW" "2 @ 150 kw (300 kw)" [16] "202V Ni-MH" "24 KW AC Synchronous" "245V Ni-MH" [19] "266V Li-Ion" "27 KW AC Induction" "270V Li-Ion" [22] "275V Ni-MH" "288V Ni-MH" "30 kW DCPM" [25] "300V Ni-MH" "312V Ni-MH" "330V Ni-MH" [28] "346V Li-Ion" "36V Ni-MH" "374V Li-Ion" [31] "49 kW DC Brushless" "49kW DC Brushless" "50 KW DC" [34] "52 kW AC Induction" "55 kW DCPM" "56kW AC Induction" [37] "62 KW AC Induction" "66 kW DCPM" "67 KW AC" [40] "67 KW AC Induction" "67 KW AC Induction" "68 kW" [43] "80 kW DCPM" "83 kWh" "85 kW AC Induction" [46] "92 kW DC Brushless" 71 mfrCode : character Length Class Mode 33057 character character NA = 30820 Levels = 37 [1] "ADX" "ASX" "AZD" "BEX" "BGT" "BMX" "CDA" "CRX" "DSX" "FEX" "FJX" "FMX" "FSK" "GMX" "HNX" [16] "HYX" "JCX" "KMX" "LRX" "LTX" "MAX" "MBX" "MLN" "MTX" "NLX" "NSX" "PRX" "RII" "RRG" "SAX" [31] 
"SKX" "TKX" "TSL" "TVP" "TYX" "VVX" "VWX" > Simona's proposal: 1 barrels08 annual petroleum consumption in barrels for fuelType1 letna poraba goriva 5 city08 city MPG for fuelType1 mestna poraba 15 co2TailpipeGpm tailpipe CO2 in grams/mile for fuelType1 izpuh CO2 16 comb08 combined MPG for fuelType1 skupna poraba MPG (1985-2007 ocenjen) 23 cylinders engine cylinders število valjev 24 displ engine displacement in liters prostornina motorja 25 drive drive axle type pogon 26 engId EPA model type index indeks izračunan po EPA 32 fuelType1 For single fuel vehicles, this will be the only fuel. (običajen) tip goriva For dual fuel vehicles, this will be the conventional 35 highway08 highway MPG for fuelType1 poraba na avtocesti 47 make manufacturer (division) proizvajalec 48 model model name (carline) model avta 55 trany transmission menjalnik 60 VClass EPA vehicle size class oznaka EPA velikosti vozila EPA = Environmental Protection Agency; MPG = mile/gallon The variable 44 id contains a unique id of the car type. May be also the variable 27 can be converted to something useful. > V <- a[[27]] > t <- table(V) > length(t) [1] 516 > sort(t,decreasing=TRUE)[1:20] V (FFS) SIDI (FFS) CA model 8828 1020 926 (FFS) (MPFI) (FFS,TRBO) FFV 734 666 454 (350 V8) (FFS) (GUZZLER) (FFS) SOHC 411 366 354 (NO-CAT) FLEX-FUEL GUZZLER 238 198 195 (FFS) (SPFI) (GUZZLER) (FFS) (MPFI) (350 V8) 194 122 120 CA model (350 V8) (FFS) (MPFI) (GM-CHEV) 113 106 102 DOHC (FFS) (DIESEL) 96 95 > U <- factor(V) > levels(U) For example - replacing by logical variables FFS, MPFI, GUZZLER, TRBO, SOHC, DOHC, DIESEL, ... > V <- a[[29]] > plot(sort(V)) > table(a[[49]]) N Y 22368 10689 > table(a[[50]]) false true 33054 3 > Variables 29 fuelCost08, 31 fuelType, 49 mpgData, 56 UCity, 58 UHighway, 61 year, 62 youSaveSpend also seem OK. Instead of variable 48 model the variable 47 make seems better??? As we see some variables have only few "nonzero" values. May be some of them can be combined into a single variable??? 
Can the variables 51 pv2 and 52 pv4 be merged? What about variables 45 lv2, 46 lv4, 42 hlv, 43 hpv ??? hlv - hatchback luggage volume (cubic feet) hpv - hatchback passenger volume (cubic feet) lv2 - 2 door luggage volume (cubic feet) lv4 - 4 door luggage volume (cubic feet) pv2 - 2-door passenger volume (cubic feet) pv4 - 4-door passenger volume (cubic feet) Transform into 3 variables: luggage volume, passenger volume, type (2 door, 4 door, hatchback) ??? > x <- a[[42]] > y <- a[[45]] > z <- a[[46]] > w <- x+y+z > sum(w>0) [1] 17741 > t <- (x>0)+(y>0)*2+(z>0)*4 > table(t) t 0 1 2 3 4 5 6 7 15316 2596 3204 304 8684 952 1710 291 > w <- x+pmax(y,z) > plot(sort(w)) The variables 47 make and 48 model (and if necessary, 61 year or 60 VClass) can be combined into a unique car name ??? ===== Names ===== How to construct a unique car type label. I tried the combinations suggested on [[http://en.wikipedia.org/wiki/Car_model|Wikipedia]]: * make:model * make:model-year > b <- paste(a[[47]],":",a[[48]],sep="") > head(b) [1] "Alfa Romeo:Spider Veloce 2000" "Ferrari:Testarossa" [3] "Dodge:Charger" "Dodge:B150/B250 Wagon 2WD" [5] "Subaru:Legacy AWD Turbo" "Subaru:Loyale" > d <- duplicated(b) > sum(d) [1] 29975 > b <- paste(a[[47]],":",a[[48]],"-",a[[61]],sep="") > d <- duplicated(b) > sum(d) [1] 19162 > head(b) [1] "Alfa Romeo:Spider Veloce 2000-1985" "Ferrari:Testarossa-1985" [3] "Dodge:Charger-1985" "Dodge:B150/B250 Wagon 2WD-1985" [5] "Subaru:Legacy AWD Turbo-1993" "Subaru:Loyale-1993" > id <- a[[44]] > d <- duplicated(id) > sum(d) [1] 0 > b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],sep="") > head(b) [1] "Alfa Romeo:Spider Veloce 2000-1985/Manual 5-spd" [2] "Ferrari:Testarossa-1985/Manual 5-spd" [3] "Dodge:Charger-1985/Manual 5-spd" [4] "Dodge:B150/B250 Wagon 2WD-1985/Automatic 3-spd" [5] "Subaru:Legacy AWD Turbo-1993/Manual 5-spd" [6] "Subaru:Loyale-1993/Automatic 3-spd" > d <- duplicated(b) > sum(d) [1] 11012 > b <- 
paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],"/",a[[49]],sep="") > sum(duplicated(b)) [1] 8729 > b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],"/",a[[31]],sep="") > sum(duplicated(b)) [1] 8936 > b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],"/",a[[25]],sep="") > sum(duplicated(b)) [1] 10799 > b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],"/",a[[23]],sep="") > sum(duplicated(b)) [1] 7587 > b <- paste(a[[47]],":",a[[48]],"-",a[[61]],"/",a[[55]],"/",a[[27]],sep="") > sum(duplicated(b)) [1] 6172 > head(b) [1] "Alfa Romeo:Spider Veloce 2000-1985/Manual 5-spd/(FFS)" [2] "Ferrari:Testarossa-1985/Manual 5-spd/(GUZZLER)" [3] "Dodge:Charger-1985/Manual 5-spd/(FFS)" [4] "Dodge:B150/B250 Wagon 2WD-1985/Automatic 3-spd/NA" [5] "Subaru:Legacy AWD Turbo-1993/Manual 5-spd/(FFS,TRBO)" [6] "Subaru:Loyale-1993/Automatic 3-spd/(FFS)" > For the labels make+model+year we get the following maximal numbers of duplicates > b <- paste(a[[47]],":",a[[48]],"-",a[[61]],sep="") > t <- table(b) > length(t) [1] 13895 > sort(t,decreasing=TRUE)[1:20] b Jeep:Cherokee/Wagoneer-1985 Chevrolet:C10 Pickup 2WD-1984 24 19 Ford:F150 Pickup 2WD-1984 GMC:C15 Pickup 2WD-1984 19 19 Chevrolet:C10 Pickup 2WD-1985 Chevrolet:S10 Pickup 2WD-1984 18 18 GMC:C15 Pickup 2WD-1985 GMC:S15 Pickup 2WD-1984 18 18 Chevrolet:C10 Pickup 2WD-1986 Dodge:Ram 50 Pickup 2WD-1984 17 17 Ford:Ranger Pickup 2WD-1984 GMC:C15 Pickup 2WD-1986 17 17 Mitsubishi:Truck 2WD-1984 Chevrolet:G10/20 Van 2WD-1984 17 16 Ford:Escort-1984 Ford:Escort-1985 16 16 Ford:F150 Pickup 2WD-1985 GMC:Vandura G15/25 2WD-1984 16 16 Mercury:Lynx-1984 Volkswagen:Rabbit-1984 16 16 and for the labels make+model > b <- paste(a[[47]],":",a[[48]],sep="") > t <- table(b) > length(t) [1] 3082 > sort(t,decreasing=TRUE)[1:20] b Ford:F150 Pickup 2WD Ford:F150 Pickup 4WD Ford:Ranger Pickup 2WD 197 175 169 Ford:Mustang Volkswagen:Jetta GMC:Sierra 1500 2WD 160 157 149 GMC:Sierra 1500 4WD Honda:Civic Chevrolet:Camaro 149 141 136 Mitsubishi:Eclipse 
Honda:Accord Chevrolet:S10 Pickup 2WD 133 123 118 Ford:Ranger Pickup 4WD Toyota:Camry Dodge:D100/D150 Pickup 2WD 113 113 112 Dodge:Dakota Pickup 2WD Toyota:Corolla Chevrolet:C1500 Pickup 2WD 109 108 106 Ford:F250 Pickup 2WD Nissan:Sentra 106 106 ===== Creating the data frame of selected variables ===== In the values of variable 27 eng_dscr (stored as engDscr in the new data frame) we collapse each run of whitespace into a single space. > mmy <- paste(a[[47]],":",a[[48]],":",a[[61]],sep="") > head(mmy) [1] "Alfa Romeo:Spider Veloce 2000:1985" "Ferrari:Testarossa:1985" [3] "Dodge:Charger:1985" "Dodge:B150/B250 Wagon 2WD:1985" [5] "Subaru:Legacy AWD Turbo:1993" "Subaru:Loyale:1993" > mm <- paste(a[[47]],":",a[[48]],sep="") > head(mm) [1] "Alfa Romeo:Spider Veloce 2000" "Ferrari:Testarossa" [3] "Dodge:Charger" "Dodge:B150/B250 Wagon 2WD" [5] "Subaru:Legacy AWD Turbo" "Subaru:Loyale" > ed <- gsub('[[:space:]]+', ' ',a[[27]]) > b <- data.frame(id=a[[44]],name1=mm,name2=mmy,make=a[[47]],year=a[[61]],barrels08=a[[1]], + city08=a[[5]],co2TailpipeGpm=a[[15]],comb08=a[[16]],cylinders=a[[23]],displ=a[[24]], + drive=a[[25]],engId=a[[26]],engDscr=ed,fuelCost08=a[[29]],fuelType=a[[31]], + fuelType1=a[[32]],highway08=a[[35]],mpgData=a[[49]],trany=a[[55]],UCity=a[[56]], + UHighway=a[[58]],VClass=a[[60]],youSaveSpend=a[[62]]) > head(b) id name1 name2 make year 1 1 Alfa Romeo:Spider Veloce 2000 Alfa Romeo:Spider Veloce 2000:1985 Alfa Romeo 1985 2 10 Ferrari:Testarossa Ferrari:Testarossa:1985 Ferrari 1985 3 100 Dodge:Charger Dodge:Charger:1985 Dodge 1985 4 1000 Dodge:B150/B250 Wagon 2WD Dodge:B150/B250 Wagon 2WD:1985 Dodge 1985 5 10000 Subaru:Legacy AWD Turbo Subaru:Legacy AWD Turbo:1993 Subaru 1993 6 10001 Subaru:Loyale Subaru:Loyale:1993 Subaru 1993 barrels08 city08 co2TailpipeGpm comb08 cylinders displ drive engId 1 15.68944 19 423.1905 21 4 2.0 Rear-Wheel Drive 9011 2 29.95056 9 807.9091 11 12 4.9 Rear-Wheel Drive 22020 3 12.19557 23 329.1481 27 4 2.2 Front-Wheel Drive 2100 4 29.95056 10 807.9091 11 8 5.2 Rear-Wheel Drive 2850 5 17.33749 17 467.7368 
19 4 2.2 4-Wheel or All-Wheel Drive 66031 6 14.96429 21 403.9545 22 4 1.8 Front-Wheel Drive 66020 engDscr fuelCost08 fuelType fuelType1 highway08 mpgData trany UCity 1 (FFS) 2500 Regular Regular Gasoline 25 Y Manual 5-spd 23.3333 2 (GUZZLER) 4750 Regular Regular Gasoline 14 N Manual 5-spd 11.0000 3 (FFS) 1950 Regular Regular Gasoline 33 Y Manual 5-spd 29.0000 4 4750 Regular Regular Gasoline 12 N Automatic 3-spd 12.2222 5 (FFS,TRBO) 3000 Premium Premium Gasoline 23 N Manual 5-spd 21.0000 6 (FFS) 2400 Regular Regular Gasoline 24 N Automatic 3-spd 27.0000 UHighway VClass youSaveSpend 1 35.0000 Two Seaters -1000 2 19.0000 Two Seaters -12250 3 47.0000 Subcompact Cars 1750 4 16.6667 Vans -12250 5 32.0000 Compact Cars -3500 6 33.0000 Compact Cars -500 > write.csv(b,file="fuelData.csv")