Exploring eBay Car Sales Data¶

In this project we'll work with a dataset of used cars from eBay Kleinanzeigen, a classifieds section of the German eBay website. The aim of this project is to clean the data and analyse the included used car listings.

We will start to use pandas and become familiar with some of the unique benefits Jupyter Notebook provides.

We'll start by importing the libraries we need and reading the dataset into pandas.

In [1]:
import pandas as pd
import numpy as np
import re

autos = pd.read_csv('autos.csv', encoding = 'Latin-1')
autos.head()
Out[1]:
dateCrawled name seller offerType price abtest vehicleType yearOfRegistration gearbox powerPS model kilometer monthOfRegistration fuelType brand notRepairedDamage dateCreated nrOfPictures postalCode lastSeen
0 2016-03-24 11:52:17 Golf_3_1.6 privat Angebot 480 test NaN 1993 manuell 0 golf 150000 0 benzin volkswagen NaN 2016-03-24 00:00:00 0 70435 2016-04-07 03:16:57
1 2016-03-24 10:58:45 A5_Sportback_2.7_Tdi privat Angebot 18300 test coupe 2011 manuell 190 NaN 125000 5 diesel audi ja 2016-03-24 00:00:00 0 66954 2016-04-07 01:46:50
2 2016-03-14 12:52:21 Jeep_Grand_Cherokee_"Overland" privat Angebot 9800 test suv 2004 automatik 163 grand 125000 8 diesel jeep NaN 2016-03-14 00:00:00 0 90480 2016-04-05 12:47:46
3 2016-03-17 16:54:04 GOLF_4_1_4__3TÜRER privat Angebot 1500 test kleinwagen 2001 manuell 75 golf 150000 6 benzin volkswagen nein 2016-03-17 00:00:00 0 91074 2016-03-17 17:40:17
4 2016-03-31 17:25:20 Skoda_Fabia_1.4_TDI_PD_Classic privat Angebot 3600 test kleinwagen 2008 manuell 69 fabia 90000 7 diesel skoda nein 2016-03-31 00:00:00 0 60437 2016-04-06 10:17:21
In [2]:
autos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   dateCrawled          371528 non-null  object
 1   name                 371528 non-null  object
 2   seller               371528 non-null  object
 3   offerType            371528 non-null  object
 4   price                371528 non-null  int64 
 5   abtest               371528 non-null  object
 6   vehicleType          333659 non-null  object
 7   yearOfRegistration   371528 non-null  int64 
 8   gearbox              351319 non-null  object
 9   powerPS              371528 non-null  int64 
 10  model                351044 non-null  object
 11  kilometer            371528 non-null  int64 
 12  monthOfRegistration  371528 non-null  int64 
 13  fuelType             338142 non-null  object
 14  brand                371528 non-null  object
 15  notRepairedDamage    299468 non-null  object
 16  dateCreated          371528 non-null  object
 17  nrOfPictures         371528 non-null  int64 
 18  postalCode           371528 non-null  int64 
 19  lastSeen             371528 non-null  object
dtypes: int64(7), object(13)
memory usage: 56.7+ MB

From what we know of the DataFrame so far, we can make the following observations:

  • The dataset contains 20 columns, most of which are strings.
  • The column names use camelcase instead of Python's preferred snakecase, so there are no spaces to replace with underscores; we'll need to insert underscores at the case boundaries instead.
  • Some columns have null values, but none have more than ~20% null values.

Let's convert the column names from camelcase to snakecase and reword some of the column names to be more descriptive.

In [3]:
autos.columns
Out[3]:
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')
In [4]:
def cleaned_cols(col):
    col = col.replace('yearOfRegistration','registration_year')
    col = col.replace('monthOfRegistration','registration_month')
    col = col.replace('notRepairedDamage','unrepaired_damage')
    col = col.replace('dateCreated','ad_created')
    # Change the rest of the column names from camelcase to snakecase;
    # note that consecutive capitals are split too ('powerPS' -> 'power_p_s')
    col = re.sub(r'(?<!^)(?=[A-Z])', '_', col).lower()
    return col

autos.columns = [cleaned_cols(c) for c in autos.columns]

autos.columns
Out[4]:
Index(['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest',
       'vehicle_type', 'registration_year', 'gearbox', 'power_p_s', 'model',
       'kilometer', 'registration_month', 'fuel_type', 'brand',
       'unrepaired_damage', 'ad_created', 'nr_of_pictures', 'postal_code',
       'last_seen'],
      dtype='object')
In [5]:
# Basic data exploration to see what other cleaning tasks need to be done
autos.describe(include = 'all')
Out[5]:
date_crawled name seller offer_type price abtest vehicle_type registration_year gearbox power_p_s model kilometer registration_month fuel_type brand unrepaired_damage ad_created nr_of_pictures postal_code last_seen
count 371528 371528 371528 371528 3.715280e+05 371528 333659 371528.000000 351319 371528.000000 351044 371528.000000 371528.000000 338142 371528 299468 371528 371528.0 371528.00000 371528
unique 280500 233531 2 2 NaN 2 8 NaN 2 NaN 251 NaN NaN 7 40 2 114 NaN NaN 182806
top 2016-03-24 14:49:47 Ford_Fiesta privat Angebot NaN test limousine NaN manuell NaN golf NaN NaN benzin volkswagen nein 2016-04-03 00:00:00 NaN NaN 2016-04-07 06:45:59
freq 7 657 371525 371516 NaN 192585 95894 NaN 274214 NaN 30070 NaN NaN 223857 79640 263182 14450 NaN NaN 17
mean NaN NaN NaN NaN 1.729514e+04 NaN NaN 2004.577997 NaN 115.549477 NaN 125618.688228 5.734445 NaN NaN NaN NaN 0.0 50820.66764 NaN
std NaN NaN NaN NaN 3.587954e+06 NaN NaN 92.866598 NaN 192.139578 NaN 40112.337051 3.712412 NaN NaN NaN NaN 0.0 25799.08247 NaN
min NaN NaN NaN NaN 0.000000e+00 NaN NaN 1000.000000 NaN 0.000000 NaN 5000.000000 0.000000 NaN NaN NaN NaN 0.0 1067.00000 NaN
25% NaN NaN NaN NaN 1.150000e+03 NaN NaN 1999.000000 NaN 70.000000 NaN 125000.000000 3.000000 NaN NaN NaN NaN 0.0 30459.00000 NaN
50% NaN NaN NaN NaN 2.950000e+03 NaN NaN 2003.000000 NaN 105.000000 NaN 150000.000000 6.000000 NaN NaN NaN NaN 0.0 49610.00000 NaN
75% NaN NaN NaN NaN 7.200000e+03 NaN NaN 2008.000000 NaN 150.000000 NaN 150000.000000 9.000000 NaN NaN NaN NaN 0.0 71546.00000 NaN
max NaN NaN NaN NaN 2.147484e+09 NaN NaN 9999.000000 NaN 20000.000000 NaN 150000.000000 12.000000 NaN NaN NaN NaN 0.0 99998.00000 NaN

From the descriptive statistics table above, there are a number of things worth noting:

  • The 'nr_of_pictures' column can be dropped, as none of the rows have any pictures.
  • Some rows in the 'registration_year' column can be removed, as the minimum and maximum values of 1000 and 9999 are highly improbable vehicle registration years.
  • The maximum of the 'power_p_s' column is far above the 75th-percentile value, which suggests some inaccurate data in this column.
  • The minimum of the 'registration_month' column is 0, which is not a valid month, so this also suggests inaccurate rows of data (both columns are quantified in the sketch after this list).
  • The seller and offer_type columns are almost entirely a single value each, so they can be safely removed from the DataFrame.
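
The 'power_p_s' and 'registration_month' issues are worth quantifying before deciding how to handle them. A minimal sketch, not run in the original notebook (the 1,000 PS cutoff is an arbitrary illustrative threshold):

# count listings with 0 PS, implausibly high PS, and the month-0 placeholder
print((autos['power_p_s'] == 0).sum())
print((autos['power_p_s'] > 1000).sum())
print((autos['registration_month'] == 0).sum())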

We can also see from the info() output that the price and kilometer columns are already stored as integers in this version of the dataset, so no text cleaning is needed. Had they been stored as text (for example '$5,000' or '150,000km'), we would have cleaned them by:

  • removing any non-numeric characters, and
  • converting the column to a numeric dtype.

The commented-out cells below show what that conversion would have looked like.
In [6]:
autos['price'].head()
Out[6]:
0      480
1    18300
2     9800
3     1500
4     3600
Name: price, dtype: int64
In [8]:
# The price column is already numeric here; if it were stored as text
# (e.g. '$5,000'), a conversion along these lines would work:
# def convert_price(value):
#     if pd.isnull(value):
#         return None
#     string = value.split(' ')[0]
#     string = string.replace('$', '')
#     string = string.replace(',', '')
#     return int(string)
#
# autos['price'] = autos['price'].apply(convert_price)
autos['price'].head()
Out[8]:
0      480
1    18300
2     9800
3     1500
4     3600
Name: price, dtype: int64
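
For reference, the same hypothetical text cleanup could be done with vectorized string methods instead of apply (a sketch, again assuming text values like '$5,000'):

# autos['price'] = (autos['price'].str.replace('$', '', regex=False)
#                                 .str.replace(',', '', regex=False)
#                                 .astype(int))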
In [ ]:
autos['kilometer'].head()
Out[ ]:
0    150000
1    125000
2    125000
3    150000
4     90000
Name: kilometer, dtype: int64
In [ ]:
# The kilometer column is also already numeric; a text version
# (e.g. '150,000km') could be converted like this:
# def convert_odometer(value):
#     if pd.isnull(value):
#         return None
#     string = value.split(' ')[0]
#     string = string.replace(',', '')
#     string = string.replace('km', '')
#     return int(string)
#
# autos['kilometer'] = autos['kilometer'].apply(convert_odometer)
autos['kilometer'].head()
Out[ ]:
0    150000
1    125000
2    125000
3    150000
4     90000
Name: kilometer, dtype: int64
In [ ]:
# Now we drop the columns that won't be required for any analysis
autos = autos.drop(["nr_of_pictures", "seller", "offer_type"], axis=1)

Let's continue exploring the data, specifically looking for data that doesn't look right. We'll start by analyzing the kilometer and price columns.

In [ ]:
print(autos['kilometer'].unique().shape)
print(autos['kilometer'].describe())
print(autos['kilometer'].value_counts())
(13,)
count    371528.000000
mean     125618.688228
std       40112.337051
min        5000.000000
25%      125000.000000
50%      150000.000000
75%      150000.000000
max      150000.000000
Name: kilometer, dtype: float64
150000    240797
125000     38067
100000     15920
90000      12523
80000      11053
70000       9773
60000       8669
50000       7615
5000        7069
40000       6376
30000       6041
20000       5676
10000       1949
Name: kilometer, dtype: int64

There are 13 unique values in the kilometer column. From this we can see that odometer readings tend to be on the higher side for the cars in this dataset. There don't appear to be any unrealistically high or low outliers in this column.

In [ ]:
print(autos['price'].unique().shape)
print(autos['price'].describe())
print(autos['price'].value_counts().head(15))
(5597,)
count    3.715280e+05
mean     1.729514e+04
std      3.587954e+06
min      0.000000e+00
25%      1.150000e+03
50%      2.950000e+03
75%      7.200000e+03
max      2.147484e+09
Name: price, dtype: float64
0       10778
500      5670
1500     5394
1000     4649
1200     4594
2500     4438
600      3819
3500     3792
800      3784
2000     3432
999      3364
750      3203
650      3150
4500     3053
850      2946
Name: price, dtype: int64

We can see that there are 10,778 listings with a price of 0, which seems unlikely to be genuine. We will also need to examine the highest prices in this column more closely.

In [ ]:
autos['price'].value_counts().sort_index(ascending = False).head(15)
Out[ ]:
2147483647     1
99999999      15
99000000       1
74185296       1
32545461       1
27322222       1
14000500       1
12345678       9
11111111      10
10010011       1
10000000       8
9999999        3
3895000        1
3890000        1
2995000        1
Name: price, dtype: int64
In [ ]:
autos['price'].value_counts().sort_index(ascending = True).head(15)
Out[ ]:
0     10778
1      1189
2        12
3         8
4         1
5        26
7         3
8         9
9         8
10       84
11        5
12        8
13        7
14        5
15       27
Name: price, dtype: int64

When looking at the high price range for the column, there is a significant jump from $350,000 up to $999,990 compared with the steady increase in prices below that point (visible further down the sorted list), so it is reasonably safe to remove any listings priced above $350,000.

We can also see a number of very low prices. It is reasonable to keep the $1 listings in our DataFrame: eBay is an auction site, and bidding can start at $1. The $0 listings, however, will be removed.

In [ ]:
autos = autos[autos['price'].between(1, 350000)]
autos['price'].describe()
Out[ ]:
count    360635.000000
mean       5898.671956
std        8866.359669
min           1.000000
25%        1250.000000
50%        3000.000000
75%        7490.000000
max      350000.000000
Name: price, dtype: float64

There are a number of columns that represent date values:

  • date_crawled
  • last_seen
  • ad_created
  • registration_month
  • registration_year

In the DataFrame, date_crawled, last_seen and ad_created are all stored as strings. These will need to be converted into a date or numeric representation before we can analyse them quantitatively. The registration_month and registration_year columns are already represented as numeric values.
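
One way to get proper datetime dtypes would be pandas' to_datetime (a sketch that this notebook doesn't actually run, since the string form is enough for the analysis below):

# for col in ['date_crawled', 'ad_created', 'last_seen']:
#     autos[col] = pd.to_datetime(autos[col])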

First let's see how the values in these three string columns are formatted.

In [ ]:
autos[['date_crawled','ad_created','last_seen']][0:5]
Out[ ]:
date_crawled ad_created last_seen
0 2016-03-24 11:52:17 2016-03-24 00:00:00 2016-04-07 03:16:57
1 2016-03-24 10:58:45 2016-03-24 00:00:00 2016-04-07 01:46:50
2 2016-03-14 12:52:21 2016-03-14 00:00:00 2016-04-05 12:47:46
3 2016-03-17 16:54:04 2016-03-17 00:00:00 2016-03-17 17:40:17
4 2016-03-31 17:25:20 2016-03-31 00:00:00 2016-04-06 10:17:21
In [ ]:
# We are just looking for the dates, so we'll take the first 10 characters of each value
(autos['date_crawled']
        .str[:10]
        .value_counts(normalize = True, dropna = False)
        .sort_index()
        )
Out[ ]:
2016-03-05    0.025547
2016-03-06    0.014483
2016-03-07    0.035657
2016-03-08    0.033469
2016-03-09    0.034115
2016-03-10    0.032645
2016-03-11    0.032773
2016-03-12    0.036242
2016-03-13    0.015783
2016-03-14    0.036330
2016-03-15    0.033424
2016-03-16    0.030205
2016-03-17    0.031647
2016-03-18    0.013119
2016-03-19    0.035271
2016-03-20    0.036400
2016-03-21    0.035682
2016-03-22    0.032493
2016-03-23    0.032002
2016-03-24    0.029914
2016-03-25    0.032800
2016-03-26    0.031974
2016-03-27    0.030227
2016-03-28    0.035063
2016-03-29    0.034126
2016-03-30    0.033535
2016-03-31    0.031872
2016-04-01    0.034145
2016-04-02    0.035094
2016-04-03    0.038812
2016-04-04    0.037628
2016-04-05    0.012780
2016-04-06    0.003128
2016-04-07    0.001617
Name: date_crawled, dtype: float64

We can see that the distribution of listings crawled is roughly uniform from day to day.

In [ ]:
(autos['ad_created']
        .str[:10]
        .value_counts(normalize = True, dropna = False)
        .sort_index()
        )
Out[ ]:
2014-03-10    0.000003
2015-03-20    0.000003
2015-06-11    0.000003
2015-06-18    0.000003
2015-08-07    0.000003
                ...   
2016-04-03    0.039001
2016-04-04    0.037736
2016-04-05    0.011613
2016-04-06    0.003119
2016-04-07    0.001553
Name: ad_created, Length: 114, dtype: float64

We can see that the majority of the ads were created within a similar time frame to the crawling dates. The small share of ads created before this window is understandable: once a sale is completed an ad can be delisted, so ads created long before the crawl rarely remain on the site to be recorded.

In [12]:
(autos['last_seen']
        .str[:10]
        .value_counts(normalize=True, dropna=False)
        .sort_index()
        )
Out[12]:
2016-03-05    0.001292
2016-03-06    0.004134
2016-03-07    0.005262
2016-03-08    0.008056
2016-03-09    0.009994
2016-03-10    0.011563
2016-03-11    0.013046
2016-03-12    0.023401
2016-03-13    0.008489
2016-03-14    0.012301
2016-03-15    0.016411
2016-03-16    0.016419
2016-03-17    0.028760
2016-03-18    0.006931
2016-03-19    0.016314
2016-03-20    0.019915
2016-03-21    0.020136
2016-03-22    0.020607
2016-03-23    0.018149
2016-03-24    0.019237
2016-03-25    0.019097
2016-03-26    0.016160
2016-03-27    0.016909
2016-03-28    0.022273
2016-03-29    0.023312
2016-03-30    0.023856
2016-03-31    0.024238
2016-04-01    0.024022
2016-04-02    0.025016
2016-04-03    0.025366
2016-04-04    0.025654
2016-04-05    0.126206
2016-04-06    0.217830
2016-04-07    0.129648
Name: last_seen, dtype: float64

There is a clear spike in the distribution over the last three days of the 'last_seen' column. This column records when an ad was last observed on the site; an ad disappearing usually means it was delisted, most likely because of a sale. The spike on the final three days, however, is almost certainly an artifact of the crawl ending rather than a sudden surge in sales: every listing still active when the crawler stopped was "last seen" on one of those days.
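
If we wanted to analyse only genuine delistings, one option (a sketch, not applied in this notebook) would be to drop the final crawl days:

# ISO-formatted date strings compare correctly as plain strings
# genuine_delistings = autos[autos['last_seen'].str[:10] < '2016-04-05']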

As we previously noted, there are rows in the registration_year column that are incorrect, such as the minimum value of 1000, long before cars were invented, and the maximum value of 9999, many years in the future.

We can safely remove any rows with registration years after 2016, as a car cannot have been registered after its listing was crawled. Choosing the earliest acceptable registration year is harder, since realistic values could reach back to the early 20th century. To be on the safe side, we can check what percentage of vehicles have registration years outside 1900 to 2016 and decide whether that is a significant share of the data.

In [ ]:
(autos['registration_year'].between(1900,2016)).sum() / autos.shape[0]
Out[ ]:
0.9612489081758565

We will keep these two limits and remove the rows outside them, since those rows account for less than 4% of the data.

In [ ]:
autos = autos[autos['registration_year'].between(1900, 2016)]
autos['registration_year'].value_counts(normalize = True).head(10)
Out[ ]:
2000    0.066699
1999    0.063552
2005    0.062669
2006    0.057708
2001    0.056955
2003    0.056557
2004    0.056173
2002    0.054290
2007    0.050499
1998    0.049691
Name: registration_year, dtype: float64

We can see that the majority of the cars listed were registered within the 20 years before the data was crawled.
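
A quick check of that claim (a sketch, not run in the original notebook):

# share of remaining listings registered from 1997 onwards
# print(f"{autos['registration_year'].between(1997, 2016).mean():.1%}")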

Now we'll explore the price of the cars on offer by their brands.

In [ ]:
autos['brand'].value_counts(normalize = True)
Out[ ]:
volkswagen        0.211700
bmw               0.109871
opel              0.106410
mercedes_benz     0.096841
audi              0.089543
ford              0.068918
renault           0.047516
peugeot           0.030153
fiat              0.025691
seat              0.018661
skoda             0.015687
mazda             0.015384
smart             0.014331
citroen           0.013950
nissan            0.013598
toyota            0.012932
hyundai           0.009972
sonstige_autos    0.009493
mini              0.009384
volvo             0.009147
mitsubishi        0.008236
honda             0.007532
kia               0.006915
suzuki            0.006364
alfa_romeo        0.006309
porsche           0.006211
chevrolet         0.005022
chrysler          0.003863
dacia             0.002495
jeep              0.002192
land_rover        0.002166
daihatsu          0.002161
subaru            0.002117
jaguar            0.001734
saab              0.001465
daewoo            0.001457
trabant           0.001408
lancia            0.001301
rover             0.001272
lada              0.000597
Name: brand, dtype: float64

German manufacturers make up the top five brands in the sales data, with Volkswagen the most popular: it accounts for roughly as many listings as the next two brands, BMW and Opel, combined.

We'll limit our analysis to brands that make up more than 5% of the total sales data.

In [14]:
brands = autos['brand'].value_counts(normalize = True)
most_common_brands = brands[brands > 0.05].index
most_common_brands
Out[14]:
Index(['volkswagen', 'bmw', 'opel', 'mercedes_benz', 'audi', 'ford'], dtype='object')
In [19]:
brands_mean_price = {}

# use a fresh loop variable to avoid reusing the name `brands` from the previous cell
for brand in most_common_brands:
    brand_only = autos[autos['brand'] == brand]
    brands_mean_price[brand] = int(brand_only['price'].mean())

brands_mean_price
Out[19]:
{'volkswagen': 14533,
 'bmw': 14844,
 'opel': 3223,
 'mercedes_benz': 17244,
 'audi': 15868,
 'ford': 8462}

We can see that there is a price gap between the top brands in the sales data. Cars made by Audi, BMW and Mercedes-Benz tend to be priced higher than those of their competitors; Opel is the least expensive of the top brands, while Volkswagen sits in between. This middle-ground pricing could be one reason for the popularity of Volkswagen cars.

We can use a similar method to obtain the average mileage for those cars and then see whether there's any link with the mean price. We'll combine both of these Series objects into a single DataFrame, with a shared index, so we can compare the two easily.
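
For reference, a more concise route to the same table would be a single groupby (a sketch, not what the cells below actually do):

# top_brand_info = (autos[autos['brand'].isin(most_common_brands)]
#                   .groupby('brand')[['kilometer', 'price']]
#                   .mean()
#                   .sort_values('kilometer', ascending=False))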

In [17]:
brand_mean_mileage = {}

for brand in most_common_brands:
    brand_only = autos[autos['brand'] == brand]
    brand_mean_mileage[brand] = int(brand_only['kilometer'].mean())

brand_mean_mileage
Out[17]:
{'volkswagen': 128575,
 'bmw': 132763,
 'opel': 128906,
 'mercedes_benz': 130664,
 'audi': 129717,
 'ford': 123839}
In [20]:
mean_mileage = pd.Series(brand_mean_mileage).sort_values(ascending = False)
mean_price = pd.Series(brands_mean_price).sort_values(ascending = False)
In [21]:
top_brand_info = pd.DataFrame(mean_mileage, columns = ['mean_mileage'])
top_brand_info
Out[21]:
mean_mileage
bmw 132763
mercedes_benz 130664
audi 129717
opel 128906
volkswagen 128575
ford 123839
In [ ]:
top_brand_info['mean_price'] = mean_price
top_brand_info
Out[ ]:
mean_mileage mean_price
bmw 132800 8449
mercedes_benz 130572 8551
audi 129443 9086
opel 128722 2971
volkswagen 128386 5400
ford 123662 3696

We can see that the mileage of these cars varies far less than their prices. The more expensive brands generally have slightly higher mileage, with Opel the only outlier: it has relatively high mileage but by far the lowest mean price.

Let's continue cleaning the data. To do this we'll identify the categorical columns that use German words and translate and map the values to their English counterparts.

In [ ]:
autos['gearbox'].value_counts()
Out[ ]:
manuell      257012
automatik     73725
Name: gearbox, dtype: int64
In [ ]:
autos['unrepaired_damage'].value_counts()
Out[ ]:
nein    252767
ja       33188
Name: unrepaired_damage, dtype: int64
In [ ]:
autos['fuel_type'].value_counts()
Out[ ]:
benzin     212127
diesel     103815
lpg          5090
cng           539
hybrid        265
andere        167
elektro       101
Name: fuel_type, dtype: int64
In [ ]:
autos['vehicle_type'].value_counts()
Out[ ]:
limousine     93826
kleinwagen    78200
kombi         66085
bus           29764
cabrio        22550
coupe         18447
suv           14499
andere         3150
Name: vehicle_type, dtype: int64
In [ ]:
gearbox = {
    'manuell': 'manual',
    'automatik': 'automatic'
}

unrepaired_damage = {
    'nein': 'no',
    'ja': 'yes'
}

autos['gearbox'] = autos['gearbox'].map(gearbox)
autos['unrepaired_damage'] = autos['unrepaired_damage'].map(unrepaired_damage)
autos['fuel_type'] = autos['fuel_type'].replace(['benzin', 'elektro', 'andere'], ['gasoline', 'electric', 'other'])
autos['vehicle_type'] = autos['vehicle_type'].replace(['limousine', 'kleinwagen', 'kombi', 'cabrio', 'andere'], ['sedan', 'small car', 'station wagon', 'convertible', 'other'])
autos['model'] = autos['model'].replace('andere', 'other')

autos.head()
Out[ ]:
date_crawled name price abtest vehicle_type registration_year gearbox power_p_s model kilometer registration_month fuel_type brand unrepaired_damage ad_created postal_code last_seen
0 2016-03-24 11:52:17 Golf_3_1.6 480 test NaN 1993 manual 0 golf 150000 0 gasoline volkswagen NaN 2016-03-24 00:00:00 70435 2016-04-07 03:16:57
1 2016-03-24 10:58:45 A5_Sportback_2.7_Tdi 18300 test coupe 2011 manual 190 NaN 125000 5 diesel audi yes 2016-03-24 00:00:00 66954 2016-04-07 01:46:50
2 2016-03-14 12:52:21 Jeep_Grand_Cherokee_"Overland" 9800 test suv 2004 automatic 163 grand 125000 8 diesel jeep NaN 2016-03-14 00:00:00 90480 2016-04-05 12:47:46
3 2016-03-17 16:54:04 GOLF_4_1_4__3TÜRER 1500 test small car 2001 manual 75 golf 150000 6 gasoline volkswagen no 2016-03-17 00:00:00 91074 2016-03-17 17:40:17
4 2016-03-31 17:25:20 Skoda_Fabia_1.4_TDI_PD_Classic 3600 test small car 2008 manual 69 fabia 90000 7 diesel skoda no 2016-03-31 00:00:00 60437 2016-04-06 10:17:21

Now let's investigate whether there is a price gap between cars that have unrepaired damage and their undamaged counterparts.

In [ ]:
no_yes = ['no', 'yes']

damaged_prices = {}

for i in no_yes:
    damage = autos[autos['unrepaired_damage'] == i]
    mean_price = damage['price'].mean()
    damaged_prices[i] = int(mean_price)
    
damaged_prices
Out[ ]:
{'no': 7145, 'yes': 2236}

We can see that cars with unrepaired damage are considerably cheaper than their undamaged counterparts, selling on average for roughly a third of the price (2,236 vs 7,145).

Closing words, and a note to my future self¶

This project walked through the basics of pandas. The logic behind the process is quite different from the way I would work in R; try redoing the analysis in R and think about the differences.