# taxi-TF-cvs-classification.py
# https://keras.io/examples/structured_data/structured_data_classification_with_feature_space/
# https://github.com/keras-team/keras-io/blob/master/examples/structured_data/structured_data_classification_with_feature_space.py
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import tensorflow as tf
import pandas as pd
import keras
from keras.utils import FeatureSpace
  9. """
  10. ## Preparing the data
  11. Let's download the data and load it into a Pandas dataframe:
  12. """
  13. file_url = "taxis.csv" # "http://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
  14. dataframe = pd.read_csv(file_url)
  15. dataframe = dataframe.drop(['pickup', 'dropoff'], axis=1)
  16. dataframe = dataframe.dropna()
  17. print(dataframe.head())
# Replace the string payment value with an integer label for binary classification
# (credit card -> 1, cash -> 0).
# dataframe['target'] = dataframe.loc[dataframe['payment'] == 'credit card', 'target'] = 1
# dataframe['target'] = dataframe.loc[dataframe['payment'] == 'cash', 'target'] = 0
pay = {'credit card': 1, 'cash': 0}
dataframe.payment = [pay[item] for item in dataframe.payment]
  24. """
  25. The dataset includes 303 samples with 14 columns per sample
  26. (13 features, plus the target label):
  27. """
  28. print(f"dataframe.shape = {dataframe.shape}")
  29. """
  30. Here's a preview of a few samples:
  31. """
  32. print(dataframe.head(20))
  33. print(dataframe.info())
# Recorded output: head() before the payment mapping, then the shape, then head(20) after the mapping.
# passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
# 0 1 1.60 7.0 2.15 0.0 12.95 yellow credit card Lenox Hill West UN/Turtle Bay South Manhattan Manhattan
# 1 1 0.79 5.0 0.00 0.0 9.30 yellow cash Upper West Side South Upper West Side South Manhattan Manhattan
# 2 1 1.37 7.5 2.36 0.0 14.16 yellow credit card Alphabet City West Village Manhattan Manhattan
# 3 1 7.70 27.0 6.15 0.0 36.95 yellow credit card Hudson Sq Yorkville West Manhattan Manhattan
# 4 3 2.16 9.0 1.10 0.0 13.40 yellow credit card Midtown East Yorkville West Manhattan Manhattan
# dataframe.shape = (6341, 12)
# passengers distance fare tip tolls total color payment pickup_zone dropoff_zone pickup_borough dropoff_borough
# 0 1 1.60 7.0 2.15 0.0 12.95 yellow 1 Lenox Hill West UN/Turtle Bay South Manhattan Manhattan
# 1 1 0.79 5.0 0.00 0.0 9.30 yellow 0 Upper West Side South Upper West Side South Manhattan Manhattan
# 2 1 1.37 7.5 2.36 0.0 14.16 yellow 1 Alphabet City West Village Manhattan Manhattan
# 3 1 7.70 27.0 6.15 0.0 36.95 yellow 1 Hudson Sq Yorkville West Manhattan Manhattan
# 4 3 2.16 9.0 1.10 0.0 13.40 yellow 1 Midtown East Yorkville West Manhattan Manhattan
# 5 1 0.49 7.5 2.16 0.0 12.96 yellow 1 Times Sq/Theatre District Midtown East Manhattan Manhattan
# 6 1 3.65 13.0 2.00 0.0 18.80 yellow 1 Battery Park City Two Bridges/Seward Park Manhattan Manhattan
# 8 1 3.63 15.0 1.00 0.0 19.30 yellow 1 East Harlem South Midtown Center Manhattan Manhattan
# 9 1 1.52 8.0 1.00 0.0 13.30 yellow 1 Lincoln Square East Central Park Manhattan Manhattan
# 10 1 3.90 17.0 0.00 0.0 17.80 yellow 0 LaGuardia Airport Astoria Queens Queens
# 11 1 1.53 6.5 2.16 0.0 12.96 yellow 1 Upper West Side South Manhattan Valley Manhattan Manhattan
# 12 1 1.05 6.5 1.00 0.0 11.30 yellow 1 Murray Hill Midtown Center Manhattan Manhattan
# 13 1 1.75 10.5 0.00 0.0 13.80 yellow 0 Lincoln Square West Times Sq/Theatre District Manhattan Manhattan
# 14 0 2.90 11.5 0.00 0.0 14.80 yellow 0 Financial District North Two Bridges/Seward Park Manhattan Manhattan
# 15 3 2.09 13.5 0.00 0.0 16.80 yellow 0 Upper West Side North Clinton East Manhattan Manhattan
# 16 1 2.12 13.0 0.00 0.0 16.30 yellow 0 East Chelsea Meatpacking/West Village West Manhattan Manhattan
# 17 1 2.60 10.5 2.00 0.0 16.30 yellow 1 Midtown Center East Harlem South Manhattan Manhattan
# 18 1 2.18 9.5 1.92 0.0 14.72 yellow 1 Gramercy Midtown Center Manhattan Manhattan
# 19 6 1.08 6.5 1.08 0.0 11.38 yellow 1 East Chelsea East Chelsea Manhattan Manhattan
# 20 1 1.07 6.5 1.54 0.0 11.84 yellow 1 Penn Station/Madison Sq West Kips Bay Manhattan Manhattan
  62. """
  63. The last column, "target", indicates whether the patient
  64. has a heart disease (1) or not (0).
  65. Let's split the data into a training and validation set:
  66. """
val_dataframe = dataframe.sample(frac=0.2, random_state=1337)
train_dataframe = dataframe.drop(val_dataframe.index)
print(
    "Using %d samples for training and %d for validation"
    % (len(train_dataframe), len(val_dataframe))
)
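"""
The split above is purely random. Since this is a binary classification task, it can be
worth checking that both classes are similarly represented in the two splits. This is
just a quick sketch, not part of the original recipe:
"""
print("Train class balance:\n", train_dataframe["payment"].value_counts(normalize=True))
print("Validation class balance:\n", val_dataframe["payment"].value_counts(normalize=True))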
  73. """
  74. Let's generate `tf.data.Dataset` objects for each dataframe:
  75. """
  76. def dataframe_to_dataset(dataframe):
  77. dataframe = dataframe.copy()
  78. labels = dataframe.pop("payment")
  79. ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  80. ds = ds.shuffle(buffer_size=len(dataframe))
  81. return ds
  82. train_ds = dataframe_to_dataset(train_dataframe)
  83. val_ds = dataframe_to_dataset(val_dataframe)
  84. for x, y in train_ds.take(1):
  85. print("Input:", x)
  86. print("Target:", y)
  87. """
  88. Let's batch the datasets:
  89. """
  90. train_ds = train_ds.batch(32)
  91. val_ds = val_ds.batch(32)
  92. """
  93. ## Configuring a `FeatureSpace`
  94. To configure how each feature should be preprocessed,
  95. we instantiate a `keras.utils.FeatureSpace`, and we
  96. pass to it a dictionary that maps the name of our features
  97. to a string that describes the feature type.
  98. We have a few "integer categorical" features such as `"FBS"`,
  99. one "string categorical" feature (`"thal"`),
  100. and a few numerical features, which we'd like to normalize
  101. -- except `"age"`, which we'd like to discretize into
  102. a number of bins.
  103. We also use the `crosses` argument
  104. to capture *feature interactions* for some categorical
  105. features, that is to say, create additional features
  106. that represent value co-occurrences for these categorical features.
  107. You can compute feature crosses like this for arbitrary sets of
  108. categorical features -- not just tuples of two features.
  109. Because the resulting co-occurences are hashed
  110. into a fixed-sized vector, you don't need to worry about whether
  111. the co-occurence space is too large.
  112. """
feature_space = FeatureSpace(
    features={
        # Categorical feature encoded as an integer
        "passengers": "integer_categorical",
        # Categorical features encoded as strings
        "color": "string_categorical",
        "pickup_zone": "string_categorical",
        "dropoff_zone": "string_categorical",
        "pickup_borough": "string_categorical",
        "dropoff_borough": "string_categorical",
        # Numerical features to discretize
        "distance": "float_discretized",
        "fare": "float_discretized",
        "tip": "float_discretized",
        "tolls": "float_discretized",
        "total": "float_discretized",
    },
    # We create additional features by hashing
    # value co-occurrences for the
    # following groups of categorical features.
    crosses=[("pickup_borough", "dropoff_borough"), ("pickup_zone", "tip")],
    # The hashing space for these co-occurrences
    # will be 32-dimensional.
    crossing_dim=32,
    # Our utility will one-hot encode all categorical
    # features and concat all features into a single
    # vector (one vector per sample).
    output_mode="concat",
)
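"""
The same configuration can also be written with explicit per-feature specs, which let you
tune things such as the number of discretization bins, out-of-vocabulary handling, and a
per-cross hashing dimension. The sketch below is only illustrative (the bin counts and
dimensions are assumptions, and `explicit_feature_space` is never adapted or used by the
rest of the script):
"""
explicit_feature_space = FeatureSpace(
    features={
        "passengers": FeatureSpace.integer_categorical(num_oov_indices=1),
        "color": FeatureSpace.string_categorical(num_oov_indices=1),
        "pickup_zone": FeatureSpace.string_categorical(num_oov_indices=1),
        "dropoff_zone": FeatureSpace.string_categorical(num_oov_indices=1),
        "pickup_borough": FeatureSpace.string_categorical(num_oov_indices=1),
        "dropoff_borough": FeatureSpace.string_categorical(num_oov_indices=1),
        "distance": FeatureSpace.float_discretized(num_bins=32),
        "fare": FeatureSpace.float_discretized(num_bins=32),
        "tip": FeatureSpace.float_discretized(num_bins=32),
        "tolls": FeatureSpace.float_discretized(num_bins=32),
        "total": FeatureSpace.float_discretized(num_bins=32),
    },
    crosses=[
        FeatureSpace.cross(feature_names=("pickup_borough", "dropoff_borough"), crossing_dim=32),
        FeatureSpace.cross(feature_names=("pickup_zone", "tip"), crossing_dim=32),
    ],
    output_mode="concat",
)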
train_ds_with_no_labels = train_ds.map(lambda x, _: x)
feature_space.adapt(train_ds_with_no_labels)
for x, _ in train_ds.take(1):
    preprocessed_x = feature_space(x)
    print("preprocessed_x.shape:", preprocessed_x.shape)
    print("preprocessed_x.dtype:", preprocessed_x.dtype)
  148. """
  149. Let's create a training and validation dataset of preprocessed batches:
  150. """
  151. preprocessed_train_ds = train_ds.map(
  152. lambda x, y: (feature_space(x), y), num_parallel_calls=tf.data.AUTOTUNE
  153. )
  154. preprocessed_train_ds = preprocessed_train_ds.prefetch(tf.data.AUTOTUNE)
  155. preprocessed_val_ds = val_ds.map(
  156. lambda x, y: (feature_space(x), y), num_parallel_calls=tf.data.AUTOTUNE
  157. )
  158. preprocessed_val_ds = preprocessed_val_ds.prefetch(tf.data.AUTOTUNE)
  159. """
  160. ## Build a model
  161. Time to build a model -- or rather two models:
  162. - A training model that expects preprocessed features (one sample = one vector)
  163. - An inference model that expects raw features (one sample = dict of raw feature values)
  164. """
dict_inputs = feature_space.get_inputs()
encoded_features = feature_space.get_encoded_features()
x = keras.layers.Dense(32, activation="relu")(encoded_features)
x = keras.layers.Dropout(0.5)(x)
predictions = keras.layers.Dense(1, activation="sigmoid")(x)
training_model = keras.Model(inputs=encoded_features, outputs=predictions)
training_model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)
inference_model = keras.Model(inputs=dict_inputs, outputs=predictions)
  175. """
  176. ## Train the model
  177. Let's train our model for 50 epochs. Note that feature preprocessing is happening
  178. as part of the tf.data pipeline, not as part of the model.
  179. """
  180. training_model.fit(
  181. preprocessed_train_ds,
  182. epochs=20,
  183. validation_data=preprocessed_val_ds,
  184. verbose=2,
  185. )
  186. """
  187. We quickly get to 80% validation accuracy.
  188. """
# Save the end-to-end inference model (it includes the FeatureSpace preprocessing)
# inference_model.save('csvModel.h5')
inference_model.save('TaxiCsvModel.keras')
print('Model Saved!')
# Load the model back and check its structure
from keras.models import load_model
savedModel = load_model('TaxiCsvModel.keras')
savedModel.summary()
  197. """
  198. ## Inference on new data with the end-to-end model
  199. Now, we can use our inference model (which includes the `FeatureSpace`)
  200. to make predictions based on dicts of raw features values, as follows:
  201. """
  202. sample = {
  203. "passengers": 1,
  204. "color": "yellow",
  205. "pickup_zone": "Lenox Hill West",
  206. "dropoff_zone": "UN/Turtle Bay South",
  207. "pickup_borough": "Manhattan",
  208. "dropoff_borough": "Manhattan",
  209. "distance": 1.60,
  210. "fare": 7.0,
  211. "tip": 2.15,
  212. "tolls": 0.0,
  213. "total": 12.95,
  214. }
input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = inference_model.predict(input_dict)
print(
    f"This passenger had a {100 * predictions[0][0]:.2f}% probability "
    "of paying with a credit card."
)
# Run the same prediction with the reloaded model
predictions = savedModel.predict(input_dict)
print(
    f"This passenger had a {100 * predictions[0][0]:.2f}% probability "
    "of paying with a credit card."
)
# Recorded output:
# This passenger had a 100.00% probability of paying with a credit card.
# This passenger had a 97.98% probability of paying with a credit card.
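"""
The sigmoid output is a probability in [0, 1]. To turn it into a hard class label we can
apply a decision threshold (a sketch; 0.5 is just the conventional default):
"""
predicted_label = int(predictions[0][0] >= 0.5)
print("Predicted payment class:", "credit card" if predicted_label else "cash")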