Name: ml-featurizer
Owner: Adobe Systems Incorporated
Description: null
Created: 2018-05-02 21:16:10.0
Updated: 2018-05-19 10:03:04.0
Pushed: 2018-05-05 06:27:57.0
Homepage: null
Size: 21
Language: Scala
GitHub Committers
User | Most Recent Commit | # Commits |
---|---|---|
Other Committers
User | Most Recent Commit | # Commits |
---|---|---|
Feature engineering is a difficult and time-consuming process. ML Featurizer is a library that lets users create additional features from raw data with ease. It extends and enriches Spark's existing feature engineering functionality.
/**
 * Example: apply a single `DayOfWeekFeaturizer` to a small in-memory data set.
 *
 * Builds a two-column DataFrame (`id`, `date`), runs the featurizer over the
 * `date` column and prints the resulting DataFrame, which carries the extra
 * `dayOfWeek` output column.
 */
object DayOfWeekFeaturizerExample {

  /**
   * Entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("DayOfWeekFeaturizer")
      .master("local")
      .getOrCreate()

    // (id, date) sample rows; each row gets its own id, and the date strings
    // follow the "yyyy-MM-dd" pattern passed to setFormat below.
    val data = Array(
      (0, "2018-01-02"),
      (1, "2018-02-02"),
      (2, "2018-03-02"),
      (3, "2018-04-05"),
      (4, "2018-05-05"))

    val dataFrame = spark.createDataFrame(data).toDF("id", "date")

    val featurizer = new DayOfWeekFeaturizer()
      .setInputCol("date")
      .setOutputCol("dayOfWeek")
      .setFormat("yyyy-MM-dd")

    val featurizedDataFrame = featurizer.transform(dataFrame)
    featurizedDataFrame.show()
  }
}
/**
 * Example: combine several featurizers with Spark's `StringIndexer` and
 * `OneHotEncoder` in a single `Pipeline`.
 *
 * Builds a five-column DataFrame (`id`, `date`, `price1`, `price2`, `brand`),
 * fits the pipeline on it and prints the transformed DataFrame.
 */
object FeaturePipeline {

  /**
   * Entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("FeaturePipeline")
      .master("local")
      .getOrCreate()

    // (id, date, price1, price2, brand) sample rows, one per id.
    val data = Array(
      (0, "2018-01-02", 1.0, 2.0, "mercedes"),
      (1, "2018-02-02", 2.5, 3.5, "lexus"),
      (2, "2018-03-02", 5.0, 1.0, "toyota"),
      (3, "2018-04-05", 8.0, 9.0, "tesla"),
      (4, "2018-05-05", 1.0, 5.0, "bmw"))

    val dataFrame = spark.createDataFrame(data)
      .toDF("id", "date", "price1", "price2", "brand")

    // Three date-based stages, all reading the same "date" column with the
    // same pattern and each writing its own output column.
    val dayOfWeekFeaturizer = new DayOfWeekFeaturizer()
      .setInputCol("date")
      .setOutputCol("dayOfWeek")
      .setFormat("yyyy-MM-dd")

    val monthOfYearFeaturizer = new MonthOfYearFeaturizer()
      .setInputCol("date")
      .setOutputCol("monthOfYear")
      .setFormat("yyyy-MM-dd")

    val weekendFeaturizer = new WeekendFeaturizer()
      .setInputCol("date")
      .setOutputCol("isWeekend")
      .setFormat("yyyy-MM-dd")

    // Numeric stage over the two price columns.
    val additionFeaturizer = new AdditionFeaturizer()
      .setInputCols("price1", "price2")
      .setOutputCol("price1_add_price2")

    // Categorical stages: the encoder consumes the indexer's output column,
    // so the indexer must come first in the pipeline.
    val indexer = new StringIndexer()
      .setInputCol("brand")
      .setOutputCol("brandIndex")

    val encoder = new OneHotEncoder()
      .setInputCol("brandIndex")
      .setOutputCol("brandVector")

    val pipeline = new Pipeline()
      .setStages(Array(
        dayOfWeekFeaturizer,
        monthOfYearFeaturizer,
        weekendFeaturizer,
        additionFeaturizer,
        indexer,
        encoder))

    val model = pipeline.fit(dataFrame)
    model.transform(dataFrame).show()
  }
}