diff --git a/mleap-core/src/main/scala/ml/combust/mleap/core/classification/LightGBMClassifierModel.scala b/mleap-core/src/main/scala/ml/combust/mleap/core/classification/LightGBMClassifierModel.scala new file mode 100644 index 000000000..4ec043c35 --- /dev/null +++ b/mleap-core/src/main/scala/ml/combust/mleap/core/classification/LightGBMClassifierModel.scala @@ -0,0 +1,59 @@ +package ml.combust.mleap.core.classification + +import com.microsoft.ml.spark.lightgbm.LightGBMBooster +import org.apache.spark.ml.linalg.{Vector, Vectors} + +object LightGBMClassifierModel{ + def apply(model: String, + labelColName: String, + featuresColName: String, + predictionColName: String, + probColName: String, + rawPredictionColName: String, + actualNumClasses: Int): LightGBMClassifierModel = LightGBMClassifierModel( + model, labelColName, featuresColName, predictionColName, probColName, rawPredictionColName, + actualNumClasses = actualNumClasses) +} + +case class LightGBMClassifierModel( + override val booster: LightGBMBooster, + override val labelColName: String, + override val featuresColName: String, + override val predictionColName: String, + override val probColName: String, + override val rawPredictionColName: String, + override val thresholdValues: Option[Seq[Double]], + override val actualNumClasses: Int) + extends ProbabilisticClassificationModel with LightGBMClassifierModelBase with Serializable { + override val numClasses: Int = actualNumClasses + + override def rawToProbabilityInPlace(raw: Vector): Vector = { + throw new NotImplementedError("Unexpected error in LightGBMClassificationModel:" + + " raw2probabilityInPlace should not be called!") + } + + override def predictRaw(features: Vector): Vector = { + Vectors.dense(booster.score(features, true, true)) + } + + override def predictProbabilities(features: Vector): Vector = { + Vectors.dense(booster.score(features, false, true)) + } + + override def predict(features: Vector): Double = { + rawToPrediction(predictRaw(features)) + } + + override val numFeatures: Int = 0 +} + +trait LightGBMClassifierModelBase { + def booster: LightGBMBooster + def labelColName: String + def featuresColName: String + def predictionColName: String + def probColName: String + def rawPredictionColName: String + def thresholdValues: Option[Seq[Double]] + def actualNumClasses: Int +} diff --git a/mleap-core/src/main/scala/ml/combust/mleap/core/regression/LightGBMRegressionModel.scala b/mleap-core/src/main/scala/ml/combust/mleap/core/regression/LightGBMRegressionModel.scala new file mode 100644 index 000000000..5aa13d12e --- /dev/null +++ b/mleap-core/src/main/scala/ml/combust/mleap/core/regression/LightGBMRegressionModel.scala @@ -0,0 +1,34 @@ +package ml.combust.mleap.core.regression + +import com.microsoft.ml.spark.lightgbm.LightGBMBooster +import ml.combust.mleap.core.Model +import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} +import org.apache.spark.ml.linalg.{Vector, Vectors} + +object LightGBMRegressionModel{ + def apply(model: String, + featuresColName: String, + predictionColName: String): LightGBMRegressionModel = LightGBMRegressionModel( + model, featuresColName, predictionColName) +} + +case class LightGBMRegressionModel( + override val booster: LightGBMBooster, + override val featuresColName: String, + override val predictionColName: String) + extends LightGBMRegressionModelBase with Model { + + override def inputSchema: StructType = StructType("features" -> TensorType.Double()).get + + override def outputSchema: StructType = StructType("prediction" -> ScalarType.Double.nonNullable).get + + def predict(features: Vector): Double = { + booster.score(features, false, false)(0) + } +} + +trait LightGBMRegressionModelBase { + def booster: LightGBMBooster + def featuresColName: String + def predictionColName: String +} diff --git a/mleap-runtime/src/main/scala/ml/combust/mleap/bundle/ops/classification/LightGBMClassificationModelOp.scala b/mleap-runtime/src/main/scala/ml/combust/mleap/bundle/ops/classification/LightGBMClassificationModelOp.scala new file mode 100644 index 000000000..5f66bbfc6 --- /dev/null +++ b/mleap-runtime/src/main/scala/ml/combust/mleap/bundle/ops/classification/LightGBMClassificationModelOp.scala @@ -0,0 +1,55 @@ +package ml.combust.mleap.bundle.ops.classification + +import com.microsoft.ml.spark.lightgbm.LightGBMBooster +import ml.combust.bundle.BundleContext +import ml.combust.bundle.dsl.{Model, Value} +import ml.combust.bundle.op.OpModel +import ml.combust.mleap.bundle.ops.MleapOp +import ml.combust.mleap.core.classification.LightGBMClassifierModel +import ml.combust.mleap.runtime.MleapContext +import ml.combust.mleap.runtime.transformer.classification.LightGBMClassifier + +class LightGBMClassificationModelOp extends MleapOp[LightGBMClassifier, LightGBMClassifierModel] { + override val Model: OpModel[MleapContext, LightGBMClassifierModel] = + new OpModel[MleapContext, LightGBMClassifierModel] { + override val klazz: Class[LightGBMClassifierModel] = classOf[LightGBMClassifierModel] + + override def opName: String = "lightgbm_classifier" + + override def store(model: Model, obj: LightGBMClassifierModel)( + implicit context: BundleContext[MleapContext]): Model = { + model + .withValue("booster", Value.string(obj.booster.model)) + .withValue("labelColName", Value.string(obj.labelColName)) + .withValue("featuresColName", Value.string(obj.featuresColName)) + .withValue("predictionColName", Value.string(obj.predictionColName)) + .withValue("probColName", Value.string(obj.probColName)) + .withValue( + "rawPredictionColName", + Value.string(obj.rawPredictionColName) + ) + .withValue( + "thresholdValues", + obj.thresholds.map(_.toSeq).map(Value.doubleList) + ).withValue("actualNumClasses", Value.int(obj.numClasses)) + } + + override def load(model: Model)(implicit context: BundleContext[MleapContext]): LightGBMClassifierModel = + { + val booster = new LightGBMBooster(model.value("booster").getString) + new LightGBMClassifierModel( + booster, + model.value("labelColName").getString, + model.value("featuresColName").getString, + model.value("predictionColName").getString, + model.value("probColName").getString, + model.value("rawPredictionColName").getString, + model.getValue("thresholdValues").map(_.getDoubleList.toArray), + model.value("actualNumClasses").getInt + ) + + } + } + + override def model(node: LightGBMClassifier): LightGBMClassifierModel = node.model +} diff --git a/mleap-runtime/src/main/scala/ml/combust/mleap/bundle/ops/regression/LightGBMRegressionModelOp.scala b/mleap-runtime/src/main/scala/ml/combust/mleap/bundle/ops/regression/LightGBMRegressionModelOp.scala new file mode 100644 index 000000000..518689150 --- /dev/null +++ b/mleap-runtime/src/main/scala/ml/combust/mleap/bundle/ops/regression/LightGBMRegressionModelOp.scala @@ -0,0 +1,38 @@ +package ml.combust.mleap.bundle.ops.regression + +import com.microsoft.ml.spark.lightgbm.LightGBMBooster +import ml.combust.bundle.BundleContext +import ml.combust.bundle.dsl.{Model, Value} +import ml.combust.bundle.op.OpModel +import ml.combust.mleap.bundle.ops.MleapOp +import ml.combust.mleap.core.regression.LightGBMRegressionModel +import ml.combust.mleap.runtime.MleapContext +import ml.combust.mleap.runtime.transformer.regression.LightGBMRegression + +class LightGBMRegressionModelOp extends MleapOp[LightGBMRegression, LightGBMRegressionModel] { + override val Model: OpModel[MleapContext, LightGBMRegressionModel] = + new OpModel[MleapContext, LightGBMRegressionModel] { + override val klazz: Class[LightGBMRegressionModel] = classOf[LightGBMRegressionModel] + + override def opName: String = "lightgbm_regression" + + override def store(model: Model, obj: LightGBMRegressionModel)( + implicit context: BundleContext[MleapContext]): Model = { + model + .withValue("booster", Value.string(obj.booster.model)) + .withValue("featuresColName", Value.string(obj.featuresColName)) + .withValue("predictionColName", Value.string(obj.predictionColName)) + } + + override def load(model: Model)(implicit context: BundleContext[MleapContext]): LightGBMRegressionModel = + { + val booster = new LightGBMBooster(model.value("booster").getString) + new LightGBMRegressionModel( + booster, + model.value("featuresColName").getString, + model.value("predictionColName").getString) + } + } + + override def model(node: LightGBMRegression): LightGBMRegressionModel = node.model +} diff --git a/mleap-runtime/src/main/scala/ml/combust/mleap/runtime/transformer/classification/LightGBMClassifier.scala b/mleap-runtime/src/main/scala/ml/combust/mleap/runtime/transformer/classification/LightGBMClassifier.scala new file mode 100644 index 000000000..1b9ec34b3 --- /dev/null +++ b/mleap-runtime/src/main/scala/ml/combust/mleap/runtime/transformer/classification/LightGBMClassifier.scala @@ -0,0 +1,56 @@ +package ml.combust.mleap.runtime.transformer.classification + +import ml.combust.mleap.core.annotation.SparkCode +import ml.combust.mleap.core.classification.LightGBMClassifierModel +import ml.combust.mleap.core.types.NodeShape +import ml.combust.mleap.core.util.VectorConverters._ +import ml.combust.mleap.runtime.frame.{MultiTransformer, Row, Transformer} +import ml.combust.mleap.runtime.function.UserDefinedFunction +import ml.combust.mleap.tensor.Tensor +import org.apache.spark.ml.linalg.{Vector, Vectors} + +@SparkCode(uri = "https://github.com/Azure/mmlspark/blob/f07e5584459e909223a470e6d2e11135b292f3ea/" + + "src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala") +case class LightGBMClassifier(override val uid: String = Transformer.uniqueName("lightgbm_classifier"), + override val shape: NodeShape, + override val model: LightGBMClassifierModel) extends MultiTransformer { + override val exec: UserDefinedFunction = { + val f = (features: Tensor[Double]) => { + + if (model.thresholdValues.isDefined) { + require(model.thresholdValues.get.length == model.numClasses, this.getClass.getSimpleName + + ".transform() called with non-matching numClasses and thresholds.length." + + s" numClasses=$model.numClasses, but thresholds has length ${model.thresholdValues.get.length}") + } + + val rawPrediction: Vector = + if (shape.getOutput("raw_prediction").nonEmpty) + model.predictRaw(features) + else + Vectors.dense(Array.empty[Double]) + + val probability: Vector = + if (shape.getOutput("probability").nonEmpty) + model.predictProbabilities(features) + else + Vectors.dense(Array.empty[Double]) + + val prediction = + if (shape.getOutput("prediction").isDefined) { + if (shape.getOutput("raw_prediction").nonEmpty && model.thresholdValues.isEmpty) { + // Note: Only call raw2prediction if thresholds not defined + model.rawToPrediction(rawPrediction) + } else if (shape.getOutput("prediction").nonEmpty) { + model.probabilityToPrediction(probability) + } else { + model.predict(features) + } + } + else + Double.NaN + + Row(rawPrediction: Tensor[Double], probability: Tensor[Double], prediction) + } + UserDefinedFunction(f, outputSchema, inputSchema) + } +} diff --git a/mleap-runtime/src/main/scala/ml/combust/mleap/runtime/transformer/regression/LightGBMRegression.scala b/mleap-runtime/src/main/scala/ml/combust/mleap/runtime/transformer/regression/LightGBMRegression.scala new file mode 100644 index 000000000..7e2f6672a --- /dev/null +++ b/mleap-runtime/src/main/scala/ml/combust/mleap/runtime/transformer/regression/LightGBMRegression.scala @@ -0,0 +1,20 @@ +package ml.combust.mleap.runtime.transformer.regression + +import ml.combust.mleap.core.regression.LightGBMRegressionModel +import ml.combust.mleap.core.types._ +import ml.combust.mleap.core.util.VectorConverters._ +import ml.combust.mleap.runtime.frame.{MultiTransformer, Row, Transformer} +import ml.combust.mleap.runtime.function.UserDefinedFunction +import ml.combust.mleap.tensor.Tensor + +case class LightGBMRegression(override val uid: String = Transformer.uniqueName("lightgbm_regression"), + override val shape: NodeShape, + override val model: LightGBMRegressionModel) extends MultiTransformer { + override val exec: UserDefinedFunction = { + val f = (features: Tensor[Double]) => { + val prediction = model.predict(features) + Row(prediction: Double) + } + UserDefinedFunction(f, outputSchema, inputSchema) + } +} diff --git a/mleap-spark/src/main/resources/reference.conf b/mleap-spark/src/main/resources/reference.conf index f3e9b86e0..53397a7e6 100644 --- a/mleap-spark/src/main/resources/reference.conf +++ b/mleap-spark/src/main/resources/reference.conf @@ -4,6 +4,7 @@ ml.combust.mleap.spark.registry.builtin-ops = [ "org.apache.spark.ml.bundle.ops.classification.MultiLayerPerceptronClassifierOp", "org.apache.spark.ml.bundle.ops.classification.OneVsRestOp", "org.apache.spark.ml.bundle.ops.classification.RandomForestClassifierOp", + "org.apache.spark.ml.bundle.ops.classification.LightGBMClassificationModelOp", "org.apache.spark.ml.bundle.ops.clustering.GaussianMixtureOp", "org.apache.spark.ml.bundle.ops.clustering.KMeansOp", @@ -42,6 +43,7 @@ ml.combust.mleap.spark.registry.builtin-ops = [ "org.apache.spark.ml.bundle.ops.regression.IsotonicRegressionOp", "org.apache.spark.ml.bundle.ops.regression.LinearRegressionOp", "org.apache.spark.ml.bundle.ops.regression.RandomForestRegressionOp", + "org.apache.spark.ml.bundle.ops.regression.LightGBMRegressionModelOp", "org.apache.spark.ml.bundle.ops.recommendation.ALSOp", diff --git a/mleap-spark/src/main/scala/org/apache/spark/ml/bundle/ops/classification/LightGBMClassificationModelOp.scala b/mleap-spark/src/main/scala/org/apache/spark/ml/bundle/ops/classification/LightGBMClassificationModelOp.scala new file mode 100644 index 000000000..e07c68620 --- /dev/null +++ b/mleap-spark/src/main/scala/org/apache/spark/ml/bundle/ops/classification/LightGBMClassificationModelOp.scala @@ -0,0 +1,84 @@ +package org.apache.spark.ml.bundle.ops.classification + +import com.microsoft.ml.spark.lightgbm.{LightGBMBooster, LightGBMClassificationModel} +import ml.combust.bundle.BundleContext +import ml.combust.bundle.dsl.{Model, NodeShape, Value} +import ml.combust.bundle.op.OpModel +import org.apache.spark.ml.bundle._ + +class LightGBMClassificationModelOp + extends SimpleSparkOp[LightGBMClassificationModel] { + override def sparkInputs(obj: LightGBMClassificationModel): Seq[ParamSpec] = { + Seq("features" -> obj.featuresCol) + } + override def sparkOutputs(obj: LightGBMClassificationModel): Seq[ParamSpec] = { + Seq( + "raw_prediction" -> obj.rawPredictionCol, + "probability" -> obj.probabilityCol, + "prediction" -> obj.predictionCol + ) + } + override def sparkLoad( + uid: String, + shape: NodeShape, + model: LightGBMClassificationModel + ): LightGBMClassificationModel = { + val booster = new LightGBMBooster(model.getModel.model) + new LightGBMClassificationModel( + "", + booster, + model.getLabelCol, + model.getFeaturesCol, + model.getPredictionCol, + model.getProbabilityCol, + model.getRawPredictionCol, + Some(model.getThresholds), + model.numClasses + ) + } + + override val Model: OpModel[SparkBundleContext, LightGBMClassificationModel] = + new OpModel[SparkBundleContext, LightGBMClassificationModel] { + override val klazz: Class[LightGBMClassificationModel] = + classOf[LightGBMClassificationModel] + override def opName: String = "lightgbm_classifier" + override def store(model: Model, obj: LightGBMClassificationModel)( + implicit context: BundleContext[SparkBundleContext] + ): Model = { + assert( + context.context.dataset.isDefined, + BundleHelper.sampleDataframeMessage(klazz) + ) + val thresholds = if (obj.isSet(obj.thresholds)) { + Some(obj.getThresholds) + } else None + + model + .withValue("booster", Value.string(obj.getModel.model)) + .withValue("labelColName", Value.string(obj.getLabelCol)) + .withValue("featuresColName", Value.string(obj.getFeaturesCol)) + .withValue("predictionColName", Value.string(obj.getPredictionCol)) + .withValue("probColName", Value.string(obj.getProbabilityCol)) + .withValue("rawPredictionColName", Value.string(obj.getRawPredictionCol)) + .withValue("thresholdValues", thresholds.map(_.toSeq).map(Value.doubleList)) + .withValue("actualNumClasses", Value.int(obj.numClasses)) + } + + override def load(model: Model)( + implicit context: BundleContext[SparkBundleContext] + ): LightGBMClassificationModel = { + val booster = new LightGBMBooster(model.value("booster").getString) + new LightGBMClassificationModel( + "", + booster, + model.value("labelColName").getString, + model.value("featuresColName").getString, + model.value("predictionColName").getString, + model.value("probColName").getString, + model.value("rawPredictionColName").getString, + model.getValue("thresholdValues").map(_.getDoubleList.toArray), + model.value("actualNumClasses").getInt + ) + } + } +} diff --git a/mleap-spark/src/main/scala/org/apache/spark/ml/bundle/ops/regression/LightGBMRegressionModelOp.scala b/mleap-spark/src/main/scala/org/apache/spark/ml/bundle/ops/regression/LightGBMRegressionModelOp.scala new file mode 100644 index 000000000..9113593ac --- /dev/null +++ b/mleap-spark/src/main/scala/org/apache/spark/ml/bundle/ops/regression/LightGBMRegressionModelOp.scala @@ -0,0 +1,56 @@ +package org.apache.spark.ml.bundle.ops.regression + +import com.microsoft.ml.spark.lightgbm.{LightGBMBooster, LightGBMRegressionModel} +import ml.combust.bundle.BundleContext +import ml.combust.bundle.dsl.{Model, NodeShape, Value} +import ml.combust.bundle.op.OpModel +import org.apache.spark.ml.bundle._ + +class LightGBMRegressionModelOp extends SimpleSparkOp[LightGBMRegressionModel] { + + override def sparkInputs(obj: LightGBMRegressionModel): Seq[ParamSpec] = { + Seq("features" -> obj.featuresCol) + } + override def sparkOutputs(obj: LightGBMRegressionModel): Seq[ParamSpec] = { + Seq("prediction" -> obj.predictionCol) + } + override def sparkLoad( + uid: String, + shape: NodeShape, + model: LightGBMRegressionModel + ): LightGBMRegressionModel = { + val booster = new LightGBMBooster(model.getModel.model) + new LightGBMRegressionModel( + "", + booster, + model.getLabelCol, + model.getFeaturesCol, + model.getPredictionCol + ) + } + + override val Model: OpModel[SparkBundleContext, LightGBMRegressionModel] = + new OpModel[SparkBundleContext, LightGBMRegressionModel] { + override val klazz: Class[LightGBMRegressionModel] = classOf[LightGBMRegressionModel] + override def opName: String = "lightgbm_regression" + override def store(model: Model, obj: LightGBMRegressionModel)( + implicit context: BundleContext[SparkBundleContext]): Model = { + model + .withValue("featuresColName", Value.string(obj.getFeaturesCol)) + .withValue("predictionColName", Value.string(obj.getPredictionCol)) + .withValue("booster", Value.string(obj.getModel.model)) + } + + override def load(model: Model)( + implicit context: BundleContext[SparkBundleContext]): LightGBMRegressionModel = { + val booster = new LightGBMBooster(model.value("booster").getString) + new LightGBMRegressionModel( + "", + booster, + model.value("labelColName").getString, + model.value("featuresColName").getString, + model.value("predictionColName").getString + ) + } + } +}