Commit 2a88c90

Author: Chengfeng Mao
Merge pull request #12 from hydrator/feature/add-dependencies-spark-comp

Add dependencies property for spark compute

2 parents a8891a2 + a8adfc8, commit 2a88c90

4 files changed: +82, -35 lines changed


src/main/java/co/cask/hydrator/plugin/spark/dynamic/ScalaSparkCompute.java

Lines changed: 39 additions & 3 deletions
@@ -34,20 +34,21 @@
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.rdd.RDD;
-import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
 import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.StructType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.File;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.io.StringWriter;
 import java.lang.reflect.Method;
 import java.lang.reflect.ParameterizedType;
 import java.lang.reflect.Type;
+import java.nio.file.Files;
 import javax.annotation.Nullable;
 
 /**
@@ -96,10 +97,16 @@ public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws Ille
       throw new IllegalArgumentException("Unable to parse output schema " + config.getSchema(), e);
     }
 
-    if (!config.containsMacro("scalaCode") && Boolean.TRUE.equals(config.getDeployCompile())) {
+    if (!config.containsMacro("scalaCode") && !config.containsMacro("dependencies")
+        && Boolean.TRUE.equals(config.getDeployCompile())) {
       SparkInterpreter interpreter = SparkCompilers.createInterpreter();
       if (interpreter != null) {
+        File dir = null;
         try {
+          if (config.getDependencies() != null) {
+            dir = Files.createTempDirectory("sparkprogram").toFile();
+            SparkCompilers.addDependencies(dir, interpreter, config.getDependencies());
+          }
           // We don't need the actual stage name as this only happen in deployment time for compilation check.
           String className = generateClassName("dummy");
           interpreter.compile(generateSourceClass(className));
@@ -114,6 +121,10 @@ public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws Ille
 
         } catch (CompilationFailureException e) {
           throw new IllegalArgumentException(e.getMessage(), e);
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        } finally {
+          SparkCompilers.deleteDir(dir);
         }
       }
     }
@@ -123,8 +134,16 @@ public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws Ille
   public void initialize(SparkExecutionPluginContext context) throws Exception {
     String className = generateClassName(context.getStageName());
     interpreter = context.createSparkInterpreter();
+    File dir = config.getDependencies() == null ? null : Files.createTempDirectory("sparkprogram").toFile();
+    try {
+      if (config.getDependencies() != null) {
+        SparkCompilers.addDependencies(dir, interpreter, config.getDependencies());
+      }
     interpreter.compile(generateSourceClass(className));
     method = getTransformMethod(interpreter.getClassLoader(), className);
+    } finally {
+      SparkCompilers.deleteDir(dir);
+    }
     isDataFrame = method.getParameterTypes()[0].equals(DATAFRAME_TYPE);
     takeContext = method.getParameterTypes().length == 2;
 
@@ -329,6 +348,16 @@ public static final class Config extends PluginConfig {
     @Macro
     private final String scalaCode;
 
+    @Description(
+      "Extra dependencies for the Spark program. " +
+      "It is a ',' separated list of URI for the location of dependency jars. " +
+      "A path can be ended with an asterisk '*' as a wildcard, in which all files with extension '.jar' under the " +
+      "parent path will be included."
+    )
+    @Macro
+    @Nullable
+    private final String dependencies;
+
     @Description("The schema of output objects. If no schema is given, it is assumed that the output schema is " +
                  "the same as the input schema.")
     @Nullable
@@ -340,9 +369,11 @@ public static final class Config extends PluginConfig {
     @Nullable
     private final Boolean deployCompile;
 
-    public Config(String scalaCode, @Nullable String schema, @Nullable Boolean deployCompile) {
+    public Config(String scalaCode, @Nullable String schema, @Nullable String dependencies,
+                  @Nullable Boolean deployCompile) {
       this.scalaCode = scalaCode;
      this.schema = schema;
+      this.dependencies = dependencies;
       this.deployCompile = deployCompile;
     }
 
@@ -355,6 +386,11 @@ public String getSchema() {
       return schema;
     }
 
+    @Nullable
+    public String getDependencies() {
+      return dependencies;
+    }
+
     @Nullable
     public Boolean getDeployCompile() {
       return deployCompile;

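For reference, the new dependencies property is a comma-separated list of jar URIs, and a path ending in '*' expands to every '.jar' file under its parent directory. The sketch below shows one way the widened Config constructor could be exercised, for example from a unit test; the class name, Scala snippet, and dependency URIs are hypothetical illustration values (they assume the plugin and CDAP API jars are on the classpath), not something taken from this commit.

import co.cask.hydrator.plugin.spark.dynamic.ScalaSparkCompute;

public class ConfigExample {
  public static void main(String[] args) {
    // Hypothetical transform body; any valid transform() definition would do here.
    String scalaCode =
        "def transform(df: DataFrame, context: SparkExecutionPluginContext) : DataFrame = { df }";

    // Hypothetical dependency list: one explicit jar plus a '*' wildcard that expands to
    // every '.jar' file under /opt/extra-libs, per the property description in the diff above.
    String dependencies = "hdfs://namenode/jars/udfs.jar,file:/opt/extra-libs/*";

    // schema is null, so the output schema falls back to the input schema; deployCompile stays on.
    ScalaSparkCompute.Config config =
        new ScalaSparkCompute.Config(scalaCode, null, dependencies, true);

    System.out.println(config.getDependencies());
  }
}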
src/main/java/co/cask/hydrator/plugin/spark/dynamic/ScalaSparkProgram.java

Lines changed: 2 additions & 32 deletions
@@ -35,11 +35,7 @@
 import java.io.IOException;
 import java.lang.reflect.Method;
 import java.lang.reflect.Modifier;
-import java.nio.file.FileVisitResult;
 import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.SimpleFileVisitor;
-import java.nio.file.attribute.BasicFileAttributes;
 import java.util.concurrent.Callable;
 import javax.annotation.Nullable;
 
@@ -79,7 +75,7 @@ public ScalaSparkProgram(Config config) throws CompilationFailureException, IOEx
           getMethodCallable(interpreter.getClassLoader(), config.getMainClass(), null);
         }
       } finally {
-        deleteDir(dir);
+        SparkCompilers.deleteDir(dir);
       }
     } finally {
       interpreter.close();
@@ -98,7 +94,7 @@ public void run(JavaSparkExecutionContext sec) throws Exception {
       interpreter.compile(config.getScalaCode());
       getMethodCallable(interpreter.getClassLoader(), config.getMainClass(), sec).call();
     } finally {
-      deleteDir(dir);
+      SparkCompilers.deleteDir(dir);
     }
   }
 
@@ -166,32 +162,6 @@ public Void call() throws Exception {
     }
   }
 
-  /**
-   * Recursively delete a directory.
-   */
-  public static void deleteDir(@Nullable File dir) {
-    if (dir == null) {
-      return;
-    }
-    try {
-      Files.walkFileTree(dir.toPath(), new SimpleFileVisitor<Path>() {
-        @Override
-        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
-          Files.deleteIfExists(file);
-          return FileVisitResult.CONTINUE;
-        }
-
-        @Override
-        public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
-          Files.deleteIfExists(dir);
-          return FileVisitResult.CONTINUE;
-        }
-      });
-    } catch (IOException e) {
-      LOG.warn("Failed to cleanup temporary directory {}", dir, e);
-    }
-  }
-
   /**
    * Plugin configuration
   */

src/main/java/co/cask/hydrator/plugin/spark/dynamic/SparkCompilers.java

Lines changed: 33 additions & 0 deletions
@@ -24,6 +24,8 @@
 import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RemoteIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import scala.Function0;
 import scala.Option$;
 import scala.collection.JavaConversions;
@@ -44,7 +46,10 @@
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.net.URL;
+import java.nio.file.FileVisitResult;
 import java.nio.file.Files;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -57,6 +62,8 @@
  */
 public final class SparkCompilers {
 
+  private static final Logger LOG = LoggerFactory.getLogger(SparkCompilers.class);
+
   private static final FilenameFilter JAR_FILE_FILTER = new FilenameFilter() {
     @Override
     public boolean accept(File dir, String name) {
@@ -214,4 +221,30 @@ private static void copyPathAndAdd(FileSystem fs, Path from, File dir, Collectio
   private SparkCompilers() {
     // no-op
   }
+
+  /**
+   * Recursively delete a directory.
+   */
+  public static void deleteDir(@Nullable File dir) {
+    if (dir == null) {
+      return;
+    }
+    try {
+      Files.walkFileTree(dir.toPath(), new SimpleFileVisitor<java.nio.file.Path>() {
+        @Override
+        public FileVisitResult visitFile(java.nio.file.Path file, BasicFileAttributes attrs) throws IOException {
+          Files.deleteIfExists(file);
+          return FileVisitResult.CONTINUE;
+        }
+
+        @Override
+        public FileVisitResult postVisitDirectory(java.nio.file.Path dir, IOException exc) throws IOException {
+          Files.deleteIfExists(dir);
+          return FileVisitResult.CONTINUE;
+        }
+      });
+    } catch (IOException e) {
+      LOG.warn("Failed to cleanup temporary directory {}", dir, e);
+    }
+  }
 }

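The dependency string is handed to SparkCompilers.addDependencies, and the class already declares a JAR_FILE_FILTER (visible as context above) whose name suggests it matches only '.jar' files. As a rough, self-contained illustration of the wildcard rule described by the new property, not the plugin's actual implementation, a trailing '*' could be resolved roughly like this (the expand helper and sample paths are hypothetical):

import java.io.File;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class WildcardJarExpansionExample {

  // Mirrors the idea behind SparkCompilers.JAR_FILE_FILTER: accept only '.jar' files.
  private static final FilenameFilter JAR_FILE_FILTER = new FilenameFilter() {
    @Override
    public boolean accept(File dir, String name) {
      return name.endsWith(".jar");
    }
  };

  // Expands one dependency entry: a path ending in '*' yields every '.jar' file under the
  // parent directory; anything else is returned unchanged.
  static List<File> expand(String path) {
    List<File> jars = new ArrayList<>();
    if (path.endsWith("*")) {
      File parent = new File(path.substring(0, path.length() - 1));
      File[] matches = parent.listFiles(JAR_FILE_FILTER);
      if (matches != null) {
        jars.addAll(Arrays.asList(matches));
      }
    } else {
      jars.add(new File(path));
    }
    return jars;
  }

  public static void main(String[] args) {
    // Hypothetical comma-separated value, shaped like the 'dependencies' property.
    String dependencies = "/opt/extra-libs/*,/tmp/single-dep.jar";
    for (String entry : dependencies.split(",")) {
      System.out.println(entry + " -> " + expand(entry.trim()));
    }
  }
}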
widgets/ScalaSparkCompute-sparkcompute.json

Lines changed: 8 additions & 0 deletions
@@ -15,6 +15,14 @@
         "default": "/**\n * Transforms the provided input Apache Spark RDD or DataFrame into another RDD or DataFrame.\n *\n * The input DataFrame has the same schema as the input schema to this stage and the transform method should return a DataFrame that has the same schema as the output schema setup for this stage.\n * To emit logs, use: \n * import org.slf4j.LoggerFactory\n * val logger = LoggerFactory.getLogger('mylogger')\n * logger.info('Logging')\n *\n *\n * @param input the input DataFrame which has the same schema as the input schema to this stage.\n * @param context a SparkExecutionPluginContext object that can be used to emit zero or more records (using the emitter.emit() method) or errors (using the emitter.emitError() method) \n * @param context an object that provides access to:\n * 1. CDAP Datasets and Streams - context.fromDataset('counts'); or context.fromStream('input');\n * 2. Original Spark Context - context.getSparkContext();\n * 3. Runtime Arguments - context.getArguments.get('priceThreshold')\n */\ndef transform(df: DataFrame, context: SparkExecutionPluginContext) : DataFrame = {\n df\n}"
       }
     },
+    {
+      "widget-type": "dsv",
+      "label": "Dependencies",
+      "name": "dependencies",
+      "widget-attributes": {
+        "delimiter": ","
+      }
+    },
     {
       "widget-type": "select",
       "label": "Compile at Deployment Time",
