Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
#FROM docker.io/bitnami/spark:3.1.2
FROM apache/spark:3.5.0-python3
FROM apache/spark:3.5.5-java17-python3
USER 0
RUN apt-get update && \
apt install -y curl vim
ENV SPARK_MASTER local[*]
ENV ZINGG_HOME /zingg-0.6.0
ENV PATH $ZINGG_HOME/scripts:$PATH
ENV LANG C.UTF-8
ENV SPARK_MASTER=local[*]
ENV ZINGG_HOME=/zingg-0.6.0
ENV PATH=$ZINGG_HOME/scripts:$PATH
ENV LANG=C.UTF-8
WORKDIR /
USER root
WORKDIR /zingg-0.6.0
RUN curl --location https://github.com/zinggAI/zingg/releases/download/v0.6.0/zingg-0.6.0-spark-3.5.5.tar.gz | \
tar --extract --gzip --strip=1
tar --extract --gzip --strip=1
RUN pip install -r python/requirements.txt
RUN pip install zingg
RUN chmod -R a+rwx /zingg-0.6.0/models
Expand Down
201 changes: 110 additions & 91 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
</activation>
<properties>
<spark.version>3.4.0</spark.version>
<scala.version>2.12.17</scala.version>
<scala.version>2.12.19</scala.version>
<spark.binary.version>3.4</spark.binary.version>
<scala.binary.version>2.12</scala.binary.version>
<graphframes.version>0.10.0</graphframes.version>
Expand All @@ -56,7 +56,7 @@
</activation>
<properties>
<spark.version>3.5.5</spark.version>
<scala.version>2.12.10</scala.version>
<scala.version>2.12.19</scala.version>
<spark.binary.version>3.5</spark.binary.version>
<scala.binary.version>2.12</scala.binary.version>
<graphframes.version>0.10.0</graphframes.version>
Expand All @@ -67,13 +67,13 @@
<zingg.version>0.6.0</zingg.version>
<skipTests>false</skipTests>
<failIfNoTests>false</failIfNoTests>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<mcheckstyle.version>2.10</mcheckstyle.version>
<mfindbugs.version>2.5.2</mfindbugs.version>
<mjavadoc.version>2.9.1</mjavadoc.version>
<maven-compiler-plugin.version>3.8.1</maven-compiler-plugin.version>
<maven-compiler-plugin.version>3.15.0</maven-compiler-plugin.version>
<maven-jar-plugin.version>2.3.2</maven-jar-plugin.version>
<commons.lang.version>2.3</commons.lang.version>
<shade.plugin.version>2.4.3</shade.plugin.version>
Expand All @@ -89,7 +89,7 @@
<repository>
<id>SparkPackagesRepo</id>
<url>https://repos.spark-packages.org/</url>
</repository>
</repository>
</repositories>

<dependencies>
Expand All @@ -113,7 +113,7 @@
<exclusions>
<exclusion>
<groupId>commons-beanutils</groupId>
<artifactId>commons-beanutils</artifactId>
<artifactId>commons-beanutils</artifactId>
</exclusion>
</exclusions>
</dependency>
Expand Down Expand Up @@ -146,27 +146,27 @@
<artifactId>commons-logging</artifactId>
<version>1.1.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.15.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.15.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.15.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_2.12</artifactId>
<version>2.15.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.15.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.15.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.15.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_2.12</artifactId>
<version>2.15.2</version>
</dependency>

</dependencies>
<build>
<plugins>
Expand All @@ -189,6 +189,10 @@
<source>${maven.compiler.source}</source>
<target>${maven.compiler.target}</target>
<showWarnings>true</showWarnings>
<compilerArgs>
<arg>-Xlint:deprecation</arg>
<arg>-Xlint:unchecked</arg>
</compilerArgs>
</configuration>
</plugin>
<plugin>
Expand Down Expand Up @@ -216,70 +220,85 @@
<usePhrasedClassNameInTestCaseSummary>true</usePhrasedClassNameInTestCaseSummary>
</statelessTestsetInfoReporter>
<parallel>classes</parallel>
<forkCount>4</forkCount>

</configuration>
</plugin>
<forkCount>4</forkCount>
<argLine>
--add-opens=java.base/java.lang=ALL-UNNAMED
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
--add-opens=java.base/java.io=ALL-UNNAMED
--add-opens=java.base/java.net=ALL-UNNAMED
--add-opens=java.base/java.nio=ALL-UNNAMED
--add-opens=java.base/java.util=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
--add-opens=java.base/sun.nio.cs=ALL-UNNAMED
--add-opens=java.base/sun.security.action=ALL-UNNAMED
--add-opens=java.base/sun.util.calendar=ALL-UNNAMED
--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED
</argLine>
</configuration>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.8.6</version>
<executions>
<execution>
<id>prepare-agent</id>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>report</id>
<phase>prepare-package</phase>
<goals>
<goal>report</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.8.6</version>
<executions>
<execution>
<id>prepare-agent</id>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>report</id>
<phase>prepare-package</phase>
<goals>
<goal>report</goal>
</goals>
</execution>
</executions>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
<inherited>false</inherited>
<executions>
<execution>
<id>install-external-ss</id>
<phase>validate</phase>
<configuration>
<file>${project.basedir}/thirdParty/lib/secondstring.jar</file>
<repositoryLayout>default</repositoryLayout>
<groupId>com.wcohen.ss</groupId>
<artifactId>secondstring</artifactId>
<version>2021</version>
<packaging>jar</packaging>
<generatePom>true</generatePom>
</configuration>
<goals>
<goal>install-file</goal>
</goals>
</execution>
<execution>
<id>install-external-py4j</id>
<phase>validate</phase>
<configuration>
<file>${project.basedir}/thirdParty/lib/py4j0.10.9.jar</file>
<repositoryLayout>default</repositoryLayout>
<groupId>py4j</groupId>
<artifactId>py4j</artifactId>
<version>0.10.9</version>
<packaging>jar</packaging>
<generatePom>true</generatePom>
</configuration>
<goals>
<goal>install-file</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
<inherited>false</inherited>
<executions>
<execution>
<id>install-external-ss</id>
<phase>validate</phase>
<configuration>
<file>${project.basedir}/thirdParty/lib/secondstring.jar</file>
<repositoryLayout>default</repositoryLayout>
<groupId>com.wcohen.ss</groupId>
<artifactId>secondstring</artifactId>
<version>2021</version>
<packaging>jar</packaging>
<generatePom>true</generatePom>
</configuration>
<goals>
<goal>install-file</goal>
</goals>
</execution>
<execution>
<id>install-external-py4j</id>
<phase>validate</phase>
<configuration>
<file>${project.basedir}/thirdParty/lib/py4j0.10.9.jar</file>
<repositoryLayout>default</repositoryLayout>
<groupId>py4j</groupId>
<artifactId>py4j</artifactId>
<version>0.10.9</version>
<packaging>jar</packaging>
<generatePom>true</generatePom>
</configuration>
<goals>
<goal>install-file</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</plugins>
</build>
</project>
15 changes: 7 additions & 8 deletions python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
pandas
seaborn
matplotlib
sphinx
sphinx_rtd_theme
pyspark
numpy

pyspark==3.5.5
numpy==1.24.4
pandas==2.0.3
matplotlib==3.7.5
seaborn==0.13.2
sphinx==7.1.2
sphinx_rtd_theme==2.0.0
17 changes: 16 additions & 1 deletion scripts/zingg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,20 @@ if [[ $RUN_PYTHON_DB_CONNECT_PHASE -eq 1 ]]; then
python $EXECUTABLE
else
# All the additional options must be added here
$SPARK_HOME/bin/spark-submit --master $SPARK_MASTER $PROPERTIES --files "./log4j2.properties" --conf spark.executor.extraJavaOptions="$log4j_setting -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -Xloggc:/tmp/memLog.txt -XX:+UseCompressedOops" --conf spark.driver.extraJavaOptions="$log4j_setting" $LOGGING --driver-class-path $ZINGG_JARS $EXECUTABLE $@ --email $EMAIL --license $LICENSE
GC_LOG_DIR="${GC_LOG_DIR:-/tmp}"
GC_LOG_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=${GC_LOG_DIR}/heapdump-%p.hprof -Xlog:gc*:file=${GC_LOG_DIR}/memLog.txt:time,uptime,level,tags"
export SPARK_SUBMIT_OPTS="${SPARK_SUBMIT_OPTS:-} $log4j_setting $GC_LOG_OPTS"
export SPARK_DRIVER_MEMORY="${SPARK_DRIVER_MEMORY:-4g}"
$SPARK_HOME/bin/spark-submit \
--master "$SPARK_MASTER" \
$PROPERTIES \
--files "./log4j2.properties" \
--driver-java-options "$SPARK_JAVA_OPTS" \
--conf "spark.executor.extraJavaOptions=$SPARK_JAVA_OPTS" \
$LOGGING \
--driver-class-path "$ZINGG_JARS" \
"$EXECUTABLE" \
"$@" \
--email "$EMAIL" \
--license "$LICENSE"
fi