[CARBONDATA-4190] Integrate Carbondata with Spark 3.1.1 version
Why is this PR needed?
To integrate CarbonData with Spark 3.1.1.

What changes were proposed in this PR?
Refactored code to support Spark 3.1.1 alongside the existing Spark 2.3 and 2.4 versions.
Changes:

1. Compile-related changes
	1. New Spark package in the MV, Streaming, and spark-integration modules.
	2. API changes to match the corresponding Spark API changes.
2. Spark has moved to the Proleptic Gregorian calendar, which requires corresponding timestamp-related changes in CarbonData.
3. Refactored the show-segments-by-select command.
4. Ignored a few Lucene test cases because of a deadlock in the Spark DAGScheduler that prevents them from running.
5. Alter rename: enabled the parser in Carbon and added a check for Carbon tables.
6. doExecuteColumnar() changes in CarbonDataSourceScan.scala.
7. Adopted the char/varchar changes from the Spark side.
8. Renamed a rule in MV.
9. Changed the CSVParser version used by the univocity parser.
10. Added new configs in SparkTestQueryExecutor to keep some behaviour the same as in 2.3 and 2.4 (see the sketch after this list).
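
For items 2 and 10, a minimal sketch of how such legacy-behaviour configs can be pinned on a Spark 3.1 session. Only spark.sql.legacy.timeParserPolicy is visible in the diffs below (in ExampleUtils.scala); the rebase-mode key is an illustrative Spark 3.x option, not necessarily one of the configs added to SparkTestQueryExecutor:

import org.apache.spark.sql.SparkSession

// Sketch only: pin Spark 3.1 back to pre-3.0 datetime behaviour.
val spark = SparkSession
  .builder()
  .master("local[2]")
  .appName("LegacyBehaviourSketch")
  // Seen in this commit: parse/format timestamps with the pre-Spark-3.0
  // parser instead of the Proleptic Gregorian based one.
  .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
  // Assumption, for illustration only: rebase ancient dates/timestamps on
  // Parquet write using the hybrid Julian+Gregorian calendar, as 2.x did.
  .config("spark.sql.legacy.parquet.datetimeRebaseModeInWrite", "LEGACY")
  .getOrCreate()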

Does this PR introduce any user interface change?
No

Is any new test case added?
No

This closes #4141
vikramahuja1001 authored and akashrn5 committed Jun 23, 2021
1 parent 18665cc commit 8ceb4fd
Showing 178 changed files with 4,548 additions and 1,650 deletions.
4 changes: 3 additions & 1 deletion LICENSE
@@ -210,4 +210,6 @@
 BSD 2-Clause
 ------------

-com.github.luben:zstd-jni
+com.github.luben:zstd-jni
+
+com.github.paul-hammant:paranamer
2 changes: 1 addition & 1 deletion examples/flink/pom.xml
@@ -30,7 +30,7 @@
   <name>Apache CarbonData :: Flink Examples</name>

   <properties>
-    <flink.version>1.1.4</flink.version>
+    <flink.version>1.12.2</flink.version>
     <dev.path>${basedir}/../../dev</dev.path>
   </properties>

18 changes: 12 additions & 6 deletions examples/spark/pom.xml
@@ -38,12 +38,6 @@
     <groupId>org.apache.carbondata</groupId>
     <artifactId>carbondata-spark_${spark.binary.version}</artifactId>
     <version>${project.version}</version>
-    <exclusions>
-      <exclusion>
-        <groupId>org.apache.hive</groupId>
-        <artifactId>hive-exec</artifactId>
-      </exclusion>
-    </exclusions>
   </dependency>
   <dependency>
     <groupId>org.apache.httpcomponents</groupId>
@@ -81,6 +75,11 @@
       <artifactId>scalatest_${scala.binary.version}</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+      <version>${dep.jackson.version}</version>
+    </dependency>
   </dependencies>

   <build>
@@ -214,5 +213,12 @@
         <spark.binary.version>2.4</spark.binary.version>
       </properties>
     </profile>
+    <profile>
+      <id>spark-3.1</id>
+      <properties>
+        <spark.binary.version>3.1</spark.binary.version>
+        <dep.jackson.version>2.10.0</dep.jackson.version>
+      </properties>
+    </profile>
   </profiles>
 </project>
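
With the new profile in place, the Spark 3.1 build of this module would presumably be selected with Maven's standard profile switch, e.g. mvn -DskipTests -Pspark-3.1 clean package; the exact command is an assumption here, so check the project's build documentation.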
StreamingWithRowParserExample.scala
@@ -21,7 +21,7 @@ import java.io.{File, PrintWriter}
 import java.net.ServerSocket

 import org.apache.spark.sql.{CarbonEnv, SparkSession}
-import org.apache.spark.sql.streaming.{ProcessingTime, StreamingQuery}
+import org.apache.spark.sql.streaming.{StreamingQuery, Trigger}

 import org.apache.carbondata.core.util.path.CarbonTablePath
 import org.apache.carbondata.examples.util.ExampleUtils
@@ -165,7 +165,7 @@ object StreamingWithRowParserExample {
       // Write data from socket stream to carbondata file
       qry = readSocketDF.writeStream
         .format("carbondata")
-        .trigger(ProcessingTime("5 seconds"))
+        .trigger(Trigger.ProcessingTime("5 seconds"))
         .option("checkpointLocation", CarbonTablePath.getStreamingCheckpointDir(tablePath))
         .option("dbName", "default")
         .option("tableName", "stream_table_with_row_parser")
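
This example and StructuredStreamingExample below make the same substitution: org.apache.spark.sql.streaming.ProcessingTime was deprecated in Spark 2.2 and is gone in Spark 3, with Trigger.ProcessingTime as its replacement. A minimal self-contained sketch of the new call, using Spark's built-in rate source and console sink as stand-ins for the CarbonData table:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

val spark = SparkSession.builder()
  .master("local[2]")
  .appName("TriggerSketch")
  .getOrCreate()

// Fire a micro-batch every 5 seconds, matching the examples above.
val query = spark.readStream
  .format("rate")        // test source emitting (timestamp, value) rows
  .load()
  .writeStream
  .format("console")     // stand-in for the "carbondata" sink
  .trigger(Trigger.ProcessingTime("5 seconds"))
  .start()

query.awaitTermination(15000)  // run briefly, then return
spark.stop()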
StructuredStreamingExample.scala
@@ -21,7 +21,7 @@ import java.io.{File, PrintWriter}
 import java.net.ServerSocket

 import org.apache.spark.sql.{CarbonEnv, SparkSession}
-import org.apache.spark.sql.streaming.{ProcessingTime, StreamingQuery}
+import org.apache.spark.sql.streaming.{StreamingQuery, Trigger}

 import org.apache.carbondata.core.metadata.schema.table.CarbonTable
 import org.apache.carbondata.core.util.path.CarbonTablePath
@@ -151,7 +151,7 @@ object StructuredStreamingExample {
       // Write data from socket stream to carbondata file
       qry = readSocketDF.writeStream
         .format("carbondata")
-        .trigger(ProcessingTime("5 seconds"))
+        .trigger(Trigger.ProcessingTime("5 seconds"))
         .option("checkpointLocation",
           CarbonTablePath.getStreamingCheckpointDir(carbonTable.getTablePath))
         .option("dbName", "default")
ExampleUtils.scala
@@ -85,6 +85,7 @@ object ExampleUtils {
     } else {
       "local[" + workThreadNum.toString() + "]"
     }
+    // TODO: Analyse the legacy configs and add test cases for non legacy ones
     val spark = SparkSession
       .builder()
       .master(masterUrl)
@@ -93,6 +94,7 @@
       .config("spark.driver.host", "localhost")
       .config("spark.sql.crossJoin.enabled", "true")
       .config("spark.sql.extensions", "org.apache.spark.sql.CarbonExtensions")
+      .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
       .enableHiveSupport()
       .getOrCreate()
     CarbonEnv.getInstance(spark)
RunExamples.scala
@@ -104,7 +104,9 @@ class RunExamples extends QueryTest with BeforeAndAfterAll {
     TableLevelCompactionOptionExample.exampleBody(spark)
   }

-  test("LuceneIndexExample") {
+  // Below test case ignored due to the Deadlock in spark code
+  // TODO: To be fixed when spark removes deadlock in opensource code.
+  ignore("LuceneIndexExample") {
     LuceneIndexExample.exampleBody(spark)
   }

9 changes: 6 additions & 3 deletions index/examples/pom.xml
@@ -81,9 +81,6 @@
   <profiles>
     <profile>
       <id>spark-2.3</id>
-      <activation>
-        <activeByDefault>true</activeByDefault>
-      </activation>
       <properties>
         <spark.binary.version>2.3</spark.binary.version>
       </properties>
@@ -94,6 +91,12 @@
         <spark.binary.version>2.4</spark.binary.version>
       </properties>
     </profile>
+    <profile>
+      <id>spark-3.1</id>
+      <properties>
+        <spark.binary.version>3.1</spark.binary.version>
+      </properties>
+    </profile>
   </profiles>

 </project>
13 changes: 6 additions & 7 deletions index/secondary-index/pom.xml
@@ -39,12 +39,6 @@
     <groupId>org.apache.carbondata</groupId>
     <artifactId>carbondata-spark_${spark.binary.version}</artifactId>
     <version>${project.version}</version>
-    <exclusions>
-      <exclusion>
-        <groupId>org.apache.hive</groupId>
-        <artifactId>hive-exec</artifactId>
-      </exclusion>
-    </exclusions>
     <scope>test</scope>
   </dependency>
   <dependency>
@@ -54,7 +48,6 @@
   <dependency>
     <groupId>org.scalatest</groupId>
     <artifactId>scalatest_${scala.binary.version}</artifactId>
-    <version>2.2.1</version>
     <scope>test</scope>
   </dependency>
   <dependency>
@@ -178,6 +171,12 @@
         <spark.binary.version>2.4</spark.binary.version>
       </properties>
     </profile>
+    <profile>
+      <id>spark-3.1</id>
+      <properties>
+        <spark.binary.version>3.1</spark.binary.version>
+      </properties>
+    </profile>
   </profiles>

 </project>