I have a table in the AWS Glue catalog whose columns are all typed as string, and the underlying files are stored in S3 as Parquet. I want to create a Glue job that simply reads the data from that catalog table, partitions the files by date, and writes them back out to S3. However, I keep getting a SchemaColumnConvertNotSupportedException saying that a Parquet column cannot be converted.
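Conceptually the job should be trivial. Stripped down, what I am after looks roughly like this (the partition column here is just a placeholder; in the full job below I end up partitioning on the source file name instead):

from awsglue.context import GlueContext
from pyspark.context import SparkContext

glueContext = GlueContext(SparkContext.getOrCreate())

# Read the catalog table (every column is typed as string in the catalog)
dyf = glueContext.create_dynamic_frame.from_catalog(
    database = "processed",
    table_name = "compass_2_daily_tkt_parquet")

# Write it straight back out, partitioned by a date column
# ("dt_of_issue" is just a stand-in for whichever date column I pick)
glueContext.write_dynamic_frame.from_options(
    frame = dyf,
    connection_type = "s3",
    connection_options = {"path": "s3://landing-spot/", "partitionKeys": ["dt_of_issue"]},
    format = "parquet")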
I have tried the ApplyMapping transform to make sure all of the data is represented as strings in the DynamicFrame:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import input_file_name

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read the table from the Glue catalog
dyf = glueContext.create_dynamic_frame.from_catalog(database = "processed", table_name = "compass_2_daily_tkt_parquet")

# Explicitly map every column to string (only load_ts stays a timestamp)
mapping_dyf = ApplyMapping.apply(frame = dyf, mappings = [
    ("tkt_id", "string", "tkt_id", "string"), ("tkt_nbr", "string", "tkt_nbr", "string"),
    ("tran_nbr", "string", "tran_nbr", "string"), ("pnr_ref", "string", "pnr_ref", "string"),
    ("provider_id", "string", "provider_id", "string"), ("carr_nbr", "string", "carr_nbr", "string"),
    ("agency_nbr", "string", "agency_nbr", "string"), ("point_of_orig_arpt_cd", "string", "point_of_orig_arpt_cd", "string"),
    ("dt_of_issue", "string", "dt_of_issue", "string"), ("curr_type_cd", "string", "curr_type_cd", "string"),
    ("fare_amt", "string", "fare_amt", "string"), ("fare_curr_type_cd", "string", "fare_curr_type_cd", "string"),
    ("tax_amt", "string", "tax_amt", "string"), ("fee_amt", "string", "fee_amt", "string"),
    ("comm_amt", "string", "comm_amt", "string"), ("doc_amt", "string", "doc_amt", "string"),
    ("pfc_amt", "string", "pfc_amt", "string"), ("proc_ped", "string", "proc_ped", "string"),
    ("intl_sale_ind", "string", "intl_sale_ind", "string"), ("e_tkt_ind", "string", "e_tkt_ind", "string"),
    ("fare_calc_ind", "string", "fare_calc_ind", "string"), ("tour_cd", "string", "tour_cd", "string"),
    ("dist_channel", "string", "dist_channel", "string"), ("cntry_cd", "string", "cntry_cd", "string"),
    ("stat_cd", "string", "stat_cd", "string"), ("tran_cd", "string", "tran_cd", "string"),
    ("data_source_cd", "string", "data_source_cd", "string"), ("data_sharing_id", "string", "data_sharing_id", "string"),
    ("load_ts", "timestamp", "load_ts", "timestamp"), ("net_fare_amt", "string", "net_fare_amt", "string"),
    ("suppl_fare_amt", "string", "suppl_fare_amt", "string"), ("contributed_ind", "string", "contributed_ind", "string"),
    ("print_cpui", "string", "print_cpui", "string"), ("file_id", "string", "file_id", "string"),
    ("print_fare_basis", "string", "print_fare_basis", "string"), ("waiver_cd", "string", "waiver_cd", "string"),
    ("comp_conj_cd", "string", "comp_conj_cd", "string"), ("tot_amt", "string", "tot_amt", "string"),
    ("bulk_ticketing_cd", "string", "bulk_ticketing_cd", "string"), ("equivalent_fare_amt", "string", "equivalent_fare_amt", "string"),
    ("equivalent_fare_curr_type_cd", "string", "equivalent_fare_curr_type_cd", "string"), ("fare_amt_usd", "string", "fare_amt_usd", "string"),
    ("tax_amt_usd", "string", "tax_amt_usd", "string"), ("doc_amt_usd", "string", "doc_amt_usd", "string"),
    ("fare_amt_eur", "string", "fare_amt_eur", "string"), ("tax_amt_eur", "string", "tax_amt_eur", "string"),
    ("doc_amt_eur", "string", "doc_amt_eur", "string"), ("depart_dt", "string", "depart_dt", "string"),
    ("trip_type_cd", "string", "trip_type_cd", "string"), ("tkt_dest_cd", "string", "tkt_dest_cd", "string"),
    ("dds_tkt_rndmzn_pcg", "string", "dds_tkt_rndmzn_pcg", "string")])

# Tag each row with the file it came from and partition the output by that column
tkt_df = mapping_dyf.toDF().withColumn("filename", input_file_name())

# Write out with Spark, partitioned by filename
tkt_df.repartition("filename").write.partitionBy("filename").mode("append").parquet("s3://landing-spot")

# A variant using the partitionBy argument to parquet()
tkt_df.write.parquet("s3://landing-spot", partitionBy=["filename"])

# And writing with the Glue sink, after converting back to a DynamicFrame
tkt_dyf = DynamicFrame.fromDF(tkt_df, glueContext, "tkt_dyf")
datasink = glueContext.write_dynamic_frame.from_options(frame = tkt_dyf, connection_type = "s3",
    connection_options = {"path": "s3://landing-spot/", "partitionKeys": ["filename"]}, format = "parquet")

job.commit()
The job fails with the following stack trace:

Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException
    at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.constructConvertNotSupportedException(VectorizedColumnReader.java:250)
    at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBinaryBatch(VectorizedColumnReader.java:497)
    at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:220)
    at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:261)
    at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:159)
    at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:181)
    ... 28 more
2019-08-07 19:57:41,651 ERROR [Executor task launch worker for task 7] executor.Executor (Logging.scala:logError(91)) - Exception in task 6.0 in stage 1.0 (TID 7)
org.apache.spark.sql.execution.QueryExecutionException: Parquet column cannot be converted in file s3://file.parquet. Column: [tran_nbr], Expected: double, Found: BINARY
This confuses me: I am applying the mapping to the DynamicFrame, yet when the job goes to write the data out to S3, it seems as if AWS Glue re-infers the schema from the underlying Parquet files all over again.
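For what it is worth, my mental model is that the ApplyMapping call above should be roughly equivalent to casting every column on the Spark side before anything is written, something like this (column list abbreviated, and this is just my assumption about what ApplyMapping does):

from pyspark.sql.functions import col

# What I expect ApplyMapping to give me: every column forced to string
# (column list abbreviated; load_ts would stay a timestamp)
cols_to_string = ["tkt_id", "tkt_nbr", "tran_nbr", "pnr_ref", "fare_amt"]  # ... and so on
tkt_df = dyf.toDF().select([col(c).cast("string").alias(c) for c in cols_to_string])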