I'm new to Pyspark so just learning as I go.
I'm trying to experiment with UnitTest and I am getting some error below:
def drop_duplicates(df): df = df.dropDuplicates(df) return dfimport unittestclass TestNotebook(unittest.TestCase): def test_drop_duplicates(self): data = ( ['1', '2020-01-01 00:00:00', '2020-01-01 01:00:00', '2', '1'], ['1', '2020-01-01 00:00:00', '2020-01-01 01:00:00', '3', '1'], ['1', '2020-01-01 00:00:00', '2020-01-01 01:00:00', '2', '2'], ['2', '2020-01-01 00:00:00', '2020-01-01 01:00:00', '2', '1'] ) columns = ["ID", "TimeFrom", "TimeTo", "Serial", "Code"] df = spark.createDataFrame(data, columns) expected_data = [ ('1', '2020-01-01 00:00:00', '2020-01-01 01:00:00', '2', '1'), ('1', '2020-01-01 00:00:00', '2020-01-01 01:00:00', '2', '2') ] self.assertEqual(drop_duplicates(df), expected_data)res = unittest.main(argv=[''], verbosity=2, exit=False)(The Assert might fail but I'll know when I move past this error) but for now I just get the following error:
File "/tmp/ipykernel_15937/2907449366.py", line 2, in drop_duplicates df = df.dropDuplicates(df) ^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/spark/python/pyspark/sql/dataframe.py", line 4207, in dropDuplicates raise PySparkTypeError(pyspark.errors.exceptions.base.PySparkTypeError: [NOT_LIST_OR_TUPLE] Argument `subset` should be a list or tuple, got DataFrame.Is there something I am missing? I'm reading the docs on this method but can't seem to figure out.