Here is my Python script for processing about 1M files, mainly to check whether these files exist:
import os


def check_data_valid(
    data_root="/data_alpha/data/video/internvid/",
    save_root="/data_alpha/data/video/internvid/all_file_list/proc",
    download_list="/data_alpha/data/video/internvid/all_file_list/all_files.txt",
    checkpoint_every=200000,
):
    """Classify every file named in *download_list* by what exists on disk.

    Each listed name is an ``.mp4`` path relative to *data_root*. A file is:
      - "valid"   if the converted ``.avi`` counterpart exists,
      - "error"   if only the original ``.mp4`` exists,
      - "missing" if neither exists.

    Progress checkpoints and final totals are written under *save_root* as
    newline-joined text files. Returns ``(valid_data, err_data, missing_data)``.

    NOTE(review): on ~1M entries this performs up to two ``os.path.exists``
    stat() calls per file. If *data_root* lives on a network filesystem
    (NFS/CIFS), a slow or hung server leaves the process blocked in the
    kernel inside stat() — which shows up in ``ps`` as status D
    (uninterruptible sleep). Presumably that is what is being observed;
    verify the mount type, and consider batching via ``os.scandir`` of each
    directory instead of per-file stat calls.
    """
    valid_data = []
    err_data = []
    missing_data = []

    with open(download_list, "r") as listing:
        downloads = [line.strip() for line in listing]

    def _dump(filename, items):
        # Write one newline-joined list file; skip empty lists (matches
        # the original "only write if non-empty" behavior).
        if items:
            with open(os.path.join(save_root, filename), "w") as out:
                out.write("\n".join(items))

    for i, item in enumerate(downloads):
        mp4_path = os.path.join(data_root, item)
        avi_path = mp4_path.replace(".mp4", ".avi")
        if os.path.exists(avi_path):
            valid_data.append(item)
        elif os.path.exists(mp4_path):
            err_data.append(item)
        else:
            missing_data.append(item)

        # Periodic checkpoint so partial results survive a crash/hang.
        # (Original condition preserved; note it also fires once at i == 1.)
        if i % checkpoint_every == 1:
            _dump(f"valid_{len(valid_data)}_{i}.txt", valid_data)
            # BUGFIX: original used the undefined name `start` here
            # (f"error_..._{i+start}.txt"), raising NameError on the first
            # checkpoint with a non-empty err_data.
            _dump(f"error_{len(err_data)}_{i}.txt", err_data)
            _dump(f"missing_{len(missing_data)}_{i}.txt", missing_data)

    _dump("valid_total.txt", valid_data)
    _dump("error_total.txt", err_data)
    _dump("missing_total.txt", missing_data)
    return valid_data, err_data, missing_data


if __name__ == '__main__':
    check_data_valid()
But after running for several hours and processing about 100k files (judging by the files appearing in save_root), the process turns into status D (as shown by the ps
command), which means Uninterruptible Sleep. I wonder why this happens and how to avoid it.