# GEO-Bench-2-Leaderboard: utils/input_validation.py
import json
import os
import shutil
import uuid

import pandas as pd

from utils.constants import (NEW_SUBMISSION_FOLDER, CSV_FILE, JSON_FILE, DIMENSIONS,
                             NEW_SUBMISSION_COLUMN_INFO, NEW_SUBMISSION_COLUMN_NAMES,
                             JSON_FORMAT, MODEL_INFO_FILE, RESULTS_DIR, REQUIRED_SEEDS)
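
# For reference, the constants imported above are assumed to have roughly the
# following shapes (hypothetical values, inferred only from how they are used
# below; the real definitions live in utils/constants.py):
#
#   NEW_SUBMISSION_COLUMN_INFO = {"string_cols": ["backbone", "dataset"],
#                                 "integer_cols": ["Seed"],
#                                 "float_cols": ["test metric"]}
#   DIMENSIONS = {"classification": ["dataset_a", "dataset_b"]}
#   JSON_FORMAT = {"Submission name": "", "New model info": []}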
def check_correct_file_type(folder_contents) -> bool:
    """
    Checks that the submission folder has exactly 2 items: a csv file and a json file.
    """
    contains_correct_files = (len(folder_contents) == 2) and (CSV_FILE in folder_contents) and (JSON_FILE in folder_contents)
    if not contains_correct_files:
        # the original message lacked the f prefix, so the placeholders printed literally
        print(f"\nInput Validation Error: Please check that {NEW_SUBMISSION_FOLDER} contains the files: "
              f"{CSV_FILE} and {JSON_FILE}")
        return False
    return True
def check_csv_columns_datatypes() -> tuple[bool, bool]:
    """
    Checks that the csv file has all required columns and that those columns
    have the correct data types.
    """
    # check for correct columns
    csv_data = pd.read_csv(f"{NEW_SUBMISSION_FOLDER}/{CSV_FILE}")
    submitted_csv_column_names = set(csv_data.columns)
    expected_column_names = set(NEW_SUBMISSION_COLUMN_NAMES)
    for item in expected_column_names:
        if item not in submitted_csv_column_names:
            print(f"The following column is missing: {item}")
    correct_columns = expected_column_names.issubset(submitted_csv_column_names)
    if not correct_columns:
        print(f"\nInput Validation Error: Please ensure that the csv file contains the following columns: {NEW_SUBMISSION_COLUMN_NAMES}")

    # check for correct dtypes
    correct_dtypes = []
    for col in NEW_SUBMISSION_COLUMN_INFO["string_cols"]:
        if col in csv_data.columns:
            print(f"{col} is string/object: {pd.api.types.is_object_dtype(csv_data[col])}")
            correct_dtypes.append(pd.api.types.is_object_dtype(csv_data[col]))
    # integer and float columns are both validated with the same numeric check
    for col in NEW_SUBMISSION_COLUMN_INFO["integer_cols"] + NEW_SUBMISSION_COLUMN_INFO["float_cols"]:
        if col in csv_data.columns:
            print(f"{col} is numeric: {pd.api.types.is_numeric_dtype(csv_data[col])}")
            correct_dtypes.append(pd.api.types.is_numeric_dtype(csv_data[col]))
    correct_dtypes = all(correct_dtypes)
    if not correct_dtypes:
        print(f"\nInput Validation Error: Please ensure that the csv columns have the correct datatypes: \n"
              f"string/object columns: {NEW_SUBMISSION_COLUMN_INFO['string_cols']} \n"
              f"numeric columns: {NEW_SUBMISSION_COLUMN_INFO['integer_cols'] + NEW_SUBMISSION_COLUMN_INFO['float_cols']}")
    return correct_columns, correct_dtypes
def check_correct_entries_per_dataset(required_seeds: int = REQUIRED_SEEDS) -> tuple[bool, bool]:
    """
    Checks for the correct number of runs per backbone/dataset combination
    and for the required number of unique seeds.
    """
    csv_data = pd.read_csv(f"{NEW_SUBMISSION_FOLDER}/{CSV_FILE}")
    count_values = csv_data.groupby(["backbone", "dataset"]).count()
    count_values = list(set(count_values["test metric"].tolist()))
    correct_num_values = (len(count_values) == 1) and (count_values[0] == required_seeds)
    if not correct_num_values:
        print(f"\nInput Validation Error: Please ensure that each backbone/dataset combination has {required_seeds} entries")
    count_seeds = csv_data.groupby(["backbone", "dataset"]).nunique()
    count_seeds = list(set(count_seeds["Seed"].tolist()))
    correct_num_seeds = (len(count_seeds) == 1) and (count_seeds[0] == required_seeds)
    if not correct_num_seeds:
        print(f"\nInput Validation Warning: Please ensure that each backbone/dataset combination has {required_seeds} unique seeds")
    return correct_num_values, correct_num_seeds
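
# For illustration, the check above relies on the pandas groupby pattern below
# (hypothetical toy data; column names match the csv schema used in this module):
#
#   df = pd.DataFrame({"backbone": ["b1"] * 2, "dataset": ["d1"] * 2,
#                      "Seed": [0, 1], "test metric": [0.8, 0.9]})
#   df.groupby(["backbone", "dataset"]).count()["test metric"]   # -> 2 runs per combination
#   df.groupby(["backbone", "dataset"]).nunique()["Seed"]        # -> 2 unique seeds per combination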
def check_json_keys() -> bool:
    """
    Checks that the json file has the required keys and that the top-level
    values have the correct data types.
    """
    with open(f"{NEW_SUBMISSION_FOLDER}/{JSON_FILE}") as f:
        json_submission_data = json.load(f)
    # TBD: check that nested json values have the correct data types
    all_required_keys = []
    for key, value in JSON_FORMAT.items():
        has_key_and_type = (key in json_submission_data) and (type(json_submission_data[key]) is type(value))
        all_required_keys.append(has_key_and_type)
    all_required_keys = all(all_required_keys)
    if not all_required_keys:
        print("\nInput Validation Error: Please ensure that the json file has the correct keys and datatypes")
    return all_required_keys
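
# A possible sketch for the TBD nested check above. This helper is hypothetical
# and not wired into validate_new_submission(); it assumes JSON_FORMAT nests
# plain dicts/lists that mirror the expected submission structure.
def _check_nested_types(expected, actual) -> bool:
    """Recursively verify that `actual` mirrors the structure and value types of `expected`."""
    if isinstance(expected, dict):
        # every expected key must be present, with a matching nested structure
        return isinstance(actual, dict) and all(
            key in actual and _check_nested_types(value, actual[key])
            for key, value in expected.items()
        )
    if isinstance(expected, list):
        # compare each submitted element against the first template element, if any
        if not isinstance(actual, list):
            return False
        return all(_check_nested_types(expected[0], item) for item in actual) if expected else True
    return type(actual) is type(expected)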
def check_has_atleast_one_dimension() -> bool:
    """
    Checks that the submission contains all datasets required for at least one dimension.
    """
    csv_data = pd.read_csv(f"{NEW_SUBMISSION_FOLDER}/{CSV_FILE}")
    submitted_csv_datasets = set(csv_data["dataset"].tolist())
    contains_atleast_one_dimension = any(
        set(datasets).issubset(submitted_csv_datasets) for datasets in DIMENSIONS.values()
    )
    if not contains_atleast_one_dimension:
        print("\nInput Validation Error: Please check that the submission contains all datasets for one or more dimensions")
        print(f"currently submitted datasets are: {submitted_csv_datasets}")
    return contains_atleast_one_dimension
def check_has_frozen_or_full_ft() -> bool:
    """
    Checks that the frozen_or_full_ft column contains only the allowed values.
    """
    csv_data = pd.read_csv(f"{NEW_SUBMISSION_FOLDER}/{CSV_FILE}")
    frozen_or_full_ft = set(csv_data["frozen_or_full_ft"].tolist())
    correct_values = frozen_or_full_ft.issubset({"frozen", "full_ft"})
    if not correct_values:
        print("\nInput Validation Error: Please check that the frozen_or_full_ft column contains only 'frozen' or 'full_ft'")
        print(f"currently submitted values are: {frozen_or_full_ft}")
    return correct_values
def update_new_backbones_and_models():
    """
    Checks whether each submitted backbone exists in model_info.json (used to
    display results); if not, the new model's information is added to the file.
    """
    with open(f"{NEW_SUBMISSION_FOLDER}/{JSON_FILE}") as f:
        json_submission_data = json.load(f)
    # read the existing model info
    with open(MODEL_INFO_FILE) as f:
        existing_model_info = json.load(f)
    for item in json_submission_data["New model info"]:
        submitted_backbone = item["unique_backbone_key"]
        if submitted_backbone not in existing_model_info["BACKBONE_NAMES"]:
            existing_model_info["BACKBONE_NAMES"][submitted_backbone] = item["model_display_name"]
            existing_model_info["MODEL_SIZE"][submitted_backbone] = item["model_size"]
    # save the updated information
    with open(MODEL_INFO_FILE, "w") as fp:
        json.dump(existing_model_info, fp)
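
# For reference, model_info.json is assumed to look roughly like the following
# (hypothetical values; the real entries come from each submission's
# "New model info" records):
#
#   {
#       "BACKBONE_NAMES": {"resnet50_moco": "ResNet-50 (MoCo)"},
#       "MODEL_SIZE": {"resnet50_moco": 25.6}
#   }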
def validate_new_submission() -> bool:
    """
    Runs all validation checks on the contents of NEW_SUBMISSION_FOLDER; if every
    check passes, copies the required data into RESULTS_DIR under a fresh
    submission id and resets the submission folder.
    """
    # get folder contents
    if not os.path.exists(NEW_SUBMISSION_FOLDER):
        return False
    folder_contents = os.listdir(NEW_SUBMISSION_FOLDER)
    items_to_ignore = [".DS_Store"]
    for item in items_to_ignore:
        if item in folder_contents:
            folder_contents.remove(item)
    if len(folder_contents) == 0:
        print("no new submissions")
        return False

    # check all conditions
    correct_file_type = check_correct_file_type(folder_contents)
    correct_columns, correct_dtypes = check_csv_columns_datatypes()
    correct_num_values, correct_num_seeds = check_correct_entries_per_dataset()
    correct_json_keys = check_json_keys()
    contains_atleast_one_dimension = check_has_atleast_one_dimension()
    correct_frozen_or_full_ft = check_has_frozen_or_full_ft()
    all_checks_passed = all([correct_file_type, correct_columns, correct_dtypes,
                             correct_json_keys, correct_num_values,  # correct_num_seeds,
                             contains_atleast_one_dimension, correct_frozen_or_full_ft])
    if not all_checks_passed:
        print("\nThe new submission has not been formatted correctly. Please fix the errors above")
        raise ValueError("invalid submission")

    submission_id = uuid.uuid4()
    os.makedirs(f"{RESULTS_DIR}/{submission_id}")
    # copy only the required keys from the json file to the results folder
    with open(f"{NEW_SUBMISSION_FOLDER}/{JSON_FILE}") as f:
        json_submission_data = json.load(f)
    new_dict = {}
    for key, value in JSON_FORMAT.items():
        if value == "TBD":
            continue
        new_dict[key] = json_submission_data[key]
    with open(f"{RESULTS_DIR}/{submission_id}/{JSON_FILE}", "w") as fp:
        json.dump(new_dict, fp)
    # copy only the required columns from the csv file to the results folder
    csv_data = pd.read_csv(f"{NEW_SUBMISSION_FOLDER}/{CSV_FILE}")
    csv_data = csv_data[NEW_SUBMISSION_COLUMN_NAMES]
    csv_data.to_csv(f"{RESULTS_DIR}/{submission_id}/{CSV_FILE}", index=False)
    # add any new model info to model_info.json
    update_new_backbones_and_models()
    # reset NEW_SUBMISSION_FOLDER (shutil.rmtree is portable, unlike os.system("rm -r ..."))
    shutil.rmtree(NEW_SUBMISSION_FOLDER)
    os.makedirs(NEW_SUBMISSION_FOLDER)
    return True
if __name__ == "__main__":
    validate_new_submission()