/**
 * Maps a raw license-use value to the coarser category stored in
 * `licenseUseCategory`. "non-commercial", "academic-only" and "unclear" all
 * collapse into the single "Non-Commercial/Academic" category.
 *
 * The table is only ever read, so it is declared `const` and frozen to guard
 * against accidental reassignment or mutation.
 */
const licenseClassRemapper = Object.freeze({
  commercial: "Commercial",
  unspecified: "Unspecified",
  "non-commercial": "Non-Commercial/Academic",
  "academic-only": "Non-Commercial/Academic",
  unclear: "Non-Commercial/Academic",
});

/**
 * Rounds a value to one digit after the decimal point.
 *
 * The value is scaled by ten, rounded with `Math.round`, and scaled back.
 * Anything that does not coerce to a number (e.g. `undefined`) yields `NaN`.
 *
 * @param {number} number - Value to round.
 * @returns {number} The value rounded to one decimal place.
 */
function roundToOneDecimalPlace(number) {
  const tenths = Math.round(number * 10);
  return tenths / 10;
}

/**
 * Flattens raw dataset records into one summary row per dataset name.
 *
 * Records that share a "Dataset Name" are treated as variants of the same
 * dataset: their `Languages` are concatenated (in input order, duplicates
 * kept), while every other field is taken from the first record seen with
 * that name.
 *
 * @param {Object} data - Object whose values are the raw dataset records.
 *   Each record must provide iterable "Languages", "Task Categories",
 *   "Text Sources", "Creators" and "Model Generated" fields; the
 *   "Inferred Metadata" and "Text Metrics" sub-objects are optional.
 * @returns {Object[]} One summary object per distinct dataset name, in
 *   first-seen order.
 * @throws {TypeError} If `data` is null/undefined or a record lacks one of
 *   the required iterable fields.
 */
export default function prepareDataSummary(data) {
  const records = Object.values(data);

  // Models that get their own "Synthetic (<model>)" class; any other
  // generator is reported as "Synthetic (Other)".
  const namedModels = [
    "OpenAI GPT-3",
    "OpenAI ChatGPT",
    "OpenAI GPT-4",
    "OpenAI Codex",
  ];

  // First pass: gather the languages of every variant under its dataset name.
  // A Map is used instead of a plain object so that names such as
  // "constructor" or "__proto__" cannot collide with Object.prototype members.
  const languagesByName = new Map();
  for (const dataset of records) {
    const datasetName = dataset["Dataset Name"];
    const collected = languagesByName.get(datasetName);
    if (collected) {
      collected.push(...dataset.Languages);
    } else {
      languagesByName.set(datasetName, [...dataset.Languages]);
    }
  }

  // Second pass: build one row per name from the first variant encountered.
  const clean = [];
  const seenNames = new Set();
  for (const dataset of records) {
    const datasetName = dataset["Dataset Name"];

    // Later variants of an already-processed dataset only contribute their
    // languages (handled above); their other metadata is ignored.
    if (seenNames.has(datasetName)) continue;
    seenNames.add(datasetName);

    const inferred = dataset["Inferred Metadata"];
    const textMetrics = dataset["Text Metrics"];

    const licenseUseClass = dataset["License Use (DataProvenance)"];
    // Own-property check so values like "toString" are passed through as-is
    // rather than resolving to an inherited Object.prototype member.
    const licenseUseCategory = Object.prototype.hasOwnProperty.call(
      licenseClassRemapper,
      licenseUseClass,
    )
      ? licenseClassRemapper[licenseUseClass]
      : licenseUseClass;

    const modelGenerated = Array.from(dataset["Model Generated"]);
    const isSynthetic = modelGenerated.length > 0;
    let syntheticClass = "Regular";
    if (isSynthetic) {
      // Only the first listed generator decides the class.
      syntheticClass = namedModels.includes(modelGenerated[0])
        ? `Synthetic (${modelGenerated[0]})`
        : "Synthetic (Other)";
    }

    // "1900-1-1" is the sentinel for a missing date. The two dates are
    // compared as strings and the smaller one wins, so a missing date on
    // either side makes the sentinel the resulting `date`.
    // NOTE(review): string comparison is only chronological if upstream dates
    // are zero-padded ISO strings — confirm.
    const pwcDate = inferred?.["PwC Date"] ?? "1900-1-1";
    const ssDate = inferred?.["S2 Date"] ?? "1900-1-1";

    // NOTE: "Text Domains" is currently not exported (its mapping was
    // commented out in an earlier revision).
    clean.push({
      datasetName,
      collection: dataset.Collection,
      languages: languagesByName.get(datasetName),
      tasks: Array.from(dataset["Task Categories"]),
      textSources: Array.from(dataset["Text Sources"]),
      creators: Array.from(dataset["Creators"]),
      licenseUseClass,
      licenseUseCategory,
      synthetic: isSynthetic ? "Synthetic" : "Regular",
      modelGenerated,
      syntheticClass,
      textTopics: inferred?.["Text Topics"] ?? [],
      cd_frequency: 0, // frequency of citation count, download count pair
      citationCount: inferred?.["S2 Citation Count (June 2023)"] ?? 0,
      downloadCount: inferred?.["HF Downloads (June 2023)"] ?? 0,
      // Default BEFORE rounding: rounding `undefined` gives NaN, which `??`
      // would not replace.
      inputTextLen: roundToOneDecimalPlace(
        textMetrics?.["Mean Inputs Length"] ?? 0,
      ),
      targetTextLen: roundToOneDecimalPlace(
        textMetrics?.["Mean Targets Length"] ?? 0,
      ),
      pwcDate,
      ssDate,
      date: new Date(pwcDate < ssDate ? pwcDate : ssDate),
      hfLink: dataset["Hugging Face URL"],
    });
  }
  return clean;
}
