| | |
| | import json |
| | import requests |
| | import matplotlib.pyplot as plt |
| | from datasets import load_dataset |
| |
|
| | |
def cargar_modalidades_tareas():
    """Load the modality/task catalogue from ``modalidades_tareas.json``.

    Returns:
        dict: Parsed JSON contents, or an empty dict when the file is
        missing or holds invalid JSON (best-effort loader: callers always
        get a usable mapping).
    """
    try:
        with open("modalidades_tareas.json", "r", encoding="utf-8") as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        # A corrupt cache is treated the same as an absent one: start fresh
        # instead of crashing every caller at import/startup time.
        return {}
| |
|
def actualizar_modalidades_tareas_desde_huggingface():
    """Refresh the local modality/task catalogue from the Hugging Face Hub.

    For each modality in a fixed list, fetch up to five datasets from the Hub
    API, record each dataset's feature columns and licence, merge the result
    into the existing catalogue, and persist it to ``modalidades_tareas.json``.

    Returns:
        list[str]: The task identifiers present in the catalogue after the
        refresh (including pre-existing ones).
    """
    MODALIDADES = ["text-classification", "image-classification", "speech-recognition"]
    MODALIDAD_TAREAS = cargar_modalidades_tareas()

    for task in MODALIDADES:
        try:
            # Timeout so a slow/unreachable Hub cannot hang the caller;
            # raise_for_status turns HTTP errors into exceptions we can skip.
            listado_resp = requests.get(
                f"https://huggingface.co/api/datasets?task={task}&full=true&limit=5",
                timeout=30,
            )
            listado_resp.raise_for_status()
            listado = listado_resp.json()
        except (requests.exceptions.RequestException, ValueError):
            # Best-effort refresh: skip modalities the API cannot serve now
            # instead of aborting the whole update.
            continue

        for dataset in listado:
            dataset_id = dataset["id"]
            try:
                info_resp = requests.get(
                    f"https://huggingface.co/api/datasets/{dataset_id}",
                    timeout=30,
                )
                info_resp.raise_for_status()
                dataset_info = info_resp.json()
            except (requests.exceptions.RequestException, ValueError):
                continue

            columnas = list(dataset_info.get("features", {}).keys())

            if task not in MODALIDAD_TAREAS:
                MODALIDAD_TAREAS[task] = {
                    "nombre": task.replace("-", " ").capitalize(),
                    "columnas": columnas,
                    "datasets": {},
                }

            MODALIDAD_TAREAS[task]["datasets"][dataset_id] = {
                "columnas": columnas,
                "licencia": dataset.get("license", "unknown"),
            }

    # NOTE(review): readers elsewhere in this file expect per-modality keys
    # "tareas"/"columnas_generales", while this writer emits
    # "columnas"/"datasets" — confirm which schema is canonical.
    with open("modalidades_tareas.json", "w", encoding="utf-8") as file:
        json.dump(MODALIDAD_TAREAS, file, indent=4, ensure_ascii=False)

    return list(MODALIDAD_TAREAS.keys())
| |
|
| | |
def generar_grafica_barras(tareas_seleccionadas, MODALIDAD_TAREAS):
    """Build a horizontal bar chart counting selected tasks per modality.

    Args:
        tareas_seleccionadas: Iterable of task identifiers chosen by the user.
        MODALIDAD_TAREAS: Catalogue mapping each modality to its data; the
            per-modality task mapping may live under "tareas" (legacy schema)
            or "datasets" (schema written by the Hugging Face refresh).

    Returns:
        matplotlib.figure.Figure: The rendered bar chart.

    Raises:
        ValueError: Wrapping any error raised while building the figure.
    """
    try:
        conteo = {}
        for modalidad, datos in MODALIDAD_TAREAS.items():
            # Accept both catalogue schemas; previously only "tareas" was
            # read, which raised KeyError on refreshed catalogues.
            tareas_modalidad = (datos.get("tareas") or datos.get("datasets") or {}).keys()
            conteo[modalidad] = sum(1 for t in tareas_seleccionadas if t in tareas_modalidad)

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(list(conteo.keys()), list(conteo.values()), color='skyblue')
        ax.set_xlabel('Cantidad de Tareas Seleccionadas')
        ax.set_ylabel('Modalidades')
        ax.set_title('Distribución de Tareas por Modalidad')
        ax.invert_yaxis()  # first modality in the dict appears at the top
        return fig

    except Exception as e:
        raise ValueError(f"Error al generar gráfica: {str(e)}")
| |
|
| | |
def generar_encabezado(tareas_seleccionadas):
    """Compose the CSV header row for the selected tasks.

    Collects, without duplicates, the general columns of every modality that
    contains a selected task plus the task-specific columns, keeping "id"
    first and pushing input/output/label columns toward the end.

    Args:
        tareas_seleccionadas: List of task identifiers chosen by the user.

    Returns:
        str: Comma-separated header, always starting with "id".

    Raises:
        ValueError: If no task was selected.
    """
    MODALIDAD_TAREAS = cargar_modalidades_tareas()

    if not tareas_seleccionadas:
        raise ValueError("Selecciona al menos una tarea.")

    columnas = ["id"]
    vistas = set()

    def _agregar(cols):
        # Preserve first-seen order while de-duplicating.
        for col in cols:
            if col not in vistas:
                columnas.append(col)
                vistas.add(col)

    for tarea in tareas_seleccionadas:
        for datos in MODALIDAD_TAREAS.values():
            # Tolerate both catalogue schemas: legacy "tareas"/
            # "columnas_generales" and the HF-refresh "datasets"/"columnas".
            tareas = datos.get("tareas") or datos.get("datasets") or {}
            if tarea in tareas:
                _agregar(datos.get("columnas_generales") or datos.get("columnas") or [])
                entrada = tareas[tarea]
                # Legacy schema stores a list of columns; the refresh schema
                # stores a dict with a "columnas" list inside.
                _agregar(entrada.get("columnas", []) if isinstance(entrada, dict) else entrada)

    # "id" stays first; bool sort keys put columns WITHOUT the substring
    # first (False < True), so input/output/label columns sink to the end.
    columnas_ordenadas = ["id"] + sorted(
        columnas[1:],
        key=lambda x: ("input" in x, "output" in x, "label" in x),
    )

    return ",".join(columnas_ordenadas)
| |
|
| | |
def buscar_datasets(tareas_seleccionadas, filtro_tamaño, filtro_licencia):
    """Search the Hugging Face Hub for datasets matching the selected tasks.

    Args:
        tareas_seleccionadas: Task identifiers to search for.
        filtro_tamaño: Lower-cased size-category filter, or falsy to disable.
        filtro_licencia: Lower-cased licence filter, or falsy to disable.

    Returns:
        list[tuple[str, str]]: (dataset_id, display label) pairs for datasets
        that pass the filters and expose a "features" section.

    Raises:
        ValueError: Wrapping any error from the initial search request.
    """
    try:
        query = "+".join(f"task:{tarea}" for tarea in tareas_seleccionadas)
        url = f"https://huggingface.co/api/datasets?search={query}&sort=downloads"
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        datasets = response.json()

        datasets_utiles = []
        for dataset in datasets:
            try:
                # license may be absent or None on some records.
                licencia = (dataset.get("license") or "").lower()
                if filtro_licencia and licencia != filtro_licencia:
                    continue

                tamaño = dataset.get("size_categories") or ""
                # NOTE(review): the Hub usually returns size_categories as a
                # list of buckets; join it so .lower() cannot blow up.
                if isinstance(tamaño, list):
                    tamaño = ",".join(tamaño)
                if filtro_tamaño and tamaño.lower() != filtro_tamaño:
                    continue

                dataset_info = requests.get(
                    f"https://huggingface.co/api/datasets/{dataset['id']}",
                    timeout=30,
                ).json()
                if "features" in dataset_info:
                    # .get() so a record missing tags/description skips
                    # gracefully instead of aborting the whole search.
                    etiqueta = (
                        f"{dataset['id']} ({dataset.get('tags', [])})"
                        f" - {dataset.get('description', '')}"
                    )
                    datasets_utiles.append((dataset['id'], etiqueta))

            except (requests.exceptions.RequestException, ValueError, KeyError):
                # Per-dataset failures are tolerated; keep scanning.
                continue

        return datasets_utiles

    except Exception as e:
        raise ValueError(f"Error al buscar datasets: {str(e)}")
| |
|
| |
|
| | |
def generar_dataset(encabezado, datasets_seleccionados, pagina_actual=1, filas_por_pagina=5):
    """Materialize a paginated CSV preview from the selected HF datasets.

    Args:
        encabezado: Comma-separated header (as produced by generar_encabezado).
        datasets_seleccionados: Dataset identifiers to load (train split).
        pagina_actual: 1-based page number.
        filas_por_pagina: Rows taken per dataset for the current page.

    Returns:
        str: The header followed by one CSV line per extracted row; a failing
        dataset contributes a single "Error en <id>: ..." line instead.

    Raises:
        ValueError: Wrapping any error outside the per-dataset loop.
    """
    try:
        columnas = encabezado.split(",")
        filas = []

        for dataset_id in datasets_seleccionados:
            try:
                dataset = load_dataset(dataset_id, split="train")
                features = dataset.features

                # Map each requested column to an extractor. Default-argument
                # binding (c=col) avoids the classic late-binding closure bug.
                mapeo = {}
                for col in columnas:
                    if col == "id":
                        mapeo[col] = lambda idx: f"id_{idx}"
                    elif col in features:
                        mapeo[col] = lambda fila, c=col: str(fila[c])
                    else:
                        # Fallback: first feature sharing the column's prefix.
                        columna_alternativa = next(
                            (k for k in features if col.split("_")[0] in k),
                            "valor_default",
                        )
                        mapeo[col] = lambda fila, c=columna_alternativa: str(fila.get(c, "N/A"))

                inicio = (pagina_actual - 1) * filas_por_pagina
                fin = min(pagina_actual * filas_por_pagina, len(dataset))

                # BUG FIX: dataset[inicio:fin] returns a column-oriented dict,
                # so iterating it yielded column NAMES, making every row fail.
                # Dataset.select() yields actual row dicts.
                for i, fila in enumerate(dataset.select(range(inicio, max(inicio, fin)))):
                    valores = [
                        mapeo[col](i) if col == "id" else mapeo[col](fila)
                        for col in columnas
                    ]
                    filas.append(",".join(valores))

            except Exception as e:
                # One failing dataset must not abort the whole export.
                filas.append(f"Error en {dataset_id}: {str(e)}")

        return "\n".join([encabezado] + filas)

    except Exception as e:
        raise ValueError(f"Error al generar el dataset: {str(e)}")