# Colab-only setup: mount the user's Google Drive at /content/drive so the
# data files under that path can be opened like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


import os, json, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Directory on the mounted Drive that holds the input files for this notebook.
BASE = "/content/drive/MyDrive/MCDI/5A_AdA"

def cargar_json(nombre):
    """Parse and return the JSON document stored in file ``nombre`` under ``BASE``.

    The file is opened as UTF-8 text and closed before returning.
    """
    ruta = os.path.join(BASE, nombre)
    with open(ruta, encoding="utf-8") as archivo:
        contenido = json.load(archivo)
    return contenido


# Load the three raw families of posting lists, one JSON file per family.
A_raw, B_raw, C_raw = map(
    cargar_json,
    (
        "postinglists-for-intersection-A-k=2.json",
        "postinglists-for-intersection-B-k=3.json",
        "postinglists-for-intersection-C-k=4.json",
    ),
)
# Sanity check: number of groups per family, then lists per group (first group).
print(*map(len, (A_raw, B_raw, C_raw)))
print(*(len(familia[0]) for familia in (A_raw, B_raw, C_raw)))

200 200 200
2 3 4


def normalizar_familia(F):
    """Return a copy of family ``F`` with every list deduplicated and sorted.

    ``F`` is an iterable of groups, each group an iterable of lists. The
    nesting is preserved; the inputs are not modified.
    """
    return [[sorted(set(lista)) for lista in grupo] for grupo in F]
# Cleaned families: every posting list sorted ascending and free of duplicates.
A, B, C = map(normalizar_familia, (A_raw, B_raw, C_raw))


class CmpCounter:
    """Comparison helper that tallies how many comparisons were requested.

    ``count`` starts at 0 and grows by one on every call to ``lt``, ``le``
    or ``eq``; each of those returns the result of the matching operator.
    """

    def __init__(self):
        self.count = 0

    def _tick(self):
        # The tally is bumped before the operands are actually compared.
        self.count = self.count + 1

    def lt(self, a, b):
        """Count one comparison and return ``a < b``."""
        self._tick()
        return a < b

    def le(self, a, b):
        """Count one comparison and return ``a <= b``."""
        self._tick()
        return a <= b

    def eq(self, a, b):
        """Count one comparison and return ``a == b``."""
        self._tick()
        return a == b
def medir_tiempo(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` and return ``(result, elapsed_seconds)``.

    Elapsed time is the difference of two ``time.perf_counter()`` readings
    taken immediately before and after the call.
    """
    inicio = time.perf_counter()
    resultado = fn(*args, **kwargs)
    transcurrido = time.perf_counter() - inicio
    return resultado, transcurrido


def findpos_binaria(A, x, lo, cmp: CmpCounter):
    """Binary-search ``A[lo:]`` for the first index whose element is not ``< x``.

    Assumes ``A`` is sorted ascending from ``lo`` onwards. Every element
    comparison goes through ``cmp.lt`` so that it is counted. Returns
    ``len(A)`` when every element from ``lo`` on is smaller than ``x``; when
    ``lo >= len(A)`` the loop never runs and ``lo`` is returned as given.
    """
    left, right = lo, len(A)
    while left < right:
        middle = (left + right) // 2
        if not cmp.lt(A[middle], x):
            right = middle
        else:
            left = middle + 1
    return left

def findpos_B1(A, x, lo, cmp: "CmpCounter"):
    """Find the first index ``i >= lo`` with ``A[i] >= x`` by doubling search.

    Assumes ``A`` is sorted ascending from ``lo`` onwards. Offsets
    1, 2, 4, 8, ... from ``lo`` are probed until a probe is not ``< x`` or
    runs past the end; the bracketed gap is then binary-searched. All
    element comparisons go through ``cmp.lt`` so they are counted.

    Returns ``len(A)`` when ``lo >= len(A)`` or when no element from ``lo``
    on is ``>= x``.
    """
    n = len(A)
    if lo >= n:
        return n
    if not cmp.lt(A[lo], x):
        return lo

    # Gallop: keep doubling the offset while the probed element is still < x.
    bound = 1
    while lo + bound < n and cmp.lt(A[lo + bound], x):
        bound *= 2

    # A[lo + bound // 2] is the last element already verified to be < x
    # (that is A[lo] itself when bound == 1), so the search starts one past
    # it instead of paying a second comparison for a known outcome.
    left = lo + bound // 2 + 1
    right = min(lo + bound, n)

    while left < right:
        mid = (left + right) // 2
        if cmp.lt(A[mid], x):
            left = mid + 1
        else:
            right = mid
    return left

def findpos_B2(A, x, lo, cmp: "CmpCounter"):
    """Find the first index ``i >= lo`` with ``A[i] >= x`` using squared steps.

    Assumes ``A`` is sorted ascending from ``lo`` onwards. Offsets
    2, 4, 16, 256, ... (each the square of the previous one) from ``lo`` are
    probed until a probe is not ``< x`` or runs past the end; the bracketed
    gap is then binary-searched. All element comparisons go through
    ``cmp.lt`` so they are counted.

    Returns ``len(A)`` when ``lo >= len(A)`` or when no element from ``lo``
    on is ``>= x``.
    """
    n = len(A)
    if lo >= n:
        return n
    if not cmp.lt(A[lo], x):
        return lo

    step = 2
    prev = 0  # offset of the last probe verified to be < x (0 means A[lo])
    while lo + step < n and cmp.lt(A[lo + step], x):
        prev = step
        step *= step

    # A[lo + prev] is already known to be < x, so start one past it rather
    # than spending another counted comparison on it.
    left = lo + prev + 1
    right = min(lo + step, n)

    while left < right:
        mid = (left + right) // 2
        if cmp.lt(A[mid], x):
            left = mid + 1
        else:
            right = mid
    return left


def by_intersection_2(A, B, findpos, cmp: CmpCounter,
                      a_lo=0, a_hi=None, b_lo=0, b_hi=None, out=None):
    """Recursively intersect ``A[a_lo:a_hi]`` with ``B[b_lo:b_hi]``.

    The middle element of the ``A`` range is located in ``B`` with
    ``findpos(B, value, b_lo, cmp)``; the ranges to its left and to its right
    are then handled by recursive calls. Matches are appended to ``out`` in
    left-to-right order and ``out`` is returned. Both inputs are expected to
    be sorted ascending.

    ``a_hi`` / ``b_hi`` default to the full list length and ``out`` to a new
    list. Equality tests go through ``cmp.eq`` so they are counted.
    """
    a_hi = len(A) if a_hi is None else a_hi
    b_hi = len(B) if b_hi is None else b_hi
    out = [] if out is None else out

    # Nothing can match once either range is empty.
    if not (a_lo < a_hi and b_lo < b_hi):
        return out

    pivot_idx = (a_lo + a_hi) // 2
    pivot = A[pivot_idx]
    cut = findpos(B, pivot, b_lo, cmp)

    if cut >= b_hi:
        # Landing at or beyond the end of the B window: only the left half
        # of the A range is explored further.
        by_intersection_2(A, B, findpos, cmp,
                          a_lo, pivot_idx, b_lo, b_hi, out)
        return out

    # The equality test is performed before descending, as in the original
    # order of counted operations.
    found = cmp.eq(B[cut], pivot)

    by_intersection_2(A, B, findpos, cmp,
                      a_lo, pivot_idx, b_lo, cut, out)
    if found:
        out.append(pivot)
    # On a match B[cut] is consumed, so the right-hand window starts after it.
    by_intersection_2(A, B, findpos, cmp,
                      pivot_idx + 1, a_hi, cut + 1 if found else cut, b_hi, out)
    return out


def BY_binaria(A, B, cmp):
    """Intersect ``A`` and ``B`` with ``by_intersection_2`` driven by ``findpos_binaria``."""
    return by_intersection_2(A, B, findpos=findpos_binaria, cmp=cmp)

def BY_B1(A, B, cmp):
    """Intersect ``A`` and ``B`` with ``by_intersection_2`` driven by ``findpos_B1``."""
    return by_intersection_2(A, B, findpos=findpos_B1, cmp=cmp)

def BY_B2(A, B, cmp):
    """Intersect ``A`` and ``B`` with ``by_intersection_2`` driven by ``findpos_B2``."""
    return by_intersection_2(A, B, findpos=findpos_B2, cmp=cmp)


def BY_multi(lists, base_BY, cmp: "CmpCounter"):
    """Intersect several lists by folding a pairwise intersection over them.

    The lists are combined left to right, in the order given:
    ``base_BY(partial_result, next_list, cmp)`` is called for each further
    list, stopping early as soon as the partial result is empty.

    Returns ``[]`` for no input lists. For a single input list a copy is
    returned, so the caller never receives its own list object back.
    """
    if not lists:
        return []
    if len(lists) == 1:
        # Copy only here: with two or more lists the result already comes
        # from base_BY, so the folding path stays unchanged.
        return list(lists[0])
    res = lists[0]
    for nxt in lists[1:]:
        res = base_BY(res, nxt, cmp)
        if not res:
            break
    return res
def BY_multi_binaria(lists, cmp):
    """Intersect all of ``lists`` via ``BY_multi`` with ``BY_binaria`` as the pairwise step."""
    return BY_multi(lists, base_BY=BY_binaria, cmp=cmp)
def BY_multi_B1(lists, cmp):
    """Intersect all of ``lists`` via ``BY_multi`` with ``BY_B1`` as the pairwise step."""
    return BY_multi(lists, base_BY=BY_B1, cmp=cmp)
def BY_multi_B2(lists, cmp):
    """Intersect all of ``lists`` via ``BY_multi`` with ``BY_B2`` as the pairwise step."""
    return BY_multi(lists, base_BY=BY_B2, cmp=cmp)


def ME_melding(lists, base_intersection, cmp: "CmpCounter"):
    """Intersect several lists, always combining the shortest ones first.

    The input lists are ordered by length (ties keep their input order) and
    folded with ``base_intersection(partial_result, next_list, cmp)``,
    stopping early as soon as the partial result is empty.

    Returns ``[]`` for no input lists. For a single input list a copy is
    returned, so the caller never receives its own list object back.
    """
    if not lists:
        return []
    work = sorted(lists, key=len)
    if len(work) == 1:
        # Copy only here: with two or more lists the result already comes
        # from base_intersection, so the folding path stays unchanged.
        return list(work[0])
    curr = work[0]
    for nxt in work[1:]:
        curr = base_intersection(curr, nxt, cmp)
        if not curr:
            break
    return curr
def ME_BY_binaria(lists, cmp):
    """Intersect all of ``lists`` via ``ME_melding`` with ``BY_binaria`` as the pairwise step."""
    return ME_melding(lists, base_intersection=BY_binaria, cmp=cmp)


def BK_intersection(lists, findpos, cmp: CmpCounter):
    """Intersect ``k`` sorted lists by chasing a single candidate value.

    A candidate is looked up in every list in turn with
    ``findpos(lst, candidate, cursor, cmp)``, where ``cursor`` is that list's
    remembered position. Whenever a list holds a different value at the
    returned position, that value becomes the new candidate. A candidate that
    survives a whole pass over all lists is emitted, and the next element of
    the first list becomes the candidate. The search stops as soon as any
    list is exhausted.

    Returns ``[]`` if there are no lists or any list is empty, and a copy of
    the list when there is exactly one. Equality tests go through ``cmp.eq``.
    """
    for lst in lists:
        if len(lst) == 0:
            return []

    k = len(lists)
    if k == 0:
        return []
    if k == 1:
        return lists[0][:]

    cursors = [0] * k
    result = []
    first = lists[0]
    candidate = first[0]

    while True:
        streak = 0  # consecutive lists, ending at the current one, holding candidate
        for i, lst in enumerate(lists):
            pos = findpos(lst, candidate, cursors[i], cmp)
            if pos >= len(lst):
                return result
            cursors[i] = pos
            val = lst[pos]
            if cmp.eq(val, candidate):
                streak += 1
            else:
                # This list skipped past the candidate: adopt its value.
                candidate = val
                streak = 1

        if streak != k:
            continue

        # Every list agreed within this pass: emit and advance in the first list.
        result.append(candidate)
        cursors[0] += 1
        if cursors[0] >= len(first):
            return result
        candidate = first[cursors[0]]
def BK_binaria(lists, cmp):
    """Intersect all of ``lists`` with ``BK_intersection`` driven by ``findpos_binaria``."""
    return BK_intersection(lists, findpos=findpos_binaria, cmp=cmp)


def correr_experimento(familia, grupo_id, listas, algoname, algofn, reps=100):
    """Run ``algofn(listas, cmp)`` ``reps`` times and tabulate each run.

    A fresh ``CmpCounter`` is used for every repetition and the call is timed
    with ``medir_tiempo``. Returns a ``pandas.DataFrame`` with one row per
    repetition holding the family and group labels, the number of lists, the
    algorithm name, the repetition index, the elapsed seconds, the comparison
    count and the length of the resulting intersection.
    """
    def _una_repeticion(rep):
        # New counter each time so repetitions do not accumulate comparisons.
        contador = CmpCounter()
        interseccion, segundos = medir_tiempo(algofn, listas, contador)
        return {
            "familia": familia,
            "grupo": grupo_id,
            "k": len(listas),
            "algoritmo": algoname,
            "rep": rep,
            "tiempo_s": segundos,
            "comparaciones": contador.count,
            "len_inter": len(interseccion),
        }

    return pd.DataFrame([_una_repeticion(rep) for rep in range(reps)])


# Registry of the algorithms under comparison: display name -> callable that
# takes (lists, comparison counter). Insertion order is the execution order.
ALGORITMOS_5 = dict(
    ME=lambda L, cmp: ME_BY_binaria(L, cmp),
    BK=lambda L, cmp: BK_binaria(L, cmp),
    BY_binaria=lambda L, cmp: BY_multi_binaria(L, cmp),
    BY_B1=lambda L, cmp: BY_multi_B1(L, cmp),
    BY_B2=lambda L, cmp: BY_multi_B2(L, cmp),
)


def correr_todos(A, B, C, reps=100):
    """Run every algorithm in ``ALGORITMOS_5`` on every group of families A, B and C.

    Each (family, group, algorithm) combination is executed through
    ``correr_experimento`` with ``reps`` repetitions; the per-combination
    tables are concatenated into one DataFrame with a fresh index. Groups
    are numbered by their position within the family.
    """
    familias = (("A", A), ("B", B), ("C", C))
    tablas = [
        correr_experimento(nombre, gid, listas, algoname, algofn, reps)
        for nombre, grupos in familias
        for gid, listas in enumerate(grupos)
        for algoname, algofn in ALGORITMOS_5.items()
    ]
    return pd.concat(tablas, ignore_index=True)
# Full experiment: every algorithm on every group of the three families,
# 100 timed repetitions per combination.
df_res = correr_todos(A, B, C, reps=100)


# Preview of the first five result rows.
df_res.head(5)


# Summary per (family, algorithm): number of observations, mean and standard
# deviation of the running time, mean comparisons and mean intersection size,
# rounded for display.
resumen = (
    df_res
    .groupby(["familia", "algoritmo"])
    .agg(
        n_obs=pd.NamedAgg(column="tiempo_s", aggfunc="count"),
        tiempo_medio=pd.NamedAgg(column="tiempo_s", aggfunc="mean"),
        tiempo_std=pd.NamedAgg(column="tiempo_s", aggfunc="std"),
        comps_medias=pd.NamedAgg(column="comparaciones", aggfunc="mean"),
        inter_medias=pd.NamedAgg(column="len_inter", aggfunc="mean"),
    )
    .reset_index()
    .round({
        "tiempo_medio": 6,
        "tiempo_std": 6,
        "comps_medias": 1,
        "inter_medias": 2,
    })
)
resumen


# Keep only family A and draw one horizontal boxplot per metric, grouped by
# algorithm: running time on the left, comparison count on the right.
df_A = df_res.loc[df_res["familia"] == "A"]
metricas = [
    ("tiempo_s", "segundos", "Tiempo de intersección"),
    ("comparaciones", "número de comparaciones", "Comparaciones de intersección"),
]
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for j, (ax, (col, xlabel, titulo_base)) in enumerate(zip(axes, metricas)):
    df_A.boxplot(column=col, by="algoritmo", vert=False, ax=ax)
    ax.set_title(f"{titulo_base} - Familia A", fontsize=10)
    ax.set_xlabel(xlabel)
    # Only the left-hand panel carries the y-axis label.
    ax.set_ylabel("" if j else "algoritmo")
# Blank out the figure-level title that the grouped boxplot adds.
plt.suptitle("")
plt.tight_layout()
plt.show()


# Keep only family B and draw one horizontal boxplot per metric, grouped by
# algorithm: running time on the left, comparison count on the right.
df_B = df_res.loc[df_res["familia"] == "B"]
metricas = [
    ("tiempo_s", "segundos", "Tiempo de intersección"),
    ("comparaciones", "número de comparaciones", "Comparaciones de intersección"),
]
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for j, (ax, (col, xlabel, titulo_base)) in enumerate(zip(axes, metricas)):
    df_B.boxplot(column=col, by="algoritmo", vert=False, ax=ax)
    ax.set_title(f"{titulo_base} - Familia B", fontsize=10)
    ax.set_xlabel(xlabel)
    # Only the left-hand panel carries the y-axis label.
    ax.set_ylabel("" if j else "algoritmo")
# Blank out the figure-level title that the grouped boxplot adds.
plt.suptitle("")
plt.tight_layout()
plt.show()


# Keep only family C and draw one horizontal boxplot per metric, grouped by
# algorithm: running time on the left, comparison count on the right.
df_C = df_res.loc[df_res["familia"] == "C"]
metricas = [
    ("tiempo_s", "segundos", "Tiempo de intersección"),
    ("comparaciones", "número de comparaciones", "Comparaciones de intersección"),
]
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for j, (ax, (col, xlabel, titulo_base)) in enumerate(zip(axes, metricas)):
    df_C.boxplot(column=col, by="algoritmo", vert=False, ax=ax)
    ax.set_title(f"{titulo_base} - Familia C", fontsize=10)
    ax.set_xlabel(xlabel)
    # Only the left-hand panel carries the y-axis label.
    ax.set_ylabel("" if j else "algoritmo")
# Blank out the figure-level title that the grouped boxplot adds.
plt.suptitle("")
plt.tight_layout()
plt.show()

	familia	algoritmo	n_obs	tiempo_medio	tiempo_std	comps_medias	inter_medias
0	A	BK	20000	0.000396	0.000205	1986.3	18.14
1	A	BY_B1	20000	0.000386	0.000208	1547.2	18.14
2	A	BY_B2	20000	0.000343	0.000154	1553.1	18.14
3	A	BY_binaria	20000	0.000448	0.000286	2275.3	18.14
4	A	ME	20000	0.000257	0.000119	1209.6	18.14
5	B	BK	20000	0.001083	0.000572	5545.3	23.62
6	B	BY_B1	20000	0.002498	0.004175	9633.2	23.62
7	B	BY_B2	20000	0.002269	0.004067	9636.0	23.62
8	B	BY_binaria	20000	0.004138	0.010280	20825.4	23.62
9	B	ME	20000	0.000561	0.000326	2636.3	23.62
10	C	BK	20000	0.000559	0.000219	2879.0	7.75
11	C	BY_B1	20000	0.001049	0.001311	4372.3	7.75
12	C	BY_B2	20000	0.000953	0.001201	4298.8	7.75
13	C	BY_binaria	20000	0.001491	0.003193	7349.9	7.75
14	C	ME	20000	0.000299	0.000125	1494.5	7.75

PRÁCTICA 5A: REPORTE ESCRITO. EXPERIMENTOS Y ANÁLISIS DE ALGORITMOS DE INTERSECCIÓN DE CONJUNTOS.

Introducción

Conexión de Google Drive con Google Colab

Importación de librerías y carga de datos

Limpieza de datos

Contador de comparaciones y medición de tiempo

Algoritmos de búsqueda: binaria, B1 y B2

Intersección de dos listas con el algoritmo de Baeza–Yates

Variantes de Baeza–Yates según la búsqueda utilizada

Intersección de varias listas con Baeza–Yates

Intersección Melding (ME) con estrategia small-vs-small

Intersección de varias listas con el algoritmo de Barbay y Kenyon (BK)

Ejecución de un experimento

Definición de los algoritmos a comparar

Bucle principal de experimentos y construcción del DataFrame global

Tabla resumen por familia y algoritmo

Conclusiones

Referencias

	familia	k	algoritmo	rep	tiempo_s	comparaciones	len_inter
0	A	2	ME	0	0.000715	1564	2
1	A	2	ME	1	0.002145	1564	2
2	A	2	ME	2	0.000540	1564	2
3	A	2	ME	3	0.001504	1564	2
4	A	2	ME	4	0.000523	1564	2