Skip to content

synchronize_fk_id()

Class: JobsAustriaCacheSynchronizer
File: jobs_austria_cache_synchronizer.py · line 66

Matches scrape_cache rows to jobs via url_hash. Writes jobs.id back into scrape_cache.fk_job_id for every match. Returns df enriched with a 'jobs_id' column (NaN for unmatched rows).

Signature

Parameters df
Returns pd.DataFrame
Async No
Visibility Public

Implementation

def synchronize_fk_id(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Match scrape_cache rows to jobs via url_hash.

    Looks up ``jobs.id`` for every distinct non-null ``url_hash`` in *df*,
    writes those ids back into ``scrape_cache.fk_job_id`` for each match,
    and returns a new DataFrame enriched with a ``jobs_id`` column
    (NA for unmatched rows).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'url_hash' and 'scrape_cache_id' columns.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is never mutated) with an added 'jobs_id'
        column.
    """
    unique_hashes = df['url_hash'].dropna().unique().tolist()
    if not unique_hashes:
        # Fix: assign() returns a copy instead of mutating the caller's
        # frame in place, matching the non-empty path which also returns
        # a new (merged) frame.
        return df.assign(jobs_id=pd.NA)

    # Expanding bindparam turns the single :hashes placeholder into one
    # placeholder per list element at execution time.
    stmt = text(
        "SELECT id AS jobs_id, url_hash FROM jobs WHERE url_hash IN :hashes"
    ).bindparams(bindparam("hashes", expanding=True))

    with self.engine.connect() as conn:
        df_jobs = pd.read_sql(stmt, conn, params={"hashes": unique_hashes})

    # NOTE(review): if url_hash is not unique in jobs, this left merge
    # multiplies rows — presumably url_hash is unique; verify schema.
    df_enriched = pd.merge(df, df_jobs, on='url_hash', how='left')

    # Only rows that actually matched a job get written back.
    to_update = df_enriched[df_enriched['jobs_id'].notna()][['scrape_cache_id', 'jobs_id']]
    if not to_update.empty:
        self._bulk_update_scrape_cache_fk(to_update)
        logs.info(f"synchronize_fk_id: linked {len(to_update)} rows to jobs.")
    else:
        logs.info("synchronize_fk_id: no matching jobs found for this batch.")

    return df_enriched