Skip to content

enrich_with_existence()

Class: JobsAustriaCacheProcessRework
File: jobs_austria_cache_key_sync.py · line 40

Step 2: Check if url_hashes exist in the 'jobs' table and map jobs_id.

Signature

Parameters df_cache
Returns pd.DataFrame
Async No
Visibility Public

Implementation

def enrich_with_existence(self, df_cache: pd.DataFrame) -> pd.DataFrame:
    """Step 2: Check if url_hashes exist in the 'jobs' table and map jobs_id.

    Looks up every distinct, non-null ``url_hash`` of ``df_cache`` in the
    ``jobs`` table and left-joins the matching ``jobs.id`` back onto the
    cache rows.

    Args:
        df_cache: Cache rows; must contain a ``url_hash`` column unless the
            frame is empty.

    Returns:
        A new DataFrame holding the rows of ``df_cache`` plus two columns:
        ``jobs_id`` (missing where no job matched) and ``is_in_database``
        (True where ``jobs_id`` is present). Both columns are always
        present, including for empty input, so callers see one schema.
    """
    if df_cache.empty:
        lookup_hashes = []
    else:
        # Null hashes can never match a row through SQL ``IN``; keep them
        # out of the parameter list.
        lookup_hashes = df_cache['url_hash'].dropna().unique().tolist()

    if not lookup_hashes:
        # Nothing to look up: skip the round trip (an empty IN list is
        # invalid SQL on several backends) but still return the full schema.
        # NOTE(review): float64 mirrors what a left merge yields for integer
        # ids with gaps -- confirm ``jobs.id`` is numeric.
        return df_cache.assign(
            jobs_id=pd.Series(index=df_cache.index, dtype='float64'),
            is_in_database=lambda frame: frame['jobs_id'].notna(),
        )

    # Local import: only ``text`` is known to be in scope at module level.
    from sqlalchemy import bindparam

    # An expanding bind parameter renders one placeholder per value, so the
    # IN clause does not rely on driver-specific tuple adaptation.
    jobs_query = text(
        "SELECT id as jobs_id, url_hash FROM jobs WHERE url_hash IN :hashes"
    ).bindparams(bindparam("hashes", expanding=True))

    with self.engine.connect() as connection:
        df_jobs = pd.read_sql(jobs_query, connection, params={"hashes": lookup_hashes})

    # Left join keeps every cache row; unmatched rows get a missing jobs_id.
    enriched_df = df_cache.merge(df_jobs, on='url_hash', how='left')
    enriched_df['is_in_database'] = enriched_df['jobs_id'].notna()

    return enriched_df