enrich_with_existence()¶
Class: JobsAustriaCacheProcessRework
File: jobs_austria_cache_key_sync.py · line 40
Step 2: Check if url_hashes exist in the 'jobs' table and map jobs_id.
Signature¶
| Parameters | df_cache |
| Returns | pd.DataFrame |
| Async | No |
| Visibility | Public |
Implementation¶
def enrich_with_existence(self, df_cache: pd.DataFrame) -> pd.DataFrame:
    """Step 2: Check if url_hashes exist in the 'jobs' table and map jobs_id.

    Looks up every distinct, non-null ``url_hash`` of ``df_cache`` in the
    ``jobs`` table and left-joins the matching ``jobs.id`` back onto the
    cache rows as ``jobs_id``.

    Args:
        df_cache: Cache rows. Must contain a ``url_hash`` column unless the
            frame is empty.

    Returns:
        A new DataFrame with the columns of ``df_cache`` plus ``jobs_id``
        (missing where no job matched) and ``is_in_database`` (True where a
        match was found). Both columns are present even when no database
        lookup was necessary, so callers always see the same schema.
    """
    # Null hashes can never match a row in `jobs`, so keep them out of the
    # IN list instead of sending them to the database.
    unique_hashes = (
        []
        if df_cache.empty
        else df_cache['url_hash'].dropna().unique().tolist()
    )

    if not unique_hashes:
        # Nothing to look up: skip the round-trip, but still add the
        # enrichment columns so the result has the same shape as below.
        return df_cache.assign(jobs_id=pd.NA, is_in_database=False)

    # Local import keeps this block self-contained; `text` is already used
    # from SQLAlchemy by this module.
    from sqlalchemy import bindparam

    # An expanding bind parameter lets SQLAlchemy render one placeholder per
    # hash, instead of relying on the DB driver to adapt a Python tuple.
    jobs_query = text(
        "SELECT id as jobs_id, url_hash FROM jobs WHERE url_hash IN :hashes"
    ).bindparams(bindparam("hashes", expanding=True))

    with self.engine.connect() as connection:
        df_jobs = pd.read_sql(
            jobs_query, connection, params={"hashes": unique_hashes}
        )

    # Left join to see which hashes are already in the jobs table.
    # NOTE(review): if `jobs` holds several rows with the same url_hash, the
    # join repeats the cache row once per match -- confirm url_hash is unique.
    enriched_df = pd.merge(df_cache, df_jobs, on='url_hash', how='left')
    enriched_df['is_in_database'] = enriched_df['jobs_id'].notna()
    return enriched_df