
🤖 InternBot: A RAG-Powered Python Bot That Makes the Internship Search Easier

The job search can be stressful. Here’s a tool that can make things easier.

InternBot is a Python bot that finds relevant internship postings based on my resume and LinkedIn profile and tailors my existing cover letter to each of them, removing experience that might not be applicable and/or adding more suitable information found on my LinkedIn profile or resume. Every two weeks, I receive an email listing internships the bot thinks I might be interested in, with a tailored cover letter attached for each respective job listing.

This project uses RAG-based techniques. Retrieval-Augmented Generation, or RAG, is the process of incorporating external data into the response-generation process of a large language model. LLMs are only trained on data up to a certain point in time, and while more and more are being equipped with web-search capabilities to draw on up-to-date information, they still won’t have access to personal documents like a resume or LinkedIn profile. Giving InternBot the ability to access this kind of information allows for more personalized results than simply asking ChatGPT to do the task for us.
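
Conceptually, the RAG loop has three steps: retrieve relevant context, splice it into the prompt, and generate. Here is a minimal sketch of that pattern (retriever and llm are placeholders, not InternBot’s exact code; llm could be the openai_completion() helper defined below):

# Minimal RAG pattern (illustrative, not InternBot's exact code)
def rag_answer(question: str, retriever, llm) -> str:
    docs = retriever.get_relevant_documents(question)        # 1. retrieve external data
    context = "\n\n".join(doc.page_content for doc in docs)  # 2. splice it into the prompt
    prompt = f"Using only this context:\n{context}\n\nAnswer: {question}"
    return llm(prompt)                                       # 3. generate the response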

Before we run through an example of the bot in action, let’s break down the code!

import os
import re
import time
import pandas as pd
from datetime import date
from email.message import EmailMessage
import requests
import smtplib
from docx import Document
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor
from dotenv import load_dotenv
from openai import OpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document as LCDocument  # aliased to avoid clashing with docx.Document

load_dotenv()

OPENAI_KEY = os.getenv("OPENAI_KEY")
client = OpenAI(api_key=OPENAI_KEY)

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")

Import the required libraries, load the environment variables, and set up the OpenAI client and the Google Custom Search API key and search engine ID.
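
For reference, the .env file these calls read from would contain entries along these lines (the values here are placeholders; the email variables are used by send_email_with_docx() below):

OPENAI_KEY=sk-...
GOOGLE_API_KEY=AIza...
SEARCH_ENGINE_ID=0123456789abcdef0
EMAIL_ADDRESS=yourname@gmail.com
EMAIL_PASSWORD=your-gmail-app-password
TO_EMAIL=yourname@gmail.com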

# OpenAI text completion
def openai_completion(prompt: str) -> str:
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ]
    )
    return completion.choices[0].message.content

# OpenAI web search
def openai_web_search(query: str) -> str:
    completion = client.chat.completions.create(
        model="gpt-4o-search-preview",
        web_search_options={},
        messages=[{
            "role": "user",
            "content": query,
        }]
    )

    return completion.choices[0].message.content

# Return "Summer" + the current year or the next year depending on the current month the student is searching for a job
def job_year() -> str:
    today = date.today()
    current_month = today.month
    current_year = today.year

    # May to Dec
    if 5 <= current_month <= 12:
        return f"Summer {current_year + 1}"
    # Jan to Apr
    else:
        return f"Summer {current_year}"

# Return the recruitment cycle based on the month
def recruitment_cycle(month: int) -> str:
    if 8 <= month <= 10:
        return "Fall"
    elif month >= 11 or month <= 1:
        return "Winter"
    elif 2 <= month <= 4:
        return "Spring"
    else:
        return "Summer"

# Send an email with a docx attachment
def send_email_with_docx(file_path, subject):
    EMAIL_ADDRESS = os.getenv("EMAIL_ADDRESS")
    EMAIL_PASSWORD = os.getenv("EMAIL_PASSWORD") 
    TO_EMAIL = os.getenv("TO_EMAIL")  

    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = EMAIL_ADDRESS
    msg["To"] = TO_EMAIL
    msg.set_content("Biweekly Cover Letter!")

    # Attach the .docx file
    with open(file_path, "rb") as f:
        file_data = f.read()
        file_name = file_path.split("/")[-1]
        msg.add_attachment(file_data, maintype="application",
                           subtype="vnd.openxmlformats-officedocument.wordprocessingml.document",
                           filename=file_name)

    # Connect and send email
    with smtplib.SMTP_SSL("smtp.gmail.com", 465) as smtp:
        smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
        smtp.send_message(msg)

Define the following preliminary functions:

  • openai_completion(prompt): Uses OpenAI’s chat completions endpoint to generate a response to a given prompt
  • openai_web_search(query): Uses OpenAI’s web-search-enabled model to answer a query with up-to-date information from the web
  • job_year(): Returns "Summer" plus the year of the summer the student would be applying for, based on the current month
  • recruitment_cycle(month): Returns the recruitment cycle corresponding to the given month
  • send_email_with_docx(file_path, subject): Sends an email with a DOCX attachment (the cover letter, as we’ll see later on)
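
As a quick sanity check, here is what a couple of these helpers would return if run in, say, June 2025 (a hypothetical date, purely for illustration):

# Assuming today is June 2025:
print(job_year())            # "Summer 2026" (May–Dec targets next summer)
print(recruitment_cycle(6))  # "Summer"
print(recruitment_cycle(12)) # "Winter"
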
# Extract linkedin profile info
def linkedin_info() -> str:
    # Google search for linkedin profile url
    google_url = "https://www.googleapis.com/customsearch/v1"

    linkedin_url_query = "Teddy Porfiris linkedin profile"

    google_params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "q": linkedin_url_query
    }

    google_response = requests.get(google_url, params=google_params)
    google_results = google_response.json()

    linkedin_url_prompt = f"""
    Find the Teddy Porfiris linkedin profile url in the following block of text. Return only the full LinkedIn URL, and nothing else.

    {google_results}
    """

    profile_url = openai_completion(linkedin_url_prompt)

    print('--------------------------------profile_url--------------------------------')
    print(profile_url)

    # Gather information from linkedin profile
    linkedin_info_query = f"Teddy Porfiris {profile_url}"

    google_params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "q": linkedin_info_query
    }

    google_response = requests.get(google_url, params=google_params)
    google_results = google_response.json()

    # Clean the google results of the linkedin profile information
    clean_google_prompt = f"""
    You are a helpful assistant designed to extract structured information from LinkedIn profile summaries and related text.

    Given the following block of text, extract all relevant information related to:
    1. Education (degrees, schools, graduation years if available)
    2. Work experience (job titles, companies, durations, responsibilities)
    3. Skills (technical or soft skills mentioned)
    4. Most recent projects or publications (titles, topics, dates if any)

    If a specific piece of information is not available (e.g., degree, location, duration), do not include that field in the output. Only include fields with clearly identifiable information.

    The output should be a simple string text block with the information.

    Text:
    \"\"\"{google_results}\"\"\"
    """

    linkedin_info = openai_completion(clean_google_prompt)
    print('--------------------------------linkedin_info--------------------------------')
    print(linkedin_info)

    return linkedin_info

The linkedin_info() function uses the Google search API (the Custom Search API, specifically) to find the LinkedIn profile URL for a given name and gather information from the page. Not all of the retrieved information is relevant, however, so the raw results are passed to the openai_completion() function so that only the student’s education, work experience, skills, and projects remain.
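
Worth noting: the Custom Search response is JSON with an items list in which each result carries title, link, and snippet fields, so in principle the profile URL could be pulled out without an LLM call. Here is a sketch of that alternative (assuming google_results is the parsed JSON from above); the LLM extraction used in the bot is simply more forgiving when the first result isn’t the profile:

# Alternative to the LLM extraction: take the first result whose link is a LinkedIn profile URL
profile_url = next(
    (item["link"] for item in google_results.get("items", [])
     if "linkedin.com/in/" in item["link"]),
    None,
)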

# Extract resume info
def resume_info() -> str:
    # Load resume docx
    doc = Document("Teddy_Porfiris_Resume.docx")

    text = []

    # Extract paragraphs
    text.extend([para.text for para in doc.paragraphs if para.text.strip() != ""])

    # Extract text from tables
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                text.append(cell.text.strip())

    text = "\n".join(text)

    return text

The resume_info() function is similar to linkedin_info() in that it extracts and returns the content found on a resume, in this case from a DOCX file.
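
Downstream, the two sources can simply be concatenated into a single personal_info string before query generation. Here is a sketch of the glue (hypothetical wiring, not the bot’s exact main loop):

# Hypothetical glue code: combine both sources into one profile string
personal_info = f"""
LINKEDIN PROFILE:
{linkedin_info()}

RESUME:
{resume_info()}
"""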

# Return job search queries based on personal info (ex. linkedin profile, resume, etc.)
def job_search_queries_personal_info(personal_info: str) -> list[str]:
    job_search_query_prompt = f"""
    You are an intelligent assistant that helps students find relevant internship opportunities based on their personal information and past experience.
    Given the following structured profile data, generate 1 to 2 concise job search queries that a student can type into a search engine or job board to find internships that match their background.
    Profile: 

    {personal_info}

    Instructions:
    Focus only on {job_year()} internship opportunities
    Use keywords from their past job titles or responsibilities
    Align with their degree or field of study
    Keep each query short, focused, and to the point (5–6 words max)
    Include relevant job titles or roles (e.g., "accounting intern", "property management internship")
    Optimize for job listings posted directly on company websites.
    Return only the list of search queries, one per line with no dashes, numbers, or bullet points. Just plain text.
    Format as keyword-style queries, not sentences. Do not include connecting words like “at,” “for,” “with,” etc. (e.g., write finance data internship summer 2026, not Finance internship at a data firm)
    Do not include school names/programs, class year, or verbose phrases like “student at...”
    """
    
    job_search_queries_string = openai_completion(job_search_query_prompt)
    job_search_queries_list = job_search_queries_string.strip().split('\n')

    print('--------------------------------job_search_queries_list--------------------------------')
    print(job_search_queries_list)

    return job_search_queries_list

The job_search_queries_personal_info() function takes in information about a student (the content previously retrieved from their LinkedIn profile or resume) and uses the OpenAI API to generate relevant search queries for jobs/fields that might appeal to the student.
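
Given the prompt’s formatting rules, job_search_queries_list ends up as short keyword-style strings, something like this (hypothetical output for a finance-leaning profile):

# Hypothetical output for a finance-leaning profile:
# job_search_queries_list == [
#     "finance data internship summer 2026",
#     "investment analyst intern summer 2026",
# ]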

# Create a vector store of past job listings
past_job_listings_df = pd.read_csv("fake_job_listings.csv")

# Convert each row to a descriptive string
past_job_listings_texts = past_job_listings_df.apply(lambda row: f"Company: {row['Company Name']} | Industry: {row['Industry']} | Job: {row['Job Listing']} | Opens: {row['Day Application Opens']}", axis=1).tolist()

# Create LangChain Document objects (LCDocument, to avoid clashing with docx.Document)
documents = [LCDocument(page_content=text) for text in past_job_listings_texts]

# Define the embedding model before building the store
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create Chroma vector store
vectorstore = Chroma.from_documents(documents, embedding_model, persist_directory="job_listings_chroma_store")

# Save the store to disk
vectorstore.persist()

# Load the vector store
past_job_listings_vectorstore = Chroma(persist_directory="job_listings_chroma_store", embedding_function=embedding_model)
retriever = past_job_listings_vectorstore.as_retriever(search_kwargs={"k": 1})

# Return job search queries based on relevant past job listings and when they were posted
def job_search_queries_vector_store(personal_info: str) -> list[str]:
    vectorstore_query_prompt = f"""
    You are an intelligent assistant that generates vectorstore queries based on a past job experience.
    Given the following information from someone's personal information, find the past job experience and generate a vectorstore query that will return job listings related to that past job experience.

    {personal_info}
    """

    vectorstore_query = openai_completion(vectorstore_query_prompt)
    print('--------------------------------vectorstore_query--------------------------------')
    print(vectorstore_query)

    vectorstore_results = retriever.get_relevant_documents(vectorstore_query)

    # Extract company name, industry, and date the job listing was posted
    job_listings = []
    for doc in vectorstore_results:
        print('--------------------------------vectorstore_results--------------------------------')
        print(doc.page_content)

        pattern = r"Company:\s*(.*?)\s*\|\s*Industry:\s*(.*?)\s*\|\s*Job:.*?\|\s*Opens:\s*(\d{4}-\d{2}-\d{2})"
        match = re.search(pattern, doc.page_content)

        if match:
            company = match.group(1)
            industry = match.group(2)
            date_open = match.group(3)
        else:
            print("No match found.")
        

        # Determine the recruitment cycle this job listing was posted in last year
        past_job_month = int(date_open.split('-')[1])

        past_job_recruitment_cycle = recruitment_cycle(past_job_month)

        current_recruitment_cycle = recruitment_cycle(date.today().month)
        
        # If this job listing was posted in the same recruitment cycle as the one we are currently in, search for job listings from this company (the company might have released a new listing since they released the previous listing around this time last year)
        if past_job_recruitment_cycle == current_recruitment_cycle:
            job_listings.append(f"{company} {industry} internship {job_year()}")
        else:
            print("No match found.")
    
    return job_listings

Often, universities provide students with documents and spreadsheets containing data on previous job listings and other recruitment info. It is helpful to know when the industries a student is interested in released internship postings in the past; if we are currently in the same recruitment cycle, we can search for those types of jobs again.

To illustrate this, I created a sample CSV file containing historical job listing data of fake companies. Each row includes the company name, industry, job title, and the application opening date. The dataset entries are converted into embeddings, which are numerical representations of text that retain its meaning. These embeddings are stored in a ChromaDB vector store called job_listings_chroma_store.
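
A few illustrative rows from the (entirely made-up) CSV, matching the columns the code reads:

Company Name,Industry,Job Listing,Day Application Opens
Acme Capital,Finance,Quantitative Research Intern,2024-09-15
Orbit Analytics,Data Science,Data Science Intern,2024-11-02
Harbor Health,Healthcare,Operations Intern,2025-02-10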

Within the job_search_queries_vector_store() function, a vector store query is generated from the student’s past experience. The query is embedded and used to search the vector database, which returns the job listings most relevant to the student by finding the closest matches in the high-dimensional embedding space.
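
Under the hood, the stored texts and the query pass through the same embedding model; all-MiniLM-L6-v2 maps each string to a 384-dimensional vector, and “closeness” is measured between those vectors. A quick way to see this (illustrative only):

# Illustrative: embed a query with the same model the store uses
vec = embedding_model.embed_query("finance internship listings")
print(len(vec))  # 384 dimensions for all-MiniLM-L6-v2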

Once relevant entries are retrieved, the recruitment_cycle() function determines whether the listing was posted in the same recruitment cycle as the current one. If so, we later check whether the company has released a new listing this year.

# Use the Google Search API to find listing URLs, then pass them to the OpenAI web search API to pull information from the job listing pages
def google_job_listing_info(job_search_queries_list: list[str]) -> list[str]:
    job_search_results = []
    # Perform Google search for each job search query
    for query in job_search_queries_list:
        print(f"Searching: {query}")

        google_url = "https://www.googleapis.com/customsearch/v1"
        google_params = {
            "key": GOOGLE_API_KEY,
            "cx": SEARCH_ENGINE_ID,
            "q": query,
            "num": 2
        }

        try:
            response = requests.get(google_url, params=google_params)
            data = response.json()

            job_search_results.append({
                "query": query,
                "result": data
            })
        except Exception as e:
            print(f"Error for query: {query} → {e}")
            job_search_results.append({
                "query": query,
                "result": "Request failed or rate limited."
            })

        time.sleep(3)  # Add a 3-second delay between requests to avoid rate limiting

    # Extract the URLs of the job listings
    job_url_prompt = f"""
    You are an intelligent assistant that helps students find relevant internship opportunities based on their personal information and past experience.
    Given the following structured job search results, extract the URLs of the job listings. The output should be a string of the URLs, one per line with no dashes, numbers, or bullet points. Just plain text.
    Ignore any job listings from Indeed or LinkedIn. Only include job listings from company websites. Ensure each URL links to a specific job listing.

    {job_search_results}
    """
    job_urls = openai_completion(job_url_prompt) # string of urls

    print('--------------------------------job_urls--------------------------------')
    print(job_urls)

    job_urls_list = job_urls.strip().split('\n') # list of urls

    # Filter out LinkedIn, Indeed, ZipRecruiter URLs (these sites block search APIs and don't yield accurate results)
    job_urls_list = [url for url in job_urls_list if "linkedin.com" not in url and "indeed.com" not in url and "ziprecruiter.com" not in url]

    # Use the openai web search api to extract the information from the job listing pages
    job_listing_info_list = []
    for url in job_urls_list:
        job_listing_info_prompt = f"""
        You are a helpful assistant that analyzes the contents of a webpage to determine the type of job-related content it contains and, if applicable, extract relevant information.

        Given the following job listing url, your task is to extract all information about the job listing. If the page says the application is closed, do not return anything.

        Your output should be a string of the following format:
        1. The url I have provided (formatted as "JOB LISTING URL: <url>")
        2. All of the extracted information from the job listing

        The url is:
        {url}
        """
        job_listing_info_list.append(openai_web_search(job_listing_info_prompt))
        time.sleep(3)

    # Remove information that is not about a specific job listing or that contains incomplete information.
    # Build a filtered list instead of calling .remove() while iterating, which would skip elements.
    filtered_job_listing_info_list = []
    for job_listing_info in job_listing_info_list:
        filter_job_listing_info_prompt = f"""
        You are an expert job listing classifier. Based on the provided text content of a webpage, do the following:
        1. Check if the job listing information is less than 6 sentences. If it is, return "Not a specific listing".
        2. Check if the information is not of a specific job listing, but a description of multiple job listings and/or the company internship program. If it is, return "Not a specific listing".
        3. Check if the job listing is no longer available or has closed. If it is, return "Not a specific listing".
        If none of the above conditions are met, return "Specific listing".

        The content:
        {job_listing_info}
        """

        job_listing_info_classification = openai_completion(filter_job_listing_info_prompt)

        if job_listing_info_classification.strip() != "Not a specific listing":
            filtered_job_listing_info_list.append(job_listing_info)

    return filtered_job_listing_info_list