A Few Python Projects
Mini Project 1: A utility for capturing browsing data.
This is the first part of an ongoing project to create an AI assistant that automatically looks at the websites you visit and curates what it thinks will be the most interesting content for you. This is just the data collection portion. The code below:
- Checks if you're using a supported browser (just working with chrome for the alpha).
- If you are, takes a screenshot every 3 seconds and saves it as a low-res .png.
- Transcribes everything on the screen and saves it in a usable JSON format that includes location and size of text.
#Environment Setup
import time
import psutil, win32process, win32gui
import pytesseract
import json
from pytesseract import Output
pytesseract.pytesseract.tesseract_cmd=r'C:\Users\tanne\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
from datetime import datetime
from turtle import onclick
from pynput.mouse import Listener 
from PIL import ImageGrab
#Constants
#Process names that trigger a capture; compared against the name psutil reports for the foreground window's process.
#NOTE(review): "something.else" looks like a placeholder for a future browser - confirm before relying on it.
SUPPORTED_APPS = ["chrome.exe","something.else"]
#Seconds slept between capture attempts (also used as the start-up delay in main).
SEC_GAP = 3
This first section of code contains the function for capturing a screenshot and saving it, as well as transcribing the content of the screen and saving it as JSON.
#Function that checks what the active program is.
def get_window():
    """Return the process name (e.g. "chrome.exe") that owns the foreground window."""
    foreground = win32gui.GetForegroundWindow()
    #GetWindowThreadProcessId returns a (thread id, process id) pair, so the
    #last element is the id of the process that created the window.
    ids = win32process.GetWindowThreadProcessId(foreground)
    owner = psutil.Process(ids[-1])
    return owner.name()
#Takes a screenshot, transcribes all text from the screenshot, and saves both. Includes all areas of all screens.
#Capturing only the running program was considered, but the context of the whole screen might be helpful,
#and cropping would mean taking the big screenshot anyway (grab, save, open, crop), which is too resource intensive.
def capture_screen_data():
    """Grab every screen, OCR it, and save the OCR data and a thumbnail to disk.

    Two files are written to the current working directory, both named after
    the capture time with spaces, colons and dots stripped:

    - ``<timestamp>.txt``: JSON with parallel lists ``x``, ``y``, ``width``,
      ``height``, ``text`` and ``conf`` (one entry per OCR element) plus
      ``original_dim``, the (width, height) of the full-resolution capture.
    - ``<timestamp>.png``: the capture squashed to 256x256 pixels.

    Returns:
        None.
    """
    screenshot = ImageGrab.grab(all_screens=True)
    #Build a file-name-safe timestamp by dropping spaces, colons and dots in one pass.
    capture_name = str(datetime.now()).translate(str.maketrans("", "", " :."))
    screenshot_name = capture_name + ".png"
    text_name = capture_name + ".txt"
    #Extract text from image with location data. OCR runs on the full-resolution
    #image, so all coordinates are relative to 'original_dim', not the thumbnail.
    ocr = pytesseract.image_to_data(screenshot, output_type=Output.DICT)
    image_data = {
        'x': ocr['left'],
        'y': ocr['top'],
        #Width is stored next to height so the full size of each text box is kept.
        'width': ocr['width'],
        'height': ocr['height'],
        'text': ocr['text'],
        'conf': ocr["conf"],
        'original_dim': screenshot.size,
    }
    with open(text_name, "w") as f:
        json.dump(image_data, f)
    #Resize and save. The aspect ratio is not preserved; the image is only a low-res context thumbnail.
    screenshot = screenshot.resize((256,256))
    screenshot.save(screenshot_name)
Instead of creating the world's most unreadable function, we take the screenshot function from above and build up the conditions for taking screenshots over time by nesting it in a couple of other functions.
- First function checks if they're in a supported app
- Second function loops continuously
- Third and fourth functions add the capturing of click-stream data
- Fifth function rolls it all together in a "main" function.
#Only capture when the foreground program is one we support. Chrome is the only real target for now;
#keeping the check against a list makes it easy to add Safari, Edge etc. later.
def conditional_screen():
    """Run one screen capture if the active program is listed in SUPPORTED_APPS."""
    active_app = get_window()
    if active_app not in SUPPORTED_APPS:
        return
    capture_screen_data()
#Loop of conditional screenshot.
def continous_screen():
    """Attempt a conditional screen capture every SEC_GAP seconds, forever.

    A failed attempt (for example the foreground process exiting mid-lookup,
    or an OCR/disk error) must not stop the collector, so any Exception is
    logged with its traceback and the loop carries on. This function never
    returns normally.
    """
    import logging  #Local import: keeps this function self-contained.

    log = logging.getLogger(__name__)
    while True:
        try:
            conditional_screen()
        except Exception:
            #Best effort: keep running, but leave a trace instead of hiding the failure.
            log.exception("Screen capture attempt failed; retrying in %s seconds", SEC_GAP)
        time.sleep(SEC_GAP)
def clicks_to_txt(data):
    """Append str(data) as a single line to clicks.txt in the working directory."""
    with open("clicks.txt", "a") as click_log:
        #print() converts with str() and adds the trailing newline.
        print(data, file=click_log)
#Mouse-click handler. Clicks that aren't close to a screenshot get thrown away later;
#recording every click is less overhead than checking the running app on each one.
#The listener calls this with four arguments, so all four must be accepted even though button is not used.
def capture_click(x, y, button, pressed):
    """Record "<timestamp>,<x>,<y>" for a button press; releases are ignored."""
    if not pressed:
        return
    fields = (datetime.now(), x, y)
    clicks_to_txt(",".join(str(field) for field in fields))
#Entry point: records every click and attempts a screenshot once per time gap (in seconds).
def main():
    """Run the click listener and the capture loop together.

    The click listener is active for the duration of the ``with`` block. After
    an initial SEC_GAP pause, the capture loop takes over the calling thread.
    """
    with Listener(on_click = capture_click) as click_listener:
        time.sleep(SEC_GAP)
        continous_screen()
        #continous_screen() loops forever, so the join below is not reached today.
        #It is kept because joining is the usual way to wait on a listener, and it
        #becomes relevant if the capture loop ever gains an exit condition.
        click_listener.join()
All you have to do is run "main" and the program will run in the background of your computer.
#Start collecting. The capture loop inside main() runs forever, so this call does not return on its own.
main()
Mini Project 2: GPT-3 API.
As part of the dashboards I'm designing for my dissertation, I use GPT-3's word embeddings to make sense of the relationships between class content and assessment data. This code takes the video lectures and assessment text from an introductory biology course and gets the GPT-3 word embeddings for them.
import openai
import os
import pandas as pd
import csv
#OpenAI organization ID for API requests; the API key is read from the OPENAI_API_KEY environment variable.
openai.organization = "org-KiHC6XImLaviPNtf8dLLxfiV"
openai.api_key = os.getenv("OPENAI_API_KEY")
#Safety switch: the embedding/CSV-writing sections below are skipped unless this is True.
Run_Open_AI = False
#Intended number of words per chunk of text when requesting embeddings.
word_chunk_size = 300
def get_embedding(text, engine="text-similarity-ada-001"):
    """Return the embedding vector the OpenAI API produces for *text*.

    Newlines in *text* are replaced with spaces before the request is made.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input = [single_line], engine=engine)
    #One input string was sent, so the vector sits in the first data entry.
    return response['data'][0]['embedding']
def group_words(s, n):
    """Lazily yield *s* as space-joined chunks of at most *n* whitespace-separated words."""
    tokens = s.split()
    chunk_starts = range(0, len(tokens), n)
    yield from (' '.join(tokens[start:start + n]) for start in chunk_starts)
assessment_data = pd.read_csv("Bio100_Assessment_Text.csv")
video_lectures = pd.read_csv("Bio100_Video_Text.csv",encoding="cp1252")


def _mean_chunk_embedding(text, chunk_size):
    """Embed *text* in chunks of *chunk_size* words and average the vectors.

    Every chunk counts equally in the element-wise mean, including a shorter
    final chunk. Text with no words produces an empty list.
    """
    chunk_vectors = [get_embedding(chunk) for chunk in group_words(text, chunk_size)]
    #zip(*...) lines up the same dimension across all chunk vectors.
    return [sum(dimension) / len(dimension) for dimension in zip(*chunk_vectors)]


def _embed_and_save(texts, csv_path):
    """Embed every entry of *texts*, write one CSV row per entry, and return the rows."""
    rows = [_mean_chunk_embedding(text, word_chunk_size) for text in texts]
    with open(csv_path, "w", newline='') as f:
        csv.writer(f).writerows(rows)
    return rows


#Both data sets go through the same pipeline; only the source column and output file differ.
if Run_Open_AI:
    assessment_embeddings = _embed_and_save(
        assessment_data["clean_text"], "Bio100_Assessment_Embeddings.csv"
    )
    video_embeddings = _embed_and_save(
        video_lectures["Text"], "Bio100_Video_Embeddings.csv"
    )
Mini Project 3: A little Web Scraping.
For a class I TA'd, we needed data for students to perform some basic topical analysis on. I grabbed the last 500 Reddit posts from the Indiana University subreddit with the script below.
import praw
#Reddit API client used by the scraping code below.
#The "ID HERE" strings are placeholders - presumably to be replaced with the app's real client id and secret before running.
reddit = praw.Reddit(
    client_id="ID HERE",
    client_secret="ID HERE",
    user_agent="class project v0.1 u/Any-Fig-921",
)
Version 7.3.0 of praw is outdated. Version 7.4.0 was released 3 days ago.
#Collect title and body text in a single pass over the listing. Fetching the
#listing once per field doubles the API traffic and can pair a title with the
#wrong body if the "hot" ranking shifts between the two passes.
IU_reddits=[]
IU_reddits_text=[]
for submission in reddit.subreddit("IndianaUniversity").hot(limit=500):
    IU_reddits.append(submission.title)
    IU_reddits_text.append(submission.selftext)
import pandas as pd
#One row per post: title plus self-text (empty for link-only posts).
data_tuples = list(zip(IU_reddits,IU_reddits_text))
Teacher_data = pd.DataFrame(data_tuples, columns=["Titles","Text"])
Teacher_data.to_csv("IU.csv")
