A Few Python Projects
Mini Project 1: A utility for capturing browsing data.¶
This is the first part of an ongoing project to create an AI assistant that automatically looks at the websites you visit and curates what it thinks will be the most interesting content for you. This is just the data collection portion. The code below:
- Checks if you're using a supported browser (Chrome only for the alpha).
- If you are, takes a screenshot every 3 seconds and saves it as a low-res .png.
- Transcribes everything on the screen and saves it in a usable JSON format that includes location and size of text.
# Environment setup
import time
import json
from datetime import datetime

import psutil, win32process, win32gui
import pytesseract
from pytesseract import Output
from pynput.mouse import Listener
from PIL import ImageGrab

pytesseract.pytesseract.tesseract_cmd = r'C:\Users\tanne\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'

# Constants
SUPPORTED_APPS = ["chrome.exe", "something.else"]
SEC_GAP = 3
This first section of code contains the functions for capturing a screenshot and saving it, as well as transcribing the contents of the screen and saving them as JSON.
# Function that checks what the active program is.
def get_window():
    # GetWindowThreadProcessId returns (thread id, process id) for the foreground window
    pid = win32process.GetWindowThreadProcessId(win32gui.GetForegroundWindow())
    window = psutil.Process(pid[-1]).name()  # pid[-1] is the process id, which is the part we want
    return window
# Takes a screenshot and transcribes all text from it, then saves both. Includes all areas of all screens.
# I considered capturing just the running program, but the context of the whole screen might be helpful.
# Plus, you basically have to take the big screenshot, save it, open it, and crop it anyway. Too resource intensive.
def capture_screen_data():
    screenshot = ImageGrab.grab(all_screens=True)
    # Build a filename from the timestamp, with a little cleaning
    capture_name = str(datetime.now())
    capture_name = capture_name.replace(" ", "")
    capture_name = capture_name.replace(":", "")
    capture_name = capture_name.replace(".", "")
    screenshot_name = capture_name + ".png"
    text_name = capture_name + ".txt"
    # Extract text from the image with location data
    image_text = pytesseract.image_to_data(screenshot, output_type=Output.DICT)
    image_data = {'x': image_text['left'],
                  'y': image_text['top'],
                  'height': image_text['height'],
                  'text': image_text['text'],
                  'conf': image_text['conf'],
                  'original_dim': screenshot.size}
    with open(text_name, "w") as f:
        json.dump(image_data, f)
    # Resize to low-res and save
    screenshot = screenshot.resize((256, 256))
    screenshot.save(screenshot_name)
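To give a sense of what ends up in those .txt files: pytesseract's image_to_data output is a set of parallel lists, so every key in the saved JSON ('x', 'y', 'height', 'text', 'conf') is a list with one entry per detected word. As a rough illustration (the filename below is made up), you can load a capture back and keep only the words OCR was reasonably confident about:
import json
# Hypothetical filename produced by an earlier capture
with open("20220101120000000000.txt") as f:
    capture = json.load(f)
# Keep non-empty words with confidence above 60 (the threshold is arbitrary)
confident_words = [word for word, conf in zip(capture['text'], capture['conf'])
                   if word.strip() and float(conf) > 60]
print(" ".join(confident_words))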
Instead of creating the world's most unreadable function, we take the screenshot function from above and build up the conditions for taking screenshots over time by nesting it inside a few other functions.
- The first function checks whether the active program is a supported app.
- The second function loops continuously.
- The third and fourth functions add capture of click-stream data.
- The fifth function rolls it all together in a "main" function.
# Checks if the currently running program is a supported application for screenshots. Currently I'm only
# working on Chrome, but I'm trying to make it easy to extend in the future, at least to Safari, Edge, etc.
def conditional_screen():
    if get_window() in SUPPORTED_APPS:
        capture_screen_data()
# Loop the conditional screenshot.
def continuous_screen():
    while True:
        try:
            conditional_screen()
        except Exception:
            pass
        time.sleep(SEC_GAP)
# Append a click record to a text file.
def clicks_to_txt(data):
    with open("clicks.txt", "a") as f:
        f.write(str(data) + "\n")
# What to do when the mouse clicks. We'll end up throwing away clicks that aren't close to screenshots,
# but it's less overhead to capture all clicks than to conditionally check whether the running app is supported.
# pynput passes four arguments to the on_click callback, which is why we accept all of them even though
# button goes unused.
def capture_click(x, y, button, pressed):
    if pressed:
        click_time = datetime.now()
        clicks_to_txt(str(click_time) + "," + str(x) + "," + str(y))
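Each line in clicks.txt is just a timestamp followed by the x and y coordinates, comma-separated. As a small sketch (the matching of clicks to screenshots isn't shown here), the log can be read back like this:
from datetime import datetime
clicks = []
with open("clicks.txt") as f:
    for line in f:
        ts, x, y = line.strip().split(",")
        clicks.append((datetime.fromisoformat(ts), float(x), float(y)))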
# Captures all clicks and takes a screenshot once per time gap (in seconds).
def main():
    with Listener(on_click=capture_click) as listener:
        time.sleep(SEC_GAP)
        continuous_screen()
        # Apparently this .join() does nothing here. I'm not an expert on listeners, but it's always included
        # at the close of listeners from what I can tell. I think it never executes because we never break out
        # of the loop above, but I'm leaving it in case I close the loop in the future.
        listener.join()
All you have to do is run main() and the program will run in the background on your computer.
main()
Mini Project 2: GPT-3 API.¶
As part of the dashboards I'm designing for my dissertation, I use GPT-3's word embeddings to make sense of the relationships between class content and assessment data. This code takes the video lectures and assessment text from an introductory biology course and gets the GPT-3 word embeddings for them.
import openai
import os
import pandas as pd
import csv
openai.organization = "org-KiHC6XImLaviPNtf8dLLxfiV"
openai.api_key = os.getenv("OPENAI_API_KEY")
Run_Open_AI = False
word_chunk_size = 300
def get_embedding(text, engine="text-similarity-ada-001"):
    text = text.replace("\n", " ")
    return openai.Embedding.create(input=[text], engine=engine)['data'][0]['embedding']
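As a hypothetical usage example (it needs OPENAI_API_KEY set and makes a billable API call, so it's gated behind the same flag used below), get_embedding returns one fixed-length list of floats per input string:
if Run_Open_AI:
    # Made-up example sentence, just to show the return type
    vec = get_embedding("Photosynthesis converts light energy into chemical energy.")
    print(len(vec))   # length of the embedding vector
    print(vec[:5])    # first few components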
# Yield successive n-word chunks of a string.
def group_words(s, n):
    words = s.split()
    for i in range(0, len(words), n):
        yield ' '.join(words[i:i+n])
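For example, group_words just yields successive n-word windows:
list(group_words("the quick brown fox jumps over the lazy dog", 4))
# ['the quick brown fox', 'jumps over the lazy', 'dog']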
assessment_data = pd.read_csv("Bio100_Assessment_Text.csv")
video_lectures = pd.read_csv("Bio100_Video_Text.csv",encoding="cp1252")
if Run_Open_AI:
    assessment_embeddings = list()
    for text in assessment_data["clean_text"]:
        # Embed each chunk separately, then average the chunk embeddings element-wise into one document vector
        chunked_text = list(group_words(text, word_chunk_size))
        chunks_embeddings = list()
        for chunk in chunked_text:
            chunks_embeddings.append(get_embedding(chunk))
        assessment_embeddings.append([sum(sub_list) / len(sub_list) for sub_list in zip(*chunks_embeddings)])
    with open("Bio100_Assessment_Embeddings.csv", "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerows(assessment_embeddings)
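The zip(*chunks_embeddings) line above averages the chunk embeddings element-wise, so each document ends up with a single vector the same length as one embedding. A toy example with made-up 3-dimensional vectors shows what that list comprehension does:
# Made-up 3-dimensional "embeddings" for two chunks of one document
chunks_embeddings = [[1.0, 2.0, 3.0],
                     [3.0, 4.0, 5.0]]
doc_embedding = [sum(sub_list) / len(sub_list) for sub_list in zip(*chunks_embeddings)]
print(doc_embedding)   # [2.0, 3.0, 4.0]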
if Run_Open_AI:
    video_embeddings = list()
    for text in video_lectures["Text"]:
        chunked_text = list(group_words(text, word_chunk_size))
        chunks_embeddings = list()
        for chunk in chunked_text:
            chunks_embeddings.append(get_embedding(chunk))
        video_embeddings.append([sum(sub_list) / len(sub_list) for sub_list in zip(*chunks_embeddings)])
    with open("Bio100_Video_Embeddings.csv", "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerows(video_embeddings)
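As a sketch of how these files get used downstream (assuming the CSVs above have been written), the embeddings can be read back in and compared; cosine similarity is one straightforward choice:
import numpy as np
# The CSVs written above contain one embedding per row, with no header
assessment_vecs = pd.read_csv("Bio100_Assessment_Embeddings.csv", header=None).to_numpy()
video_vecs = pd.read_csv("Bio100_Video_Embeddings.csv", header=None).to_numpy()
def cosine_similarity(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
# Similarity between the first assessment item and the first video lecture
print(cosine_similarity(assessment_vecs[0], video_vecs[0]))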
Mini Project 3: A little Web Scraping.¶
For a class I TA'd, we needed data for students to perform some basic topical analysis on. I grabbed the 500 hottest posts from the Indiana University subreddit with the script below.
import praw

reddit = praw.Reddit(
    client_id="ID HERE",
    client_secret="ID HERE",
    user_agent="class project v0.1 u/Any-Fig-921",
)
IU_reddits = []
IU_reddits_text = []
# Pull the title and body text in a single pass so the two lists stay aligned
for submission in reddit.subreddit("IndianaUniversity").hot(limit=500):
    IU_reddits.append(submission.title)
    IU_reddits_text.append(submission.selftext)
import pandas as pd
data_tuples = list(zip(IU_reddits,IU_reddits_text))
Teacher_data = pd.DataFrame(data_tuples, columns=["Titles","Text"])
Teacher_data.to_csv("IU.csv")
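As a quick, purely illustrative check that the export worked, the CSV can be reloaded and inspected:
# Reload the saved file and peek at it
check = pd.read_csv("IU.csv", index_col=0)
print(check.shape)   # at most (500, 2), depending on how many posts the listing returned
print(check.head())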