class QuesAnsVirtualAssistant:
    """
    Placeholder for a question-answering assistant; it defines no
    attributes or methods.

    The data it is meant to hold is a table shaped like
        id | Question | Answer

    which in code is a mapping of the form
        {
            id: {question: ..., answer: ...}
        }
    """
14+
15+
16+
17+
18+
19+
20+
21+
22+
23+ import sqlite3
24+ import json
25+ import pandas as pd
26+ import sklearn
27+ from sklearn .feature_extraction .text import TfidfVectorizer
28+
class QuestionAnswerVirtualAssistant:
    """
    Used for automatic question-answering

    It works by building a reverse index that maps every word of an
    indexed question to the ids of the questions containing it. To
    answer a user question, the id lists of the known words are
    intersected, the surviving questions are ranked by TF-IDF score,
    and the answer stored with the best-ranked question is returned.

    Storage is a sqlite3 database with two tables:
    - IdToQuesAns(id, question, answer): the indexed question/answer pairs
    - WordToId(name, value): one row named "index" whose value is the
      JSON-encoded reverse index {word: [id, ...]}

    Questions, answers, index words and query words are all lower-cased,
    so matching is case-insensitive.
    """

    def __init__(self, db_path="virtualassistant.sqlite3"):
        """
        Returns - None
        Input - str: path of the sqlite3 database file (":memory:" gives
                a throw-away database); defaults to the original
                hard-coded file name
        ----------
        - Initialize database. we use sqlite3
        - Create the tables and the empty reverse index row if any of
          them is missing
        - maintain a class level access to the database
          connection object
        """
        # isolation_level=None keeps the connection in autocommit mode and,
        # unlike the `autocommit` keyword, is accepted before Python 3.12.
        self.conn = sqlite3.connect(db_path, isolation_level=None)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS "
            "IdToQuesAns(id INTEGER PRIMARY KEY, question TEXT, answer TEXT)"
        )
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS WordToId (name TEXT, value TEXT)"
        )
        # Checked separately from the tables so that a database left without
        # the row is repaired instead of failing on the first lookup.
        index_row = self.conn.execute(
            "SELECT 1 FROM WordToId WHERE name='index'"
        ).fetchone()
        if index_row is None:
            self.conn.execute(
                "INSERT INTO WordToId VALUES (?, ?)", ("index", "{}")
            )

    def close(self):
        """
        Returns - None
        Input - None
        ----------
        - close the class-level database connection
        """
        self.conn.close()

    @staticmethod
    def _tokenize(text):
        """
        Returns - list[str]: the lower-cased, whitespace-separated words
        Input - str: any text
        ----------
        Single definition of "a word", shared by indexing and searching
        so both sides always agree on case and on splitting.
        """
        return text.lower().split()

    def _load_reverse_index(self):
        """
        Returns - dict: the reverse index {word: [id, ...]}
        Input - None
        ----------
        - read the JSON stored in the "index" row of WordToId and decode it
        """
        row = self.conn.execute(
            "SELECT value FROM WordToId WHERE name='index'"
        ).fetchone()
        return json.loads(row[0])

    def index_question_answer(self, question, answer):
        """
        Returns - string: "index successful"
        Input - str: question, str: answer
        ----------
        Indexes the question and answer. It does this by performing two
        operations - add the question and answer to the IdToQuesAns, then
        adds the words in the question to WordToId
        - lower-cases the question and answer and stores them in
          IdToQuesAns
        - retrieves the id of the inserted ques-answer
        - adds that id under every word of the lower-cased question in
          the reverse index, unless the id is already listed for the word
        - writes the updated reverse index back to WordToId
        """
        question = question.lower()
        row_id = self._add_to_IdToQuesAns(question, answer.lower())

        reverse_idx = self._load_reverse_index()
        # Index the same lower-cased text that was stored, so the index
        # keys match what find_questions looks up.
        for word in self._tokenize(question):
            ids = reverse_idx.setdefault(word, [])
            if row_id not in ids:
                ids.append(row_id)

        self.conn.execute(
            "UPDATE WordToId SET value = (?) WHERE name='index'",
            (json.dumps(reverse_idx),),
        )
        return "index successful"

    def _add_to_IdToQuesAns(self, question, answer):
        """
        Returns - int: the id of the inserted row
        Input - str: question, str: answer
        ---------
        - use the class-level connection object to insert the pair
          into the db
        - retrieve and return the row id of the inserted pair
        """
        cur = self.conn.execute(
            "INSERT INTO IdToQuesAns (question, answer) VALUES (?, ?)",
            (question, answer),
        )
        return cur.lastrowid

    def find_questions(self, user_input):
        """
        Returns - list[tuple]: (id, question, answer) rows of the indexed
                  questions that contain every known word of the input;
                  [] when no word is known or no question has them all
        Input - str: a string of words called `user_input`, expected to be a question
        ---------
        - retrieve the reverse index
        - collect the id list of every input word present in the index
          (words that were never indexed are ignored)
        - intersect the id lists
        - return the rows for the remaining ids
        """
        reverse_idx = self._load_reverse_index()
        id_lists = [
            reverse_idx[term]
            for term in self._tokenize(user_input)
            if term in reverse_idx
        ]
        if not id_lists:  # none of the words has been indexed
            return []

        common_ids = set(id_lists[0]).intersection(*id_lists[1:])
        if not common_ids:  # no single question contains all the words
            return []

        return self._find_questions_with_idx(common_ids)

    def _find_questions_with_idx(self, idxs):
        """
        Returns - list[tuple]: (id, question, answer) rows, ordered by id
        Input - iterable of idxs
        ---------
        - use the class-level connection object to retrieve the rows
          whose id is in the input idxs
        - return these rows as a list ([] for empty idxs)
        """
        idxs = list(idxs)
        if not idxs:
            return []
        placeholders = ",".join("?" * len(idxs))
        sql = (
            "SELECT id, question, answer FROM IdToQuesAns "
            "WHERE id in ({seq}) ORDER BY id"
        ).format(seq=placeholders)
        return self.conn.execute(sql, idxs).fetchall()

    def find_most_matched_question(self, user_input, corpus):
        """
        Returns - list: [score, most_matching_question]
        Input - user_input, and list of matching questions called corpus
        Raises - ValueError when corpus is empty
        ---------
        - compute the tfidf weights of the corpus
        - score every question as the sum of the weights of the
          user_input words it contains
        - return the highest scoring pair (the earliest one on a tie)
        """
        if not corpus:
            raise ValueError("corpus must contain at least one question")

        vectorizer = TfidfVectorizer()
        tfidf_scores = vectorizer.fit_transform(corpus)
        # {term: {row position in corpus: weight}}
        tfidf_dict = pd.DataFrame(
            tfidf_scores.toarray(),
            columns=vectorizer.get_feature_names_out(),
        ).to_dict()

        ranked = [[0, question] for question in corpus]
        # The vectorizer lower-cases its vocabulary by default, so the query
        # words are lower-cased too before they are looked up.
        for term in self._tokenize(user_input):
            weights = tfidf_dict.get(term)
            if weights is None:
                continue
            for position, entry in enumerate(ranked):
                entry[0] += weights[position]

        # Pick the best score rather than whichever question came first.
        return max(ranked, key=lambda entry: entry[0])

    def provide_answer(self, user_input):
        """
        Returns - str: the answer to the user_input, or None when no
                  indexed question matches
        Input - str: user_input
        ---------
        - use the user_input to get the list of matching questions
        - return None straight away if there is none
        - create a corpus which is a list of all matching questions
        - create a question_map that maps questions to their respective answers
        - use the user_input and corpus to find the most matching question
        - return the answer that matches that question from the question_map
        """
        matching_questions = self.find_questions(user_input)
        if not matching_questions:
            return None

        corpus = [question for _, question, _ in matching_questions]
        question_map = {
            question: answer for _, question, answer in matching_questions
        }
        _, most_matching_question = self.find_most_matched_question(
            user_input, corpus
        )
        return question_map[most_matching_question]
194+
195+
if __name__ == "__main__":
    # Small demo: index a handful of question/answer pairs, then query them.
    assistant = QuestionAnswerVirtualAssistant()

    demo_pairs = (
        (
            "What are the different types of competitions available on Kaggle",
            "Types of Competitions Kaggle Competitions are designed to provide challenges for competitors",
        ),
        (
            "How to form, manage, and disband teams in a competition",
            "Everyone that competes in a Competition does so as a team. A team is a group of one or more users",
        ),
        (
            "What is Data Leakage",
            "Data Leakage is the presence of unexpected additional information in the training data",
        ),
        (
            "How does Kaggle handle cheating",
            "Cheating is not taken lightly on Kaggle. We monitor our compliance account",
        ),
    )
    for position, (demo_question, demo_answer) in enumerate(demo_pairs):
        status = assistant.index_question_answer(demo_question, demo_answer)
        # Only the status of the second pair is echoed.
        if position == 1:
            print(status)

    for query in (
        "state Kaggle cheating policy",
        "Tell me what is data leakage",
    ):
        print(assistant.provide_answer(query))
0 commit comments