
Commit ce0b14a

Merge pull request #151 from AliSoua/main
add timeout to requests and optimize directory traversal
2 parents: 755e767 + 93b7abc

File tree: 1 file changed (+18, −6 lines)

utils/crawl_github_files.py

Lines changed: 18 additions & 6 deletions
@@ -144,7 +144,7 @@ def fetch_branches(owner: str, repo: str):
     """Get brancshes of the repository"""

     url = f"https://api.github.com/repos/{owner}/{repo}/branches"
-    response = requests.get(url, headers=headers)
+    response = requests.get(url, headers=headers, timeout=(30, 30))

     if response.status_code == 404:
         if not token:
@@ -165,7 +165,7 @@ def check_tree(owner: str, repo: str, tree: str):
     """Check the repository has the given tree"""

     url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{tree}"
-    response = requests.get(url, headers=headers)
+    response = requests.get(url, headers=headers, timeout=(30, 30))

     return True if response.status_code == 200 else False

@@ -216,7 +216,7 @@ def fetch_contents(path):
     url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
     params = {"ref": ref} if ref != None else {}

-    response = requests.get(url, headers=headers, params=params)
+    response = requests.get(url, headers=headers, params=params, timeout=(30, 30))

     if response.status_code == 403 and 'rate limit exceeded' in response.text.lower():
         reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
@@ -276,7 +276,7 @@ def fetch_contents(path):
         # For files, get raw content
         if "download_url" in item and item["download_url"]:
             file_url = item["download_url"]
-            file_response = requests.get(file_url, headers=headers)
+            file_response = requests.get(file_url, headers=headers, timeout=(30, 30))

             # Final size check in case content-length header is available but differs from metadata
             content_length = int(file_response.headers.get('content-length', 0))
@@ -292,7 +292,7 @@ def fetch_contents(path):
                 print(f"Failed to download {rel_path}: {file_response.status_code}")
         else:
             # Alternative method if download_url is not available
-            content_response = requests.get(item["url"], headers=headers)
+            content_response = requests.get(item["url"], headers=headers, timeout=(30, 30))
             if content_response.status_code == 200:
                 content_data = content_response.json()
                 if content_data.get("encoding") == "base64" and "content" in content_data:
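
In requests, a two-element timeout tuple sets the connection timeout and the read timeout separately (in seconds), so each of the requests.get calls above now raises requests.exceptions.Timeout after 30 seconds instead of hanging the crawl indefinitely on a stalled GitHub API connection. A minimal sketch of what that means for a caller; the wrapper name and error handling are illustrative, not part of this PR:

import requests

def get_with_timeout(url, headers=None):
    """Illustrative helper: give up after 30 s to connect and 30 s per read,
    mirroring the timeout=(30, 30) argument added throughout this PR."""
    try:
        return requests.get(url, headers=headers, timeout=(30, 30))
    except requests.exceptions.Timeout:
        # Without a timeout, this call could block forever on a dead connection.
        print(f"Request to {url} timed out")
        return None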
@@ -312,7 +312,19 @@ def fetch_contents(path):
                     print(f"Failed to get content for {rel_path}: {content_response.status_code}")

         elif item["type"] == "dir":
-            # Recursively process subdirectories
+            # OLD IMPLEMENTATION (comment this block to test new implementation)
+            # Always recurse into directories without checking exclusions first
+            # fetch_contents(item_path)
+
+            # NEW IMPLEMENTATION (uncomment this block to test optimized version)
+            # # Check if directory should be excluded before recursing
+            if exclude_patterns:
+                dir_excluded = any(fnmatch.fnmatch(item_path, pattern) or
+                                   fnmatch.fnmatch(rel_path, pattern) for pattern in exclude_patterns)
+                if dir_excluded:
+                    continue
+
+            # # Only recurse if directory is not excluded
             fetch_contents(item_path)

     # Start crawling from the specified path
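
The second change prunes excluded directories before recursing: rather than calling fetch_contents on every subdirectory and filtering files afterwards, the crawler first matches the directory path against exclude_patterns with fnmatch and skips the whole subtree on a match, which avoids one contents request per file under directories such as node_modules. A standalone sketch of that check, with an illustrative function name and example patterns not taken from the PR:

import fnmatch

def should_skip_dir(item_path, rel_path, exclude_patterns):
    """Illustrative: True when a directory matches any exclude pattern,
    so the crawler never spends API calls on that subtree."""
    if not exclude_patterns:
        return False
    return any(
        fnmatch.fnmatch(item_path, pattern) or fnmatch.fnmatch(rel_path, pattern)
        for pattern in exclude_patterns
    )

print(should_skip_dir("src/node_modules", "node_modules", ["*node_modules*"]))  # True
print(should_skip_dir("src/app", "app", ["*node_modules*"]))                    # False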
