Skip to content

Commit cc32cdc

Browse files
authored
fix erd csv load update (#111)
* fix ERD
* update changelog
1 parent 5dfeda4 commit cc32cdc

File tree

3 files changed

+82
-30
lines changed

3 files changed

+82
-30
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
4848
- Fixed torch dependency version constraints in pyproject.toml
4949
- Fixed plotly installation in dependencies
5050
- Fixed detect entity networks prepare_model edge case handling
51+
- Fixed CSV file upload persistence in Extract Record Data workflow: uploaded CSV files and dataframes now persist when navigating between tabs by caching files and dataframes in session state and preserving the extraction mode selection
5152
- Fixed Detect Case Patterns to gracefully handle cases with no converging patterns, returning empty DataFrame instead of single NaN row
5253
- Added comprehensive unit tests for empty pattern detection scenario
5354
- Fixed Detect Case Patterns `compute_attribute_counts()` to handle missing columns gracefully with warning message

app/util/ui_components.py

Lines changed: 63 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -266,22 +266,31 @@ def multi_csv_uploader(
266266

267267
if f"{key}_uploader_index" not in st.session_state:
268268
st.session_state[f"{key}_uploader_index"] = str(random.randint(0, 100))
269+
if f"{key}_cached_files" not in st.session_state:
270+
st.session_state[f"{key}_cached_files"] = {}
271+
if f"{key}_cached_dfs" not in st.session_state:
272+
st.session_state[f"{key}_cached_dfs"] = {}
273+
269274
files = st.file_uploader(
270275
upload_label,
271276
type=["csv"],
272277
accept_multiple_files=True,
273278
key=key + "_file_uploader_" + st.session_state[f"{key}_uploader_index"],
274279
)
275-
file_names = [file.name for file in files] if files is not None else []
276-
uploaded_files_var.value = [v for v in uploaded_files_var.value if v in file_names]
277280
if files is not None:
281+
file_names = [file.name for file in files]
278282
for file in files:
279283
if file.name not in uploaded_files_var.value:
280-
uploaded_files_var.value.append(file.name)
284+
current_files = list(uploaded_files_var.value)
285+
current_files.append(file.name)
286+
uploaded_files_var.value = current_files
287+
st.session_state[f"{key}_cached_files"][file.name] = file
281288
last_selected_file = st.session_state.get(f"{key}_last_selected_file", None)
289+
last_selected_df = st.session_state.get(f"{key}_last_selected_df", None)
290+
282291
selected_file = st.selectbox(
283292
"Select a file to process",
284-
options=uploaded_files_var.value if files else [],
293+
options=uploaded_files_var.value,
285294
key=f"{key}_file_select",
286295
)
287296
changed = selected_file != last_selected_file
@@ -290,55 +299,80 @@ def multi_csv_uploader(
290299
with col1:
291300
encoding = st.selectbox(
292301
"File encoding",
293-
disabled=len(files) == 0,
302+
disabled=len(uploaded_files_var.value) == 0,
294303
options=FILE_ENCODING_OPTIONS,
295304
key=f"{key}_encoding_db",
296305
index=FILE_ENCODING_OPTIONS.index(st.session_state[f"{key}_encoding"]),
297306
)
298307
with col2:
299308
st.number_input(
300309
"Maximum rows to process (0 = all)",
301-
disabled=len(files) == 0,
310+
disabled=len(uploaded_files_var.value) == 0,
302311
min_value=0,
303312
step=1000,
304313
key=max_rows_var.key,
305314
)
306315
with col3:
307316
st.text("")
308317
st.text("")
309-
reload = st.button("Reload", key=f"{key}_reload", disabled=len(files) == 0)
318+
reload = st.button("Reload", key=f"{key}_reload", disabled=len(uploaded_files_var.value) == 0)
310319

311320
selected_df = pd.DataFrame()
312-
if selected_file not in [None, ""] or reload:
321+
cache_key = f"{selected_file}_{encoding}_{max_rows_var.value}"
322+
323+
if selected_file not in [None, ""] and (changed or reload or cache_key not in st.session_state[f"{key}_cached_dfs"]):
313324
st.session_state[f"{key}_encoding"] = encoding
314-
for file in files:
315-
if file.name == selected_file:
316-
selected_df = (
317-
pd.read_csv(
318-
file,
319-
encoding=encoding,
320-
nrows=max_rows_var.value,
321-
encoding_errors="ignore",
322-
low_memory=False,
323-
)
324-
if max_rows_var.value > 0
325-
else pd.read_csv(
326-
file,
327-
encoding=encoding,
328-
encoding_errors="ignore",
329-
low_memory=False,
330-
)
325+
file_to_read = None
326+
327+
if files is not None:
328+
for file in files:
329+
if file.name == selected_file:
330+
file_to_read = file
331+
break
332+
333+
if file_to_read is None and selected_file in st.session_state[f"{key}_cached_files"]:
334+
file_to_read = st.session_state[f"{key}_cached_files"][selected_file]
335+
336+
if file_to_read is not None:
337+
try:
338+
file_to_read.seek(0)
339+
except (AttributeError, OSError):
340+
pass
341+
342+
selected_df = (
343+
pd.read_csv(
344+
file_to_read,
345+
encoding=encoding,
346+
nrows=max_rows_var.value,
347+
encoding_errors="ignore",
348+
low_memory=False,
331349
)
332-
selected_df.columns = [
333-
clean_for_column_name(col) for col in selected_df.columns
334-
]
335-
break
350+
if max_rows_var.value > 0
351+
else pd.read_csv(
352+
file_to_read,
353+
encoding=encoding,
354+
encoding_errors="ignore",
355+
low_memory=False,
356+
)
357+
)
358+
selected_df.columns = [
359+
clean_for_column_name(col) for col in selected_df.columns
360+
]
361+
st.session_state[f"{key}_cached_dfs"][cache_key] = selected_df
362+
elif selected_file not in [None, ""] and cache_key in st.session_state[f"{key}_cached_dfs"]:
363+
selected_df = st.session_state[f"{key}_cached_dfs"][cache_key]
364+
elif (selected_file in [None, ""] or len(uploaded_files_var.value) == 0) and last_selected_df is not None and len(last_selected_df) > 0:
365+
selected_df = last_selected_df
366+
367+
if selected_df is not None and len(selected_df) > 0:
336368
st.dataframe(
337369
selected_df[:show_rows],
338370
hide_index=True,
339371
use_container_width=True,
340372
height=height,
341373
)
374+
st.session_state[f"{key}_last_selected_file"] = selected_file
375+
st.session_state[f"{key}_last_selected_df"] = selected_df
342376
changed = changed or reload
343377
return selected_file, selected_df, changed
344378

app/workflows/extract_record_data/workflow.py

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,24 @@ async def create(sv: variables.SessionVariables, workflow: None):
4444
st.warning("Prepare data schema to continue.")
4545
else:
4646
st.markdown("##### Record extraction controls")
47-
mode = st.radio("Mode", ["Extract from single text", "Extract from rows of CSV file"], horizontal=True)
47+
if f"{workflow}_extraction_mode" not in st.session_state:
48+
st.session_state[f"{workflow}_extraction_mode"] = 0 # 0 = single text, 1 = CSV
49+
50+
mode_index = st.radio(
51+
"Mode",
52+
["Extract from single text", "Extract from rows of CSV file"],
53+
horizontal=True,
54+
index=st.session_state[f"{workflow}_extraction_mode"],
55+
key=f"{workflow}_extraction_mode_radio"
56+
)
57+
58+
mode_index_map = {"Extract from single text": 0, "Extract from rows of CSV file": 1}
59+
reverse_map = {0: "Extract from single text", 1: "Extract from rows of CSV file"}
60+
if st.session_state[f"{workflow}_extraction_mode"] != mode_index_map[mode_index]:
61+
st.session_state[f"{workflow}_extraction_mode"] = mode_index_map[mode_index]
62+
st.rerun()
63+
mode = mode_index
64+
4865
input_texts = []
4966
if mode == "Extract from single text":
5067
st.text_area("Unstructured text input", key=sv.text_input.key, height=400)

0 commit comments

Comments (0)