diff --git a/pypistats/templates/package.html b/pypistats/templates/package.html index 2021003..78c7747 100644 --- a/pypistats/templates/package.html +++ b/pypistats/templates/package.html @@ -74,6 +74,8 @@

{{ package }}


Downloads last month: {{ "{:,.0f}".format(recent['month']) }} +
+ 7-day smoothing

{% endblock %} diff --git a/pypistats/views/general.py b/pypistats/views/general.py index def523f..81c3b21 100644 --- a/pypistats/views/general.py +++ b/pypistats/views/general.py @@ -107,7 +107,7 @@ def package_page(package): recent[r.category] = r.downloads # PyPI metadata - metadata = None + metadata = dict() if package != "__all__": try: metadata = requests.get(f"https://pypi.python.org/pypi/{package}/json", timeout=5).json() @@ -139,8 +139,13 @@ def package_page(package): else: metrics = ["downloads", "percentages"] + use_smoothing = metadata['use_smoothing'] = request.args.get('smooth', None) is not None for metric in metrics: - model_data.append({"metric": metric, "name": model.__tablename__, "data": data_function[metric](records)}) + model_data.append({ + "metric": metric, + "name": model.__tablename__, + "data": data_function[metric](records, use_smoothing=use_smoothing), + }) # Build the plots plots = [] @@ -191,7 +196,20 @@ def package_page(package): return render_template("package.html", package=package, plots=plots, metadata=metadata, recent=recent, user=g.user) -def get_download_data(records): +def smooth_data(data, window=7): + # Ensure data is sorted by date + data["x"], data["y"] = zip(*[(x, y) for x, y in sorted( + zip(data["x"], data["y"]), key=lambda pair: pair[0])]) + # Smooth data with a trailing window, so recent days are as accurate as possible + smoothed_data = deepcopy(data) + smoothed_data["y"] = list(smoothed_data["y"]) + for i in range(window, len(data["y"])): + window_data = data["y"][max(0, i - window):i] + smoothed_data["y"][i] = sum(window_data) / len(window_data) + return smoothed_data + + +def get_download_data(records, use_smoothing=False): """Organize the data for the absolute plots.""" data = defaultdict(lambda: {"x": [], "y": []}) @@ -241,54 +259,33 @@ def get_download_data(records): if category not in date_categories: data[category]["x"].append(str(records[-1].date)) data[category]["y"].append(0) - return data - - -def get_proportion_data(records): - """Organize the data for the fill plots.""" - data = defaultdict(lambda: {"x": [], "y": [], "text": []}) - - date_categories = defaultdict(lambda: 0) - all_categories = [] - - prev_date = records[0].date - - for record in records: - if record.category not in all_categories: - all_categories.append(record.category) - - all_categories = sorted(all_categories) - for category in all_categories: - data[category] # set the dict value (keeps it ordered) - for record in records: - if record.date != prev_date: + if use_smoothing: + # Smooth data using a 7-day window + for category in all_categories: + data[category] = smooth_data(data[category]) - total = sum(date_categories.values()) / 100 - for category in all_categories: - data[category]["x"].append(str(prev_date)) - value = date_categories[category] / total - data[category]["y"].append(value) - data[category]["text"].append("{0:.2f}%".format(value) + " = {:,}".format(date_categories[category])) + return data - date_categories = defaultdict(lambda: 0) - prev_date = record.date - # Track categories for this date - date_categories[record.category] = record.downloads - else: - # Fill in missing final date with zeros - total = sum(date_categories.values()) / 100 - for category in all_categories: - if category not in date_categories: - data[category]["x"].append(str(records[-1].date)) - data[category]["y"].append(0) - data[category]["text"].append("{0:.2f}%".format(0) + " = {:,}".format(0)) - else: - data[category]["x"].append(str(records[-1].date)) - value = date_categories[category] / total - data[category]["y"].append(value) - data[category]["text"].append("{0:.2f}%".format(value) + " = {:,}".format(date_categories[category])) +def get_proportion_data(records, use_smoothing=False): + """Organize the data for the fill plots.""" + # Get the absolute numbers as a starting point, to handle fills etc. + # Note that this means we smooth *then* calculate proportions, which + # is the correct order to avoid inflating random noise. + data = get_download_data(records, use_smoothing=use_smoothing) + + # Calculate the per-day sum of all y-values to divide by. + all_ys = [category_values["y"] for category_values in data.values()] + totals = [sum(chunk) or 1 for chunk in zip(*all_ys)] + + # and finally divide each category by totals and add detailed labels + for category_values in data.values(): + ys = category_values["y"] + category_values["y"] = [y / t for y, t in zip(ys, totals)] + category_values["text"] = [ + "{:.2f}% = {:,}".format(p, a) for p, a in zip(ys, category_values["y"]) + ] return data