scipy#
# For interactive plots
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import TeX
output_notebook()
A snapshot of scipy project development.
Issues#
import json
import numpy as np
from myst_nb import glue  # registers values under labels for the narrative text

query_date = np.datetime64("2020-01-01 00:00:00")

# Load data
with open("devstats-data/scipy_issues.json", "r") as fh:
    issues = [item["node"] for item in json.loads(fh.read())]

glue("scipy_query_date", str(query_date.astype("M8[D]")))
New issues#
newly_created = [
    iss for iss in issues if np.datetime64(iss["createdAt"]) > query_date
]
new_issues_closed = [iss for iss in newly_created if iss["state"] == "CLOSED"]

new_issue_lifetime = np.array(
    [
        np.datetime64(iss["closedAt"]) - np.datetime64(iss["createdAt"])
        for iss in new_issues_closed
    ],
).astype("m8[h]")  # in hours

glue("scipy_num_new_issues", len(newly_created))
glue("scipy_num_new_issues_closed", percent_val(len(new_issues_closed), len(newly_created)))
glue("scipy_new_issue_median_lifetime", f"{np.median(new_issue_lifetime)}")
1901 new issues have been opened since 2020-01-01, of which 1185 (62%) have been closed.
The median lifetime of new issues that were created and closed in this period is 92 hours.
import datetime

title = (
    f"Lifetime of issues created and closed in the last "
    f"{(np.datetime64(datetime.datetime.now()) - query_date).astype('m8[D]')}"
)
h, bedges = np.histogram(
    new_issue_lifetime.astype("m8[D]").astype(int), bins=np.arange(30)
)
p = figure(
    width=670,
    height=400,
    title=title,
    tooltips=[("lifetime", "@right days"), ("# issues", "@top")],
)
p.quad(top=h, bottom=0, left=bedges[:-1], right=bedges[1:])
p.xaxis.axis_label = "Issue lifetime (days)"
p.yaxis.axis_label = "# Issues"
show(p)
Time to response#
# Remove issues that are less than a day old for the following analysis
newly_created_day_old = [
    iss for iss in newly_created
    if np.datetime64(datetime.datetime.now()) - np.datetime64(iss["createdAt"])
    > np.timedelta64(1, "D")
]

# TODO: really need pandas here
commented_issues = [
    iss for iss in newly_created_day_old
    if any(
        e["node"]["__typename"] == "IssueComment"
        for e in iss["timelineItems"]["edges"]
    )
]

first_commenters, time_to_first_comment = [], []
for iss in commented_issues:
    for e in iss["timelineItems"]["edges"]:
        if e["node"]["__typename"] == "IssueComment":
            try:
                user = e["node"]["author"]["login"]
            except TypeError:
                # This can happen e.g. when a user deletes their GH acct
                user = "UNKNOWN"
            first_commenters.append(user)
            dt = np.datetime64(e["node"]["createdAt"]) - np.datetime64(iss["createdAt"])
            time_to_first_comment.append(dt.astype("m8[m]"))
            break  # Only want the first commenter

time_to_first_comment = np.array(time_to_first_comment)  # in minutes
median_time_til_first_response = np.median(time_to_first_comment.astype(int) / 60)  # in hours

cutoffs = [
    np.timedelta64(1, "h"),
    np.timedelta64(12, "h"),
    np.timedelta64(24, "h"),
    np.timedelta64(3, "D"),
    np.timedelta64(7, "D"),
    np.timedelta64(14, "D"),
]
num_issues_commented_by_cutoff = np.array(
    [np.sum(time_to_first_comment < cutoff) for cutoff in cutoffs]
)

# TODO: Update IssueComment query to include:
# - whether the commenter is a maintainer
# - datetime of comment
# This will allow analysis of what fraction of issues are addressed by
# maintainers vs. non-maintainers, and the distribution of how long an issue
# usually sits before it's at least commented on.

glue(
    "scipy_num_new_issues_responded",
    percent_val(len(commented_issues), len(newly_created_day_old)),
)
glue("scipy_new_issues_at_least_1_day_old", len(newly_created_day_old))
glue("scipy_median_response_time", f"{median_time_til_first_response:1.0f}")
Of the 1901 issues that are at least 24 hours old, 1621 (85%) have been commented on. The median time until an issue is first responded to is 3 hours.
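The cutoff comparisons in the cell above rely on NumPy's datetime64/timedelta64 arithmetic; a minimal standalone illustration (with made-up timestamps):

import numpy as np

# Subtracting two datetime64 values yields a timedelta64, which can be cast
# to a chosen unit and compared directly against timedelta64 cutoffs.
opened = np.datetime64("2020-03-01T09:00")
first_comment = np.datetime64("2020-03-01T11:30")
dt = (first_comment - opened).astype("m8[m]")  # 150 minutes
print(dt < np.timedelta64(1, "h"))   # False
print(dt < np.timedelta64(12, "h"))  # True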
title = f"Percentage of issues opened since {query_date} that are commented on within..."
x = [str(c) for c in cutoffs]
y = 100 * num_issues_commented_by_cutoff / len(newly_created_day_old)
p = figure(
    x_range=x,
    y_range=(0, 100),
    width=670,
    height=400,
    title=title,
    tooltips=[("%", "@top")],
)
p.vbar(x=x, top=y, width=0.8)
p.xaxis.axis_label = "Time interval"
p.yaxis.axis_label = "Percentage of issues commented on within interval"
show(p)
First responders#
import pandas as pd

first_commenter_tab = pd.DataFrame(
    {
        k: v
        for k, v in zip(
            ("Contributor", "# of times commented first"),
            np.unique(first_commenters, return_counts=True),
        )
    }
)
first_commenter_tab.sort_values(
    "# of times commented first", ascending=False
).head(10)
|     | Contributor     | # of times commented first |
|-----|-----------------|----------------------------|
| 282 | tupui           | 173 |
| 150 | ilayn           | 160 |
| 244 | rgommers        | 146 |
| 202 | mdhaber         | 140 |
| 82  | andyfaff        | 88  |
| 70  | WarrenWeckesser | 62  |
| 245 | rkern           | 57  |
| 283 | tylerjereddy    | 53  |
| 129 | ev-br           | 48  |
| 235 | pv              | 36  |
Pull requests#
with open("devstats-data/scipy_prs.json", "r") as fh:
    prs = [item["node"] for item in json.loads(fh.read())]

# Filters
#
# The following filters are applied to the PRs for the following analysis:
#
# - Only PRs to the default development branch (e.g. `main`)[^master_to_main]
#   are considered.
# - Only PRs from users with _active_ GitHub accounts are considered. For
#   example, if a user opened a Pull Request in 2016, but then deleted their
#   GitHub account in 2017, then this PR is excluded from the analysis.
# - PRs opened by dependabot are excluded.

# Only look at PRs to the main development branch - ignore backports,
# gh-pages, etc.
default_branches = {"main", "master"}  # Account for default branch update
prs = [pr for pr in prs if pr["baseRefName"] in default_branches]

# Drop data where PR author is unknown (e.g. github account no longer exists)
prs = [pr for pr in prs if pr["author"]]  # Failed author query results in None

# Filter out PRs by bots
bot_filter = {
    "dependabot-preview",
    "github-actions",
    "meeseeksmachine",
    "pre-commit-ci[bot]",
}
prs = [pr for pr in prs if pr["author"]["login"] not in bot_filter]
Merged PRs over time#
A look at merged PRs over time.
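The cell below smooths the monthly counts with a box-filter moving average via np.convolve; a quick standalone illustration of that trick on toy values:

import numpy as np

# A k-point moving average is a convolution with a box kernel of k ones,
# divided by k; mode="same" keeps the output aligned with the input.
h = np.array([0.0, 4.0, 8.0, 4.0, 0.0])
k = 4
print(np.convolve(h, np.ones(k), "same") / k)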
import os

import matplotlib.pyplot as plt

# All contributors
merged_prs = [pr for pr in prs if pr["state"] == "MERGED"]
merge_dates = np.array([pr["mergedAt"] for pr in merged_prs], dtype=np.datetime64)
binsize = np.timedelta64(30, "D")
date_bins = np.arange(merge_dates[0], merge_dates[-1], binsize)
h_all, bedges = np.histogram(merge_dates, date_bins)
bcenters = bedges[:-1] + binsize / 2
smoothing_interval = 4  # in units of bin-width

# First-time contributors
first_time_contributor = []
prev_contrib = set()
for record in merged_prs:
    try:
        author = record["author"]["login"]
    except TypeError:  # Author no longer has GitHub account
        first_time_contributor.append(None)
        continue
    if author not in prev_contrib:
        first_time_contributor.append(True)
        prev_contrib.add(author)
    else:
        first_time_contributor.append(False)
# Object dtype for handling None
first_time_contributor = np.array(first_time_contributor, dtype=object)
# Focus on first-time contributors
ftc_mask = first_time_contributor == True
ftc_dates = merge_dates[ftc_mask]
h_ftc, bedges = np.histogram(ftc_dates, date_bins)

fig, axes = plt.subplots(1, 2, figsize=(16, 8))
for ax, h, whom in zip(
    axes.ravel(), (h_all, h_ftc), ("all contributors", "first-time contributors")
):
    ax.bar(bcenters, h, width=binsize, label="Raw")
    ax.plot(
        bcenters,
        np.convolve(h, np.ones(smoothing_interval), "same") / smoothing_interval,
        label=f"{binsize * smoothing_interval} moving average",
        color="tab:orange",
        linewidth=2.0,
    )
    ax.set_title(whom)
    ax.legend()
fig.suptitle("Merged PRs from:")
axes[0].set_xlabel("Time")
axes[0].set_ylabel(f"# Merged PRs / {binsize} interval")
axes[1].set_ylim(axes[0].get_ylim())
fig.autofmt_xdate()

# TODO: Replace this with `glue` once the glue:figure directive supports
# alt-text
os.makedirs("thumbs", exist_ok=True)
plt.savefig("thumbs/scipy.png", bbox_inches="tight")

Pull request lifetime#
The following plot shows the "survival" of PRs over time, i.e. how many PRs remained open for at least a given number of days. This is split into merged PRs and PRs that are still open (PRs that were closed without being merged are currently not included).
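As a minimal sketch of the survival-count idea (toy durations, not the real data): after sorting the durations, the number of PRs that stayed open at least as long as the i-th smallest duration is just the count of remaining values, which is exactly how the step curves below are built.

import numpy as np

# Hypothetical durations in days, for illustration only
durations = np.array([30.0, 2.0, 5.0, 1.0, 2.0])
durations.sort()

# At the i-th smallest duration, len(durations) - i values are at least as large
survival = np.arange(1, len(durations) + 1)[::-1]
for d, n in zip(durations, survival):
    print(f"{n} PRs stayed open for at least {d} days")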
from dateutil.parser import isoparse

merged_prs = [pr for pr in prs if pr["state"] == "MERGED"]
lifetimes_merged = np.array(
    [isoparse(pr["mergedAt"]) - isoparse(pr["createdAt"]) for pr in merged_prs],
    dtype="m8[m]",
).view("int64") / (60 * 24)  # days
lifetimes_merged.sort()

# closed_prs = [pr for pr in prs if pr["state"] == "CLOSED"]
# lifetimes_closed = np.array(
#     [isoparse(pr["mergedAt"]) - isoparse(pr["createdAt"]) for pr in closed_prs],
#     dtype="m8[m]").view("int64") / (60 * 24)  # days
# lifetimes_closed.sort()

# Use the newest issue to guess a time when the data was generated.
# Can this logic be improved?
current_time = isoparse(max(iss["createdAt"] for iss in issues))
open_prs = [pr for pr in prs if pr["state"] == "OPEN"]
age_open = np.array(
    [current_time - isoparse(pr["createdAt"]) for pr in open_prs],
    dtype="m8[m]",
).view("int64") / (60 * 24)  # days
age_open.sort()

fig, ax = plt.subplots(figsize=(6, 4))
number_merged = np.arange(1, len(lifetimes_merged) + 1)[::-1]
ax.step(lifetimes_merged, number_merged, label="Merged")
# ax.step(lifetimes_closed, np.arange(1, len(lifetimes_closed) + 1)[::-1])
number_open = np.arange(1, len(age_open) + 1)[::-1]
ax.step(age_open, number_open, label="Open")

# Find the first point where open PRs have a larger survival count than
# merged PRs:
all_lifetimes = np.concatenate([lifetimes_merged, age_open])
all_lifetimes.sort()
number_merged_all_t = np.interp(all_lifetimes, lifetimes_merged, number_merged)
number_open_all_t = np.interp(all_lifetimes, age_open, number_open)
first_idx = np.argmax(number_merged_all_t < number_open_all_t)
first_time = all_lifetimes[first_idx]
ax.vlines(
    [first_time], 0, 1, transform=ax.get_xaxis_transform(), colors="k",
    zorder=0, linestyle="--",
)
ax.annotate(
    f"{round(first_time)} days",
    xy=(first_time, number_open_all_t[first_idx]),
    xytext=(5, 5), textcoords="offset points",
    va="bottom", ha="left",
)
ax.legend()
ax.set_xlabel("Time until merged or time open [days]")
ax.set_ylabel("# of PRs open this long or longer")
ax.set_xscale("log")
fig.autofmt_xdate()
fig.tight_layout();

Mergeability of open PRs#
import warnings

open_prs = [pr for pr in prs if pr["state"] == "OPEN"]

# The GraphQL query does not reliably return information on PR mergeability.
# Warn if there are problems.
if any(pr["mergeable"] == "UNKNOWN" for pr in open_prs):
    warnings.warn(
        (
            "\n\nThe data contains PRs with unknown merge status.\n"
            "Please re-download the data to get accurate info about PR mergeability."
        ),
        UserWarning,
        stacklevel=2,
    )

conflicting_prs = [
    isoparse(pr["createdAt"]) for pr in open_prs if pr["mergeable"] == "CONFLICTING"
]
mergeable_prs = [
    isoparse(pr["createdAt"]) for pr in open_prs if pr["mergeable"] == "MERGEABLE"
]

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(
    [conflicting_prs, mergeable_prs],
    bins="auto",
    histtype="bar",
    label=("conflicting", "mergeable"),
    color=("tab:red", "tab:blue"),
)
ax.legend()
ax.set_xlabel("Date of PR creation")
ax.set_ylabel("# of PRs")
fig.autofmt_xdate()
fig.tight_layout();
/opt/buildhome/python3.8/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3508: UserWarning:
The data contains PRs with unknown merge status.
Please re-download the data to get accurate info about PR mergeability.
exec(code_obj, self.user_global_ns, self.user_ns)

Number of PR participants#
# Get the lifetimes and number of participants for merged PRs
lifetimes = np.array(
    [isoparse(pr["mergedAt"]) - isoparse(pr["createdAt"]) for pr in merged_prs],
    dtype="m8[h]",
)
num_participants = np.array([pr["participants"]["totalCount"] for pr in merged_prs])

title = "Distribution of lifetimes for merged PRs based on the number of participants"
p = figure(width=600, height=300, title=title)
p.xgrid.grid_line_color = None
p.xaxis.ticker = sorted(np.unique(num_participants))
p.yaxis.axis_label = "PR lifetime (hours)"
p.scatter(x=num_participants, y=lifetimes.astype(int), size=9, alpha=0.4)
show(p)
Where contributions come from#
In total, 5818 merged PRs[1] were submitted by 1036 unique authors. Of these authors, 675 (65%) are "fly-by" contributors, i.e. users who have (so far) contributed to the project only once.
from collections import defaultdict

# Remap PRs by author
contributions_by_author = defaultdict(list)
for pr in merged_prs:
    author = pr["author"]["login"]
    contributions_by_author[author].append(pr)

num_merged_prs_per_author = np.array(
    [len(author_prs) for author_prs in contributions_by_author.values()]
)
num_flybys = np.sum(num_merged_prs_per_author == 1)

glue("scipy_num_merged_prs_with_known_authors", len(merged_prs))
glue("scipy_num_unique_authors_of_merged_prs", len(contributions_by_author))
glue("scipy_num_flyby", percent_val(num_flybys, len(num_merged_prs_per_author)))
title = "Distribution of number of merged PRs per contributor"
x = ["1", "2", "3", "4", "5", "6 - 10", "10 - 20", "20 - 50", "> 50"]
bedges = np.array([0, 1, 2, 3, 4, 5, 10, 20, 50, sum(num_merged_prs_per_author)]) + 0.5
y, _ = np.histogram(num_merged_prs_per_author, bins=bedges)
p = figure(
x_range=x,
y_range=(0, 1.05 * y.max()),
width=670,
height=400,
title=title,
tooltips=[(r"# PRs merged", "@x"), ("# contributors", f"@top")],
)
p.vbar(x=x, top=y, width=0.8)
p.xaxis.axis_label = "# Merged PRs per user"
p.yaxis.axis_label = "# of unique contributors with N PRs merged"
show(p)
Pony factor#
Another way of looking at these data is in terms of the pony factor, which is described as:
The minimum number of contributors whose total contribution constitutes the majority of the contributions.
For this analysis, we treat merged PRs as the measure of contribution. Considering all PRs merged over the lifetime of the project, the pony factor is: 16.
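As a toy illustration of the definition (hypothetical counts, not the scipy data): with merged-PR counts per contributor of 40, 30, 10, 10, 5 and 5, the majority threshold is half of the 100 total contributions, i.e. 50; the two largest contributors already account for 70, so the pony factor is 2. The cell below applies the same cumulative-sum logic to the real data.

import numpy as np

# Hypothetical merged-PR counts per contributor, sorted in descending order
counts = np.array([40, 30, 10, 10, 5, 5])
threshold = 0.5 * counts.sum()  # majority threshold: 50

# First head-count at which the cumulative contribution reaches the majority
pony_factor = np.searchsorted(np.cumsum(counts), threshold) + 1
print(pony_factor)  # 2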
# Sort by number of merged PRs in descending order
num_merged_prs_per_author.sort()
num_merged_prs_per_author = num_merged_prs_per_author[::-1]
num_merged_prs = num_merged_prs_per_author.sum()
pf_thresh = 0.5
pony_factor = np.searchsorted(
    np.cumsum(num_merged_prs_per_author), num_merged_prs * pf_thresh
) + 1

fig, ax = plt.subplots()
ax.plot(
    np.arange(len(num_merged_prs_per_author)) + 1,
    np.cumsum(num_merged_prs_per_author),
    ".",
)
ax.set_title("How the pony factor is calculated")
ax.set_xlabel("# unique contributors")
ax.set_xscale("log")
ax.set_ylabel("Cumulative sum of merged PRs / contributor")
ax.hlines(
    xmin=0,
    xmax=len(contributions_by_author),
    y=num_merged_prs * pf_thresh,
    color="tab:green",
    label=f"Pony factor threshold = {100 * pf_thresh:1.0f}%",
)
ax.legend();
glue("scipy_pony_factor", pony_factor)
