import re
import csv
from collections import Counter

import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn

# Base URL of the published BOINC Census 2022 result files.
DATA_SOURCE_URL = 'https://raw.githubusercontent.com/TheSCInitiative/BOINC/main/BOINC_Census/2022/data_source/'
# Seconds to wait on the server before giving up; requests applies no timeout
# by default and would otherwise block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30


def _download_text(url: str) -> str:
    '''
    Fetch ``url`` and return the response body as text.

    Raises ``requests.HTTPError`` for 4xx/5xx responses so that an error page
    is never parsed as if it were census data.
    '''
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def _parse_rows(csv_text: str) -> list[list[str]]:
    '''Parse CSV text into a list of rows, skipping blank lines.'''
    return [row for row in csv.reader(csv_text.split('\n')) if row]


# raw_data holds every submission, data only the completed ones.
raw_data_str = _download_text(DATA_SOURCE_URL + 'BOINC_Census_2022_results_raw_cleaned.csv')
data_str = _download_text(DATA_SOURCE_URL + 'BOINC_Census_2022_results_completed.csv')
raw_data = _parse_rows(raw_data_str)
data = _parse_rows(data_str)
# Column titles come from the completed file; get_col() looks questions up here.
headers = data[0]

# Remove headers from data
raw_data = raw_data[1:]
data = data[1:]

# Eye candy settings
BINARY_COLOURS = ['#38CF62', '#E65050']  # first and second slice of the two-way pies
GENERAL_COLOURS = seaborn.color_palette("colorblind")
BOINC_BLUE = '#163E72'  # fill colour of every bar chart

def get_col(heading: str, data_source: list):
    '''
    Return the answers to one question, one entry per row of data_source.

    The column position is found by looking up heading in the module-level
    headers list; list.index raises ValueError when the heading is unknown.
    '''
    column = headers.index(heading)
    answers = []
    for record in data_source:
        answers.append(record[column])
    return answers

def multi_sel_to_hist(mult_sel_col: list[str]) -> list[str]:
    '''
    Flatten a multi-select column into one list of individual selections.

    Each cell holds the options a respondent picked, joined by commas. The
    returned list contains every picked option once per pick, in row order,
    so it can be fed straight into a Counter or histogram.

    Fillout does not perform escaping on internal commas so we have to handle it.
    Commas inside an option label are always followed by a space, while the
    separator between options is a bare comma, so ', ' is collapsed to ' '
    before splitting.
    '''
    selections: list[str] = []
    for cell in mult_sel_col:
        # extend() keeps this linear; summing lists re-copies the result per row.
        selections.extend(cell.replace(', ', ' ').split(','))
    return selections


            
              # Basic statistics: submission totals, form completion rate and BOINC usage.
print(f'Total Submissions: {len(raw_data)}')
print(f'Completed Submissions: {len(data)}')

fig, ax = plt.subplots(1, 2)
fig.set_figwidth(15)

used_boinc_counter = Counter(get_col('Do you or have you ever used BOINC?', data))

# One entry per pie, left to right: (slice sizes, slice labels, chart title).
pie_specs = [
    ([len(data), len(raw_data) - len(data)], ['Completed', 'Not Completed'], 'Form Completion Rate'),
    ([used_boinc_counter['Yes'], used_boinc_counter['No']], ['Yes', 'No'], 'Do you or have you ever used BOINC?'),
]
for axis, (sizes, slice_labels, heading) in zip(ax, pie_specs):
    axis.pie(sizes, labels=slice_labels, autopct='%.0f%%', colors=BINARY_COLOURS)
    axis.title.set_text(heading)
plt.show()

Total Submissions: 1119
Completed Submissions: 939


            
              # Demographics: gender and age of respondents.
fig, ax = plt.subplots(1, 2)
fig.set_figwidth(15)


def _bucket_order(item):
    '''
    Sort key for (label, count) pairs whose labels are numeric buckets.

    Buckets are ordered by their leading number (so '10-19' follows '2-9'),
    labels without a leading number come after them ordered by first
    character, and open-ended 'N+' buckets are put at the end. An empty label
    is tolerated instead of raising IndexError.
    '''
    label = item[0]
    leading_digits = re.match(r'\d+', label)
    return (
        label.endswith('+'),
        leading_digits is None,
        int(leading_digits.group()) if leading_digits else 0,
        label[:1],
    )


gender_groups = Counter(get_col('What gender do you identify as?', data)).items()
age_groups = Counter(get_col('What is your age?', data))
age_groups_sorted = sorted(age_groups.items(), key=_bucket_order)

ax[0].bar([label for label, _ in gender_groups], [count for _, count in gender_groups], color=BOINC_BLUE)
ax[1].bar([label for label, _ in age_groups_sorted], [count for _, count in age_groups_sorted], color=BOINC_BLUE)
ax[0].title.set_text('What gender do you identify as?')
ax[1].title.set_text('What is your age?')

plt.show()


            
              # Demographics: highest completed education and country of respondents.
fig, ax = plt.subplots(1, 2)
fig.set_figwidth(15)
edu_groups = Counter(get_col('What is your highest level of education you have completed?', data))
country_groups = Counter(get_col('What country are you from?', data))
# Take the top 5 sources to prevent clutter
country_groups = country_groups.most_common(5)
# Combine none and other. Reading a missing key from a Counter yields 0 but
# pop() without a default raises KeyError, so a default of 0 is passed in case
# nobody picked one of the two options.
edu_groups['None or Other'] = edu_groups.pop('No Education', 0) + edu_groups.pop('Other', 0)

# Short axis labels for the long answer texts; unlisted answers are shown as-is.
repl_dict = {
    'United States of America': 'US',
    'United Kingdom': 'UK',
    'Tertiary Education (postgraduate)': 'Uni Postgrad',
    'Tertiary Education (undergraduate)': 'Uni Undergrad',
    'Secondary Education': 'Secondary',
    'Primary Education': 'Primary',
}

edu_groups = edu_groups.items()

ax[0].bar([repl_dict.get(name, name) for name, _ in edu_groups], [count for _, count in edu_groups], color=BOINC_BLUE)
ax[1].bar([repl_dict.get(name, name) for name, _ in country_groups], [count for _, count in country_groups], color=BOINC_BLUE)
ax[0].title.set_text('What is your highest level of education?')
ax[1].title.set_text('What country are you from? (top-five)')

plt.show()


            
              # BOINC usage: number of computers, how many are mobile, and the mobile share.
fig, ax = plt.subplots(1, 3)
fig.set_figwidth(15)


def _bucket_order(item):
    '''
    Sort key for (label, count) pairs whose labels are numeric buckets.

    Buckets are ordered by their leading number (so '10' follows '2'),
    labels without a leading number come after them ordered by first
    character, and open-ended 'N+' buckets are put at the end. An empty label
    is tolerated instead of raising IndexError.
    '''
    label = item[0]
    leading_digits = re.match(r'\d+', label)
    return (
        label.endswith('+'),
        leading_digits is None,
        int(leading_digits.group()) if leading_digits else 0,
        label[:1],
    )


computers = Counter(get_col('How many computers do you have that run BOINC regularly?', data))
computers_sorted = sorted(computers.items(), key=_bucket_order)
mobiles = Counter(get_col('How many of the above computers are mobile devices? (mobile phones or tablets)', data))
mobiles_sorted = sorted(mobiles.items(), key=_bucket_order)

ax[0].bar([label for label, _ in computers_sorted], [count for _, count in computers_sorted], color=BOINC_BLUE)
ax[1].bar([label for label, _ in mobiles_sorted], [count for _, count in mobiles_sorted], color=BOINC_BLUE)
# Everyone who did not answer '0' mobile devices counts as running BOINC on mobile.
ax[2].pie([len(data) - mobiles['0'], mobiles['0']], labels=['BOINC on mobile', 'No BOINC mobile'], autopct='%.0f%%', colors=BINARY_COLOURS)
ax[0].title.set_text('How many devices do you have running BOINC?')
ax[1].title.set_text('How many are mobile devices?')
ax[2].title.set_text('BOINC on mobile rate')

plt.show()


            
              # BOINC usage: how often respondents run BOINC.
boinc_often = Counter(get_col('How often do you run BOINC?', data)).items()

# Short axis labels for the long answer texts; unlisted answers are shown as-is.
repl_dict = {
    'A few days of the week': 'Few days/wk',
    'All the time (24/7)': '24/7',
    'Most days of the week': 'Most days',
    'A few hours every week': '<1day/wk',
    "I don't use BOINC": 'N/A',
    "Only when I'm not using the computer": 'Only Idle',
}

often_labels = [repl_dict.get(answer, answer) for answer, _ in boinc_often]
often_counts = [count for _, count in boinc_often]
plt.bar(often_labels, often_counts, color=BOINC_BLUE)
plt.title('How often do you run BOINC?')
plt.show()


            
              # BOINC impact: perceived contribution impact and main electricity source.
fig, ax = plt.subplots(1, 2)
fig.set_figwidth(15)
contrib_impact = Counter(get_col('How much impact do you think your BOINC contribution is having?', data))
# Empty responses were recorded due to an error in the form. This occurred 13 times.
# The default stops pop() raising KeyError should the data hold no empty answers.
contrib_impact.pop('', None)
contrib_impact = contrib_impact.items()
energy_src = Counter(get_col('What source do you get most of your electricity from?', data)).items()

ax[0].bar([label for label, _ in contrib_impact], [count for _, count in contrib_impact], color=BOINC_BLUE)
# Drop the parenthesised part of each energy source so the axis labels stay short.
ax[1].bar([re.sub(r' \(.*\)', '', label) for label, _ in energy_src], [count for _, count in energy_src], color=BOINC_BLUE)
ax[0].title.set_text('What impact do you think your contribution is having?')
ax[1].title.set_text('What source do you get most of your electricity from?')

plt.show()


            
              # BOINC impact: financial and non-financial support given to projects.
fig, ax = plt.subplots(1, 2)
fig.set_figwidth(15)

# Descending order of the answer text; presumably so 'Yes' takes the first
# BINARY_COLOURS entry and 'No' the second.
financially = sorted(
    Counter(get_col('Have you ever supported a BOINC project financially?', data)).items(),
    reverse=True,
)
non_financially = sorted(
    Counter(get_col('Aside from running BOINC, have you ever supported a BOINC project non-financially? (eg, donating computer hardware, volunteering time)', data)).items(),
    reverse=True,
)

for axis, answers, heading in (
    (ax[0], financially, 'Supported a BOINC project financially'),
    (ax[1], non_financially, 'Supported a BOINC project non-financially'),
):
    axis.pie([count for _, count in answers], labels=[answer for answer, _ in answers], autopct='%.0f%%', colors=BINARY_COLOURS)
    axis.title.set_text(heading)

plt.show()


            
              # BOINC origins: when and how respondents first heard about BOINC.
fig, ax = plt.subplots(1, 2)
fig.set_figwidth(15)
when_boinc = sorted(Counter(get_col('What year did you first hear about BOINC?', data)).items())
where_boinc = Counter(get_col('How did you first hear about BOINC?', data))
# Take the top 5 sources to prevent clutter
where_boinc_top = where_boinc.most_common(5)
# Short axis labels for the long answer texts; unlisted answers are shown as-is.
repl_dict = {
    'Via a Google search or other search engine': 'Web search',
    'From a news article (both print or digital)': 'News',
    'Word-of-mouth/a friend told me': 'Friend',
}

# The first two characters of each year answer are dropped to keep the labels narrow.
year_labels = [year[2:] for year, _ in when_boinc]
year_counts = [count for _, count in when_boinc]
source_labels = [repl_dict.get(source, source) for source, _ in where_boinc_top]
source_counts = [count for _, count in where_boinc_top]

ax[0].bar(year_labels, year_counts, color=BOINC_BLUE)
ax[1].bar(source_labels, source_counts, color=BOINC_BLUE)
ax[0].title.set_text('What year did you first hear about BOINC?')
ax[1].title.set_text('How did you first hear about BOINC? (top-five)')

plt.show()


            
              # Where respondents look for help and how they want to hear from projects.
fig, ax = plt.subplots(1, 2)
fig.set_figwidth(15)
where_help = Counter(multi_sel_to_hist(get_col('Where do you seek help for problems you have with running BOINC?', data)))
where_hear = Counter(multi_sel_to_hist(get_col('In what ways would you like to hear about what BOINC projects are doing?', data)))
# Take the top 5 options to prevent clutter
where_help_top = where_help.most_common(5)
where_hear_top = where_hear.most_common(5)

# Short axis labels for the long option texts; unlisted options are shown as-is.
repl_dict = {
    'BOINC project forums/message boards': 'Proj. forums',
    'Google search or other search engine': 'Web search',
    'The official BOINC forum/message board': 'BOINC forum',
    'The BOINC website': 'BOINC website',
    'Chat groups like Discord or Telegram': 'Chat groups',
    'Via the BOINC manager': 'BOINC manager',
    'Via a web page or dashboard': 'Web page',
    'Email notifications': 'Email',
    'Forum/message board posts': 'Forum post',
}

for axis, options, heading in (
    (ax[0], where_help_top, 'Where do you seek help for BOINC? (top-five)'),
    (ax[1], where_hear_top, 'How do you want to hear about BOINC? (top-five)'),
):
    axis.bar([repl_dict.get(option, option) for option, _ in options], [count for _, count in options], color=BOINC_BLUE)
    axis.title.set_text(heading)

plt.show()


            
              # What respondents value in a project and which sciences they want to support.
fig, ax = plt.subplots(1, 2)
fig.set_figwidth(15)
aspects = Counter(multi_sel_to_hist(get_col('What are the most important aspects of a BOINC project?', data)))
science = Counter(multi_sel_to_hist(get_col('What fields of science would you like to support with BOINC?', data)))
# Take the top 5 options to prevent clutter
aspects_top = aspects.most_common(5)
science_top = science.most_common(5)

# Short axis labels for the long option texts; unlisted options are shown as-is.
# Keys carry no commas because multi_sel_to_hist replaces ', ' with ' '.
repl_dict = {
    'The work they do is important and beneficial to the world': 'Work impact',
    'Regularly publishes results/academic papers': 'Freq. results',
    'Active project staff': 'Active staff',
    'Space astronomy and astrophysics': 'Space',
    'Open-source (both code and results)': 'Open-source',
    'Health science medicine anatomy and the human body': 'Anatomy',
    'Biology microbiology agriculture and living organisms': 'Biology',
    'Theoretical particle nuclear and quantum physics': 'Physics',
    'Mathematics and computer science': 'Maths',
}

aspect_labels = [repl_dict.get(option, option) for option, _ in aspects_top]
aspect_counts = [count for _, count in aspects_top]
science_labels = [repl_dict.get(option, option) for option, _ in science_top]
science_counts = [count for _, count in science_top]

ax[0].bar(aspect_labels, aspect_counts, color=BOINC_BLUE)
ax[1].bar(science_labels, science_counts, color=BOINC_BLUE)
ax[0].title.set_text('What are the important aspects of a project? (top-five)')
ax[1].title.set_text('What fields of science do you want to support? (top-five)')

plt.show()


            
              # Project administration: interest in creating a project and the main obstacles.
fig, ax = plt.subplots(1, 2)
fig.set_figwidth(15)
# Descending order of the answer text; presumably so 'Yes' takes the first
# BINARY_COLOURS entry and 'No' the second.
boinc_proj = sorted(
    Counter(get_col('Have you ever created a BOINC project or wanted to create one?', data)).items(),
    reverse=True,
)
boinc_diff = Counter(multi_sel_to_hist(get_col('What are the biggest difficulties in making a BOINC project?', data)))
# This was a hidden question so we got a lot of empty results.
# The default stops pop() raising KeyError should every row hold an answer.
boinc_diff.pop('', None)
# Take the top 5 sources to prevent clutter
boinc_diff = boinc_diff.most_common(5)
# Short axis labels for the long option texts; unlisted options are shown as-is.
repl_dict = {
    'Technical skills': 'Tech skills',
    'Time commitments': 'Time',
    'Server hosting/Computer hardware': 'Hosting',
    'Inadequate documentation': 'Documentation',
    'Finding applicable data-set': 'Data',
}

ax[0].pie([count for _, count in boinc_proj], labels=[answer for answer, _ in boinc_proj], autopct='%.0f%%', colors=BINARY_COLOURS)
ax[1].bar([repl_dict.get(option, option) for option, _ in boinc_diff], [count for _, count in boinc_diff], color=BOINC_BLUE)
ax[0].title.set_text('Have you ever wanted to start a BOINC project?')
ax[1].title.set_text('What are the biggest difficulties? (top-five)')

plt.show()


            
              # Miscellaneous: attitude to paid crunching and awareness of reward schemes.
fig, ax = plt.subplots(1, 2)
fig.set_figwidth(15)

compensation = Counter(get_col('Would you use BOINC more if you were compensated for your crunching?', data)).items()
# Descending order of the answer text; presumably so 'Yes' takes the first
# BINARY_COLOURS entry and 'No' the second.
reward_projects = sorted(
    Counter(get_col('Are you aware of any organizations, projects or blockchains that reward people for crunching BOINC?', data)).items(),
    reverse=True,
)

compensation_sizes = [count for _, count in compensation]
compensation_labels = [answer for answer, _ in compensation]
reward_sizes = [count for _, count in reward_projects]
reward_labels = [answer for answer, _ in reward_projects]

ax[0].pie(compensation_sizes, labels=compensation_labels, autopct='%.0f%%', colors=GENERAL_COLOURS)
ax[1].pie(reward_sizes, labels=reward_labels, autopct='%.0f%%', colors=BINARY_COLOURS)
ax[0].title.set_text('Would you BOINC more if you were paid?')
ax[1].title.set_text('Do you know of ways to be paid for crunching?')

plt.show()


            
              # Miscellaneous: other distributed computing platforms respondents use.
fig, ax = plt.subplots(1, 1)

other_dcn = Counter(multi_sel_to_hist(get_col('Do you use any other distributed computing platforms?', data))).items()

platform_names = [platform for platform, _ in other_dcn]
platform_counts = [count for _, count in other_dcn]
ax.bar(platform_names, platform_counts, color=BOINC_BLUE)
ax.title.set_text('Do you use any other distributed computing platforms?')

plt.show()

Results of the BOINC Census 2022

Introduction

Basic Statistics

Demographics

BOINC Usage

BOINC Impact

BOINC Origins

Project Administration

Miscellaneous

Conclusion

Summary

Improvements for the next Census

Get ready for the next Census in November 2023!

Links