cmark

My personal build of CMark ✏️

statistics.py (17859B)

##  Module statistics.py
##
##  Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>.
##
##  Licensed under the Apache License, Version 2.0 (the "License");
##  you may not use this file except in compliance with the License.
##  You may obtain a copy of the License at
##
##  http://www.apache.org/licenses/LICENSE-2.0
##
##  Unless required by applicable law or agreed to in writing, software
##  distributed under the License is distributed on an "AS IS" BASIS,
##  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
##  See the License for the specific language governing permissions and
##  limitations under the License.


"""
Basic statistics module.

This module provides functions for calculating statistics of data, including
averages, variance, and standard deviation.

Calculating averages
--------------------

==================  =============================================
Function            Description
==================  =============================================
mean                Arithmetic mean (average) of data.
median              Median (middle value) of data.
median_low          Low median of data.
median_high         High median of data.
median_grouped      Median, or 50th percentile, of grouped data.
mode                Mode (most common value) of data.
==================  =============================================

Calculate the arithmetic mean ("the average") of data:

>>> mean([-1.0, 2.5, 3.25, 5.75])
2.625


Calculate the standard median of discrete data:

>>> median([2, 3, 4, 5])
3.5


Calculate the median, or 50th percentile, of data grouped into class intervals
centred on the data values provided. E.g. if your data points are rounded to
the nearest whole number:

>>> median_grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
2.8333333333...

This should be interpreted in this way: you have two data points in the class
interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
the class interval 3.5-4.5. The median of these data points is 2.8333...


Calculating variability or spread
---------------------------------

==================  =============================================
Function            Description
==================  =============================================
pvariance           Population variance of data.
variance            Sample variance of data.
pstdev              Population standard deviation of data.
stdev               Sample standard deviation of data.
==================  =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75])  #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.
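
For example, ``mean`` raises it when given no data (an illustrative doctest
added in this build; the message is the one used by ``mean`` below):

>>> mean([])  #doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
  ...
StatisticsError: mean requires at least one data point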

"""

__all__ = [ 'StatisticsError',
            'pstdev', 'pvariance', 'stdev', 'variance',
            'median',  'median_low', 'median_high', 'median_grouped',
            'mean', 'mode',
          ]


import collections
import math

from fractions import Fraction
from decimal import Decimal


# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data, start=0):
    """_sum(data [, start]) -> value

    Return a high-precision sum of the given numeric data. If optional
    argument ``start`` is given, it is added to the total. If ``data`` is
    empty, ``start`` (defaulting to 0) is returned.


    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
    11.0

    Some sources of round-off error will be avoided:

    >>> _sum([1e50, 1, -1e50] * 1000)  # Built-in sum returns zero.
    1000.0

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    Fraction(63, 20)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    Decimal('0.6963')

    Mixed types are currently treated as an error, except that int is
    allowed.
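
    For example, mixing float and Fraction data is rejected (an illustrative
    doctest added in this build):

    >>> _sum([0.5, F(1, 2)])  #doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
      ...
    TypeError: unsupported mixed types: ...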
    """
    # We fail as soon as we reach a value that is not an int or the type of
    # the first value which is not an int. E.g. _sum([int, int, float, int])
    # is okay, but _sum([int, int, float, Fraction]) is not.
    allowed_types = set([int, type(start)])
    n, d = _exact_ratio(start)
    partials = {d: n}  # map {denominator: sum of numerators}
    # Micro-optimizations.
    exact_ratio = _exact_ratio
    partials_get = partials.get
    # Add numerators for each denominator.
    for x in data:
        _check_type(type(x), allowed_types)
        n, d = exact_ratio(x)
        partials[d] = partials_get(d, 0) + n
    # Find the expected result type. If allowed_types has only one item, it
    # will be int; if it has two, use the one which isn't int.
    assert len(allowed_types) in (1, 2)
    if len(allowed_types) == 1:
        assert allowed_types.pop() is int
        T = int
    else:
        T = (allowed_types - set([int])).pop()
    if None in partials:
        assert issubclass(T, (float, Decimal))
        assert not math.isfinite(partials[None])
        return T(partials[None])
    total = Fraction()
    for d, n in sorted(partials.items()):
        total += Fraction(n, d)
    if issubclass(T, int):
        assert total.denominator == 1
        return T(total.numerator)
    if issubclass(T, Decimal):
        return T(total.numerator)/total.denominator
    return T(total)


def _check_type(T, allowed):
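    # Editorial note (added in this build): with the default int ``start``,
    # the first non-int type seen is adopted into ``allowed``; any later
    # value of a different type then triggers the TypeError below.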
    if T not in allowed:
        if len(allowed) == 1:
            allowed.add(T)
        else:
            types = ', '.join([t.__name__ for t in allowed] + [T.__name__])
            raise TypeError("unsupported mixed types: %s" % types)


def _exact_ratio(x):
    """Convert Real number x exactly to (numerator, denominator) pair.

    >>> _exact_ratio(0.25)
    (1, 4)

    x is expected to be an int, Fraction, Decimal or float.
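
    Non-finite values are returned with a ``None`` denominator, since the
    code below returns ``(x, None)`` for an INF or NAN (illustrative example
    added in this build):

    >>> _exact_ratio(float('inf'))
    (inf, None)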
    """
    try:
        try:
            # int, Fraction
            return (x.numerator, x.denominator)
        except AttributeError:
            # float
            try:
                return x.as_integer_ratio()
            except AttributeError:
                # Decimal
                try:
                    return _decimal_to_ratio(x)
                except AttributeError:
                    msg = "can't convert type '{}' to numerator/denominator"
                    raise TypeError(msg.format(type(x).__name__)) from None
    except (OverflowError, ValueError):
        # INF or NAN
        if __debug__:
            # Decimal signalling NANs cannot be converted to float :-(
            if isinstance(x, Decimal):
                assert not x.is_finite()
            else:
                assert not math.isfinite(x)
        return (x, None)


# FIXME This is faster than Fraction.from_decimal, but still too slow.
def _decimal_to_ratio(d):
    """Convert Decimal d to exact integer ratio (numerator, denominator).

    >>> from decimal import Decimal
    >>> _decimal_to_ratio(Decimal("2.6"))
    (26, 10)

    """
    sign, digits, exp = d.as_tuple()
    if exp in ('F', 'n', 'N'):  # INF, NAN, sNAN
        assert not d.is_finite()
        raise ValueError
    num = 0
    for digit in digits:
        num = num*10 + digit
    if exp < 0:
        den = 10**-exp
    else:
        num *= 10**exp
        den = 1
    if sign:
        num = -num
    return (num, den)


def _counts(data):
    # Generate a table of sorted (value, frequency) pairs.
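    # Illustrative sketch (editorial addition): _counts([1, 1, 2, 3, 3])
    # keeps every value tied for the highest frequency, together with its
    # count, e.g. [(1, 2), (3, 2)]; the order of tied values may vary.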
    table = collections.Counter(iter(data)).most_common()
    if not table:
        return table
    # Extract the values with the highest frequency.
    maxfreq = table[0][1]
    for i in range(1, len(table)):
        if table[i][1] != maxfreq:
            table = table[:i]
            break
    return table


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    return _sum(data)/n


# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        i = n//2
        return (data[i - 1] + data[i])/2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n%2 == 1:
        return data[n//2]
    else:
        return data[n//2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n//2]


def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n//2]
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval/2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval)/2
    cf = data.index(x)  # Number of values below the median interval.
    # FIXME The following line could be more efficient for big lists.
    f = data.count(x)  # Number of data points in the median interval.
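    # Standard grouped-median interpolation (editorial comment): the result
    # below is L + interval*(n/2 - cf)/f, i.e. the lower limit of the median
    # class plus the fraction of that class needed to reach the n/2-th value.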
    return L + interval*(n/2 - cf)/f


def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there is not exactly one most common value, ``mode`` will raise
    StatisticsError.
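
    For example (an illustrative doctest added in this build; the message
    comes from the code below):

    >>> mode([1, 1, 2, 2])  #doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
      ...
    StatisticsError: no unique mode; found 2 equally common values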
    """
    # Generate a table of sorted (value, frequency) pairs.
    table = _counts(data)
    if len(table) == 1:
        return table[0][0]
    elif table:
        raise StatisticsError(
                'no unique mode; found %d equally common values' % len(table)
                )
    else:
        raise StatisticsError('no mode for empty data')


# === Measures of spread ===

# See http://mathworld.wolfram.com/Variance.html
#     http://mathworld.wolfram.com/SampleVariance.html
#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
#
# Under no circumstances use the so-called "computational formula for
# variance", as that is only suitable for hand calculations with a small
# amount of low-precision data. It has terrible numeric properties.
#
# See a comparison of three computational methods here:
# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
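#
# Illustrative sketch of the problem (editorial addition): for data such as
# [1e9 + 4, 1e9 + 7, 1e9 + 13, 1e9 + 16], the "computational" form
# sum(x*x)/n - mean(x)**2 subtracts two nearly equal quantities of size
# ~1e18, so plain float arithmetic loses most of the significant digits of
# the true result (22.5 here). The two-pass approach in _ss below, which
# sums (x - mean)**2 directly, avoids that cancellation.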

def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.
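
    A quick illustrative example (added in this build):

    >>> _ss([1, 4, 7, 10])  # deviations are -4.5, -1.5, 1.5, 4.5
    45.0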
    """
    if c is None:
        c = mean(data)
    ss = _sum((x-c)**2 for x in data)
    # The following sum should mathematically equal zero, but due to rounding
    # error may not.
    ss -= _sum((x-c) for x in data)**2/len(data)
    assert not ss < 0, 'negative sum of square deviations: %f' % ss
    return ss


def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values. The optional argument xbar, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function when your data is a sample from a population. To
    calculate the variance from the entire population, see ``pvariance``.

    Examples:

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    If you have already calculated the mean of your data, you can pass it as
    the optional second argument ``xbar`` to avoid recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    This function does not check that ``xbar`` is actually the mean of
    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
    impossible results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    ss = _ss(data, xbar)
    return ss/(n-1)


def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be an iterable of Real-valued numbers, with at least one
    value. The optional argument mu, if given, should be the mean of
    the data. If it is missing or None, the mean is automatically calculated.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    Examples:

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    If you have already calculated the mean of the data, you can pass it as
    the optional second argument to avoid recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    This function does not check that ``mu`` is actually the mean of ``data``.
    Giving arbitrary values for ``mu`` may lead to invalid or impossible
    results.

    Decimals and Fractions are supported:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    ss = _ss(data, mu)
    return ss/n


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    var = variance(data, xbar)
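    # Editorial note: of the result types this module produces, only Decimal
    # has its own sqrt() method; everything else (including Fraction) falls
    # through to math.sqrt and therefore comes back as a float.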
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    var = pvariance(data, mu)
    try:
        return var.sqrt()
    except AttributeError:
        return math.sqrt(var)