Skip to content

Commit b30a1ff

Browse files
committedDec 1, 2016
[FEATURE][processing] New universal 'basic stats for field' algorithm
Replaces the existing 'Basic Stats for Numeric Fields' and 'Basic Stats for String Fields' algorithms and adds support for date/time/datetime fields. Having a single unified algorithm allows more flexible models where a field type may not be known in advance. Deprecates the existing basic stats algorithms.
1 parent 1ff165a commit b30a1ff

13 files changed

+383
-11
lines changed
 

‎python/plugins/processing/algs/help/qgis.yaml

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,10 @@ qgis:advancedpythonfieldcalculator: >
2222

2323
qgis:barplot:
2424

25+
qgis:basicstatisticsforfields: >
26+
This algorithm generates basic statistics from the analysis of the values in a field in the attribute table of a vector layer. Numeric, date, time and string fields are supported.
2527

26-
qgis:basicstatisticsfornumericfields: >
27-
This algorithm generates basic statistics from the analysis of a numeric field in the attribute table of a vector layer.
28-
29-
Statistics are generated as an HTML file.
30-
31-
qgis:basicstatisticsfortextfields: >
32-
This algorithm generates basic statistics from the analysis of a text field in the attribute table of a vector layer.
28+
The statistics returned will depend on the field type.
3329

3430
Statistics are generated as an HTML file.
3531

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
# -*- coding: utf-8 -*-
2+
3+
"""
4+
***************************************************************************
5+
BasicStatistics.py
6+
---------------------
7+
Date : November 2016
8+
Copyright : (C) 2016 by Nyall Dawson
9+
Email : nyall dot dawson at gmail dot com
10+
***************************************************************************
11+
* *
12+
* This program is free software; you can redistribute it and/or modify *
13+
* it under the terms of the GNU General Public License as published by *
14+
* the Free Software Foundation; either version 2 of the License, or *
15+
* (at your option) any later version. *
16+
* *
17+
***************************************************************************
18+
"""
19+
20+
__author__ = 'Nyall Dawson'
21+
__date__ = 'November 2016'
22+
__copyright__ = '(C) 2016, Nyall Dawson'
23+
24+
# This will get replaced with a git SHA1 when you do a git archive
25+
26+
__revision__ = '$Format:%H$'
27+
28+
import os
29+
import codecs
30+
31+
from qgis.PyQt.QtCore import QVariant
32+
from qgis.PyQt.QtGui import QIcon
33+
34+
from qgis.core import (QgsStatisticalSummary,
35+
QgsStringStatisticalSummary,
36+
QgsDateTimeStatisticalSummary,
37+
QgsFeatureRequest)
38+
39+
from processing.core.GeoAlgorithm import GeoAlgorithm
40+
from processing.core.parameters import ParameterTable
41+
from processing.core.parameters import ParameterTableField
42+
from processing.core.outputs import OutputHTML
43+
from processing.core.outputs import OutputNumber
44+
from processing.tools import dataobjects, vector
45+
46+
47+
pluginPath = os.path.split(os.path.split(os.path.dirname(__file__))[0])[0]
48+
49+
50+
class BasicStatisticsForField(GeoAlgorithm):
    """Processing algorithm reporting basic statistics for one table field.

    The summary class is chosen from the field itself: numeric fields use
    QgsStatisticalSummary, date/time/datetime fields use
    QgsDateTimeStatisticalSummary and every other field is handled by
    QgsStringStatisticalSummary. The statistics are published as algorithm
    outputs and also written to an HTML report.
    """

    # Parameter / report output identifiers
    INPUT_LAYER = 'INPUT_LAYER'
    FIELD_NAME = 'FIELD_NAME'
    OUTPUT_HTML_FILE = 'OUTPUT_HTML_FILE'

    # Statistic output identifiers (which ones get set depends on field type)
    MIN = 'MIN'
    MAX = 'MAX'
    COUNT = 'COUNT'
    UNIQUE = 'UNIQUE'
    EMPTY = 'EMPTY'
    FILLED = 'FILLED'
    MIN_LENGTH = 'MIN_LENGTH'
    MAX_LENGTH = 'MAX_LENGTH'
    MEAN_LENGTH = 'MEAN_LENGTH'
    CV = 'CV'
    SUM = 'SUM'
    MEAN = 'MEAN'
    STD_DEV = 'STD_DEV'
    RANGE = 'RANGE'
    MEDIAN = 'MEDIAN'
    MINORITY = 'MINORITY'
    MAJORITY = 'MAJORITY'
    FIRSTQUARTILE = 'FIRSTQUARTILE'
    THIRDQUARTILE = 'THIRDQUARTILE'
    IQR = 'IQR'

    def getIcon(self):
        """Return the icon shown for this algorithm."""
        return QIcon(os.path.join(pluginPath, 'images', 'ftools', 'basic_statistics.png'))

    def defineCharacteristics(self):
        """Declare the algorithm name, group, tags, parameters and outputs."""
        self.name, self.i18n_name = self.trAlgorithm('Basic statistics for fields')
        self.group, self.i18n_group = self.trAlgorithm('Vector table tools')
        self.tags = self.tr('stats,statistics,date,time,datetime,string,number,text,table,layer,maximum,minimum,mean,average,standard,deviation,'
                            'count,distinct,unique,variance,median,quartile,range,majority,minority')

        self.addParameter(ParameterTable(self.INPUT_LAYER,
                                         self.tr('Input table')))
        self.addParameter(ParameterTableField(self.FIELD_NAME,
                                              self.tr('Field to calculate statistics on'),
                                              self.INPUT_LAYER))

        self.addOutput(OutputHTML(self.OUTPUT_HTML_FILE,
                                  self.tr('Statistics')))

        self.addOutput(OutputNumber(self.COUNT, self.tr('Count')))
        self.addOutput(OutputNumber(self.UNIQUE, self.tr('Number of unique values')))
        self.addOutput(OutputNumber(self.EMPTY, self.tr('Number of empty (null) values')))
        self.addOutput(OutputNumber(self.FILLED, self.tr('Number of non-empty values')))
        self.addOutput(OutputNumber(self.MIN, self.tr('Minimum value')))
        self.addOutput(OutputNumber(self.MAX, self.tr('Maximum value')))
        self.addOutput(OutputNumber(self.MIN_LENGTH, self.tr('Minimum length')))
        self.addOutput(OutputNumber(self.MAX_LENGTH, self.tr('Maximum length')))
        self.addOutput(OutputNumber(self.MEAN_LENGTH, self.tr('Mean length')))
        self.addOutput(OutputNumber(self.CV, self.tr('Coefficient of Variation')))
        self.addOutput(OutputNumber(self.SUM, self.tr('Sum')))
        self.addOutput(OutputNumber(self.MEAN, self.tr('Mean value')))
        self.addOutput(OutputNumber(self.STD_DEV, self.tr('Standard deviation')))
        self.addOutput(OutputNumber(self.RANGE, self.tr('Range')))
        self.addOutput(OutputNumber(self.MEDIAN, self.tr('Median')))
        self.addOutput(OutputNumber(self.MINORITY, self.tr('Minority (rarest occurring value)')))
        self.addOutput(OutputNumber(self.MAJORITY, self.tr('Majority (most frequently occurring value)')))
        self.addOutput(OutputNumber(self.FIRSTQUARTILE, self.tr('First quartile')))
        self.addOutput(OutputNumber(self.THIRDQUARTILE, self.tr('Third quartile')))
        self.addOutput(OutputNumber(self.IQR, self.tr('Interquartile Range (IQR)')))

    def processAlgorithm(self, progress):
        """Compute the statistics for the selected field and write the report.

        :param progress: object receiving ``setPercentage`` updates while the
            features are being read.
        """
        layer = dataobjects.getObjectFromUri(
            self.getParameterValue(self.INPUT_LAYER))
        field_name = self.getParameterValue(self.FIELD_NAME)
        field = layer.fields().at(layer.fields().lookupField(field_name))

        output_file = self.getOutputValue(self.OUTPUT_HTML_FILE)

        # Only the analysed attribute is needed: skip geometry and the
        # remaining attributes when fetching features.
        request = QgsFeatureRequest().setFlags(QgsFeatureRequest.NoGeometry).setSubsetOfAttributes([field_name], layer.fields())
        features = vector.features(layer, request)

        data = []
        data.append(self.tr('Analyzed layer: {}').format(layer.name()))
        data.append(self.tr('Analyzed field: {}').format(field_name))

        if field.isNumeric():
            data.extend(self.calcNumericStats(features, progress, field))
        elif field.type() in (QVariant.Date, QVariant.Time, QVariant.DateTime):
            data.extend(self.calcDateTimeStats(features, progress, field))
        else:
            data.extend(self.calcStringStats(features, progress, field))

        self.createHTML(output_file, data)

    def _accumulate(self, features, progress, field, add_value):
        """Feed every feature's value for ``field`` to ``add_value``.

        Shared by the three ``calc*Stats`` methods so that progress reporting
        lives in one place.

        :param features: sized iterable of features (supports ``len``).
        :param progress: object receiving ``setPercentage`` updates.
        :param field: field whose ``name()`` is used to read each feature.
        :param add_value: callable invoked with each attribute value.
        :returns: the number of features, as reported by ``len(features)``.
        """
        count = len(features)
        # An empty table must not raise ZeroDivisionError; the loop below
        # simply never runs in that case.
        step = 100.0 / float(count) if count else 0.0
        name = field.name()
        for current, ft in enumerate(features):
            add_value(ft[name])
            progress.setPercentage(int(current * step))
        return count

    def calcNumericStats(self, features, progress, field):
        """Summarise a numeric field with QgsStatisticalSummary.

        Sets the numeric outputs and returns the report lines.

        :returns: list of translated ``'label: value'`` strings.
        """
        stat = QgsStatisticalSummary()
        count = self._accumulate(features, progress, field, stat.addVariant)
        stat.finalize()

        # Coefficient of variation is reported as 0 when the mean is 0.
        cv = stat.stDev() / stat.mean() if stat.mean() != 0 else 0

        self.setOutputValue(self.COUNT, stat.count())
        self.setOutputValue(self.UNIQUE, stat.variety())
        self.setOutputValue(self.EMPTY, stat.countMissing())
        self.setOutputValue(self.FILLED, count - stat.countMissing())
        self.setOutputValue(self.MIN, stat.min())
        self.setOutputValue(self.MAX, stat.max())
        self.setOutputValue(self.RANGE, stat.range())
        self.setOutputValue(self.SUM, stat.sum())
        self.setOutputValue(self.MEAN, stat.mean())
        self.setOutputValue(self.MEDIAN, stat.median())
        self.setOutputValue(self.STD_DEV, stat.stDev())
        self.setOutputValue(self.CV, cv)
        self.setOutputValue(self.MINORITY, stat.minority())
        self.setOutputValue(self.MAJORITY, stat.majority())
        self.setOutputValue(self.FIRSTQUARTILE, stat.firstQuartile())
        self.setOutputValue(self.THIRDQUARTILE, stat.thirdQuartile())
        self.setOutputValue(self.IQR, stat.interQuartileRange())

        data = []
        data.append(self.tr('Count: {}').format(stat.count()))
        data.append(self.tr('Unique values: {}').format(stat.variety()))
        data.append(self.tr('NULL (missing) values: {}').format(stat.countMissing()))
        data.append(self.tr('Minimum value: {}').format(stat.min()))
        data.append(self.tr('Maximum value: {}').format(stat.max()))
        data.append(self.tr('Range: {}').format(stat.range()))
        data.append(self.tr('Sum: {}').format(stat.sum()))
        data.append(self.tr('Mean value: {}').format(stat.mean()))
        data.append(self.tr('Median value: {}').format(stat.median()))
        data.append(self.tr('Standard deviation: {}').format(stat.stDev()))
        data.append(self.tr('Coefficient of Variation: {}').format(cv))
        data.append(self.tr('Minority (rarest occurring value): {}').format(stat.minority()))
        data.append(self.tr('Majority (most frequently occurring value): {}').format(stat.majority()))
        data.append(self.tr('First quartile: {}').format(stat.firstQuartile()))
        data.append(self.tr('Third quartile: {}').format(stat.thirdQuartile()))
        data.append(self.tr('Interquartile Range (IQR): {}').format(stat.interQuartileRange()))
        return data

    def calcStringStats(self, features, progress, field):
        """Summarise a text field with QgsStringStatisticalSummary.

        Sets the string-related outputs and returns the report lines.

        :returns: list of translated ``'label: value'`` strings.
        """
        stat = QgsStringStatisticalSummary()
        count = self._accumulate(features, progress, field, stat.addValue)
        stat.finalize()

        self.setOutputValue(self.COUNT, stat.count())
        self.setOutputValue(self.UNIQUE, stat.countDistinct())
        self.setOutputValue(self.EMPTY, stat.countMissing())
        self.setOutputValue(self.FILLED, stat.count() - stat.countMissing())
        self.setOutputValue(self.MIN, stat.min())
        self.setOutputValue(self.MAX, stat.max())
        self.setOutputValue(self.MIN_LENGTH, stat.minLength())
        self.setOutputValue(self.MAX_LENGTH, stat.maxLength())
        self.setOutputValue(self.MEAN_LENGTH, stat.meanLength())

        data = []
        # The report shows the number of features read from the table.
        data.append(self.tr('Count: {}').format(count))
        data.append(self.tr('Unique values: {}').format(stat.countDistinct()))
        data.append(self.tr('NULL (missing) values: {}').format(stat.countMissing()))
        data.append(self.tr('Minimum value: {}').format(stat.min()))
        data.append(self.tr('Maximum value: {}').format(stat.max()))
        data.append(self.tr('Minimum length: {}').format(stat.minLength()))
        data.append(self.tr('Maximum length: {}').format(stat.maxLength()))
        data.append(self.tr('Mean length: {}').format(stat.meanLength()))

        return data

    def calcDateTimeStats(self, features, progress, field):
        """Summarise a date, time or datetime field.

        Uses QgsDateTimeStatisticalSummary, sets the matching outputs and
        returns the report lines. Minimum and maximum are formatted for the
        report with ``field.displayString``.

        :returns: list of translated ``'label: value'`` strings.
        """
        stat = QgsDateTimeStatisticalSummary()
        count = self._accumulate(features, progress, field, stat.addValue)
        stat.finalize()

        minimum = stat.statistic(QgsDateTimeStatisticalSummary.Min)
        maximum = stat.statistic(QgsDateTimeStatisticalSummary.Max)

        self.setOutputValue(self.COUNT, stat.count())
        self.setOutputValue(self.UNIQUE, stat.countDistinct())
        self.setOutputValue(self.EMPTY, stat.countMissing())
        self.setOutputValue(self.FILLED, stat.count() - stat.countMissing())
        self.setOutputValue(self.MIN, minimum)
        self.setOutputValue(self.MAX, maximum)

        data = []
        # The report shows the number of features read from the table.
        data.append(self.tr('Count: {}').format(count))
        data.append(self.tr('Unique values: {}').format(stat.countDistinct()))
        data.append(self.tr('NULL (missing) values: {}').format(stat.countMissing()))
        data.append(self.tr('Minimum value: {}').format(field.displayString(minimum)))
        data.append(self.tr('Maximum value: {}').format(field.displayString(maximum)))

        return data

    def createHTML(self, outputFile, algData):
        """Write the report lines to ``outputFile`` as a UTF-8 HTML page.

        :param outputFile: path of the HTML file to create or overwrite.
        :param algData: iterable of report lines; each becomes one ``<p>``.
        """
        # Imported locally because the module header does not import html.
        from html import escape

        with codecs.open(outputFile, 'w', encoding='utf-8') as f:
            f.write('<html><head>\n')
            # Adjacent literals keep the emitted markup independent of the
            # indentation of this source file.
            f.write('<meta http-equiv="Content-Type" content="text/html; '
                    'charset=utf-8" /></head><body>\n')
            for s in algData:
                # Layer names, field names and string statistics come from
                # user data, so '&', '<' and '>' must not reach the markup raw.
                f.write('<p>' + escape(str(s), quote=False) + '</p>\n')
            f.write('</body></html>\n')

‎python/plugins/processing/algs/qgis/BasicStatisticsNumbers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,18 @@ class BasicStatisticsNumbers(GeoAlgorithm):
6868
NULLVALUES = 'NULLVALUES'
6969
IQR = 'IQR'
7070

71+
def __init__(self):
72+
GeoAlgorithm.__init__(self)
73+
# this algorithm is deprecated - use BasicStatistics instead
74+
self.showInToolbox = False
75+
7176
def getIcon(self):
7277
return QIcon(os.path.join(pluginPath, 'images', 'ftools', 'basic_statistics.png'))
7378

7479
def defineCharacteristics(self):
7580
self.name, self.i18n_name = self.trAlgorithm('Basic statistics for numeric fields')
7681
self.group, self.i18n_group = self.trAlgorithm('Vector table tools')
82+
self.tags = self.tr('stats,statistics,number,table,layer')
7783

7884
self.addParameter(ParameterTable(self.INPUT_LAYER,
7985
self.tr('Input vector layer')))

‎python/plugins/processing/algs/qgis/BasicStatisticsStrings.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,18 @@ class BasicStatisticsStrings(GeoAlgorithm):
6060
MIN_VALUE = 'MIN_VALUE'
6161
MAX_VALUE = 'MAX_VALUE'
6262

63+
def __init__(self):
64+
GeoAlgorithm.__init__(self)
65+
# this algorithm is deprecated - use BasicStatistics instead
66+
self.showInToolbox = False
67+
6368
def getIcon(self):
6469
return QIcon(os.path.join(pluginPath, 'images', 'ftools', 'basic_statistics.png'))
6570

6671
def defineCharacteristics(self):
6772
self.name, self.i18n_name = self.trAlgorithm('Basic statistics for text fields')
6873
self.group, self.i18n_group = self.trAlgorithm('Vector table tools')
74+
self.tags = self.tr('stats,statistics,string,table,layer')
6975

7076
self.addParameter(ParameterTable(self.INPUT_LAYER,
7177
self.tr('Input vector layer')))

‎python/plugins/processing/algs/qgis/QGISAlgorithmProvider.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@
180180
from .PoleOfInaccessibility import PoleOfInaccessibility
181181
from .CreateAttributeIndex import CreateAttributeIndex
182182
from .DropGeometry import DropGeometry
183+
from .BasicStatistics import BasicStatisticsForField
183184

184185
pluginPath = os.path.normpath(os.path.join(
185186
os.path.split(os.path.dirname(__file__))[0], os.pardir))
@@ -243,7 +244,8 @@ def __init__(self):
243244
TinInterpolationZValue(), TinInterpolationAttribute(),
244245
RemoveNullGeometry(), ExtractByExpression(), ExtendLines(),
245246
ExtractSpecificNodes(), GeometryByExpression(), SnapGeometriesToLayer(),
246-
PoleOfInaccessibility(), CreateAttributeIndex(), DropGeometry()
247+
PoleOfInaccessibility(), CreateAttributeIndex(), DropGeometry(),
248+
BasicStatisticsForField()
247249
]
248250

249251
if hasMatplotlib:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
!table
2+
!version 900
3+
!charset Neutral
4+
5+
Definition Table
6+
Type NATIVE Charset "Neutral"
7+
Fields 3
8+
date Date ;
9+
time Time ;
10+
date_time DateTime ;
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<html><head>
2+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
3+
<p>Analyzed layer: custom/datetimes.tab</p>
4+
<p>Analyzed field: date</p>
5+
<p>Count: 4</p>
6+
<p>Unique values: 3</p>
7+
<p>NULL (missing) values: 1</p>
8+
<p>Minimum value: 2014-11-30T00:00:00</p>
9+
<p>Maximum value: 2016-11-30T00:00:00</p>
10+
</body></html>
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<html><head>
2+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
3+
<p>Analyzed layer: custom/datetimes.tab</p>
4+
<p>Analyzed field: date_time</p>
5+
<p>Count: 4</p>
6+
<p>Unique values: 3</p>
7+
<p>NULL (missing) values: 1</p>
8+
<p>Minimum value: 2014-11-30T14:30:02</p>
9+
<p>Maximum value: 2016-11-30T14:29:22</p>
10+
</body></html>
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<html><head>
2+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
3+
<p>Analyzed layer: custom/datetimes.tab</p>
4+
<p>Analyzed field: time</p>
5+
<p>Count: 4</p>
6+
<p>Unique values: 3</p>
7+
<p>NULL (missing) values: 1</p>
8+
<p>Minimum value: 03:29:40</p>
9+
<p>Maximum value: 15:29:22</p>
10+
</body></html>

‎python/plugins/processing/tests/testdata/qgis_algorithm_tests.yaml

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ tests:
152152
fields:
153153
fid: skip
154154

155-
- algorithm: qgis:basicstatisticsfornumericfields
155+
- algorithm: qgis:basicstatisticsforfields
156156
name: Basic statistics for numeric fields
157157
params:
158158
- name: multipolys.gml
@@ -182,7 +182,7 @@ tests:
182182
- 'NULL \(missing\) values: 1'
183183
- 'Interquartile Range \(IQR\): 0.123'
184184

185-
- algorithm: qgis:basicstatisticsfortextfields
185+
- algorithm: qgis:basicstatisticsforfields
186186
name: Basic statistics for text fields
187187
params:
188188
- name: multipolys.gml
@@ -191,7 +191,18 @@ tests:
191191
results:
192192
OUTPUT_HTML_FILE:
193193
name: expected/basic_statistics_string.html
194-
type: file
194+
type: regex
195+
rules:
196+
- 'Analyzed layer: multipolys.gml'
197+
- 'Analyzed field: Bname'
198+
- 'Count: 4'
199+
- 'Unique values: 2'
200+
- 'Minimum value: Test'
201+
- 'Maximum value: Test'
202+
- 'Minimum length: 0'
203+
- 'Maximum length: 4'
204+
- 'Mean length: 3.0'
205+
- 'NULL \(missing\) values: 1'
195206

196207
# Split lines with lines considers two cases
197208
# case 1: two different layers
@@ -1753,3 +1764,63 @@ tests:
17531764
OUTPUT:
17541765
name: expected/removed_holes_min_area.gml
17551766
type: vector
1767+
1768+
- algorithm: qgis:basicstatisticsforfields
1769+
name: Basic stats datetime
1770+
params:
1771+
FIELD_NAME: date_time
1772+
INPUT_LAYER:
1773+
name: custom/datetimes.tab
1774+
type: table
1775+
results:
1776+
OUTPUT_HTML_FILE:
1777+
name: expected/basic_statistics_datetime.html
1778+
type: regex
1779+
rules:
1780+
- 'Analyzed layer: custom/datetimes.tab'
1781+
- 'Analyzed field: date_time'
1782+
- 'Count: 4'
1783+
- 'Unique values: 3'
1784+
- 'Minimum value: 2014-11-30T14:30:02'
1785+
- 'Maximum value: 2016-11-30T14:29:22'
1786+
- 'NULL \(missing\) values: 1'
1787+
1788+
- algorithm: qgis:basicstatisticsforfields
1789+
name: Basic stats date
1790+
params:
1791+
FIELD_NAME: date
1792+
INPUT_LAYER:
1793+
name: custom/datetimes.tab
1794+
type: table
1795+
results:
1796+
OUTPUT_HTML_FILE:
1797+
name: expected/basic_statistics_date.html
1798+
type: regex
1799+
rules:
1800+
- 'Analyzed layer: custom/datetimes.tab'
1801+
- 'Analyzed field: date'
1802+
- 'Count: 4'
1803+
- 'Unique values: 3'
1804+
- 'Minimum value: 2014-11-30T00:00:00'
1805+
- 'Maximum value: 2016-11-30T00:00:00'
1806+
- 'NULL \(missing\) values: 1'
1807+
1808+
- algorithm: qgis:basicstatisticsforfields
1809+
name: Basic stats time
1810+
params:
1811+
FIELD_NAME: time
1812+
INPUT_LAYER:
1813+
name: custom/datetimes.tab
1814+
type: table
1815+
results:
1816+
OUTPUT_HTML_FILE:
1817+
name: expected/basic_statistics_time.html
1818+
type: regex
1819+
rules:
1820+
- 'Analyzed layer: custom/datetimes.tab'
1821+
- 'Analyzed field: time'
1822+
- 'Count: 4'
1823+
- 'Unique values: 3'
1824+
- 'Minimum value: 03:29:40'
1825+
- 'Maximum value: 15:29:22'
1826+
- 'NULL \(missing\) values: 1'

0 commit comments

Comments
 (0)
Please sign in to comment.