server/site_crashcollect.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405

# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import os
import re
import shutil
from autotest_lib.client.common_lib import utils as client_utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.client.cros import constants
from autotest_lib.server.cros.dynamic_suite.constants import JOB_BUILD_KEY
from autotest_lib.server.crashcollect import collect_log_file
from autotest_lib.server import utils

try:
    from autotest_lib.utils.frozen_chromite.lib import metrics
except ImportError:
    metrics = client_utils.metrics_mock


def generate_minidump_stacktrace(minidump_path):
    """
    Generates a stacktrace for the specified minidump.

    This function expects the debug symbols to reside under:
        /build/<board>/usr/lib/debug

    @param minidump_path: absolute path to minidump to by symbolicated.
    @raise client_utils.error.CmdError if minidump_stackwalk return code != 0.
    """
    symbol_dir = '%s/../../../lib/debug' % utils.get_server_dir()
    logging.info('symbol_dir: %s', symbol_dir)
    client_utils.run('minidump_stackwalk "%s" "%s" > "%s.txt"' %
                     (minidump_path, symbol_dir, minidump_path))


def _resolve_crashserver():
    """
    Attempts to find a devserver / crashserver that has capacity to
    symbolicate a crashdump.

    @raises DevServerException if no server with capacity could be found.
    @returns Hostname of resolved server, if found.
    """
    crashserver_name = dev_server.get_least_loaded_devserver(
            devserver_type=dev_server.CrashServer)
    if not crashserver_name:
        metrics.Counter('chromeos/autotest/crashcollect/could_not_resolve'
                        ).increment()
        raise dev_server.DevServerException(
                'No crash server has the capacity to symbolicate the dump.')
    else:
        metrics.Counter('chromeos/autotest/crashcollect/resolved'
                        ).increment(fields={'crash_server': crashserver_name})
    return crashserver_name


def _symbolicate_minidump_with_devserver(minidump_path, resultdir,
                                        crashserver_name):
    """
    Generates a stack trace for the specified minidump by consulting devserver.

    This function assumes the debug symbols have been staged on the devserver.

    @param minidump_path: absolute path to minidump to by symbolicated.
    @param resultdir: server job's result directory.
    @param crashserver_name: Name of crashserver to attempt to symbolicate with.
    @raise DevServerException upon failure, HTTP or otherwise.
    """
    # First, look up what build we tested.  If we can't find this, we can't
    # get the right debug symbols, so we might as well give up right now.
    keyvals = client_utils.read_keyval(resultdir)
    if JOB_BUILD_KEY not in keyvals:
        raise dev_server.DevServerException(
            'Cannot determine build being tested.')

    devserver = dev_server.CrashServer(crashserver_name)

    with metrics.SecondsTimer(
            'chromeos/autotest/crashcollect/symbolicate_duration',
            fields={'crash_server': crashserver_name}):
        trace_text = devserver.symbolicate_dump(minidump_path,
                                                keyvals[JOB_BUILD_KEY])

    if not trace_text:
        raise dev_server.DevServerException('Unknown error!!')
    with open(minidump_path + '.txt', 'w') as trace_file:
        trace_file.write(trace_text)

def generate_stacktrace_for_file(minidump, host_resultdir):
    """
    Tries to generate a stack trace for the file located at |minidump|.
    @param minidump: path to minidump file to generate the stacktrace for.
    @param host_resultdir: server job's result directory.
    """
    # First, try to symbolicate locally.
    try:
        logging.info('Trying to generate stack trace locally for %s', minidump)
        generate_minidump_stacktrace(minidump)
        logging.info('Generated stack trace for dump %s', minidump)
        return
    except client_utils.error.CmdError as err:
        logging.info('Failed to generate stack trace locally for '
                     'dump %s (rc=%d):\n%r',
                     minidump, err.result_obj.exit_status, err)

    # If that did not succeed, try to symbolicate using the dev server.
    try:
        logging.info('Generating stack trace using devserver for %s', minidump)
        crashserver_name = _resolve_crashserver()
        args = (minidump, host_resultdir, crashserver_name)
        is_timeout, _ = retry.timeout(_symbolicate_minidump_with_devserver,
                                      args=args,
                                      timeout_sec=600)
        if is_timeout:
            logging.info('Generating stack trace timed out for dump %s',
                         minidump)
            metrics.Counter(
                    'chromeos/autotest/crashcollect/symbolicate_timed_out'
            ).increment(fields={'crash_server': crashserver_name})
        else:
            logging.info('Generated stack trace for dump %s', minidump)
            return
    except dev_server.DevServerException as e:
        logging.info('Failed to generate stack trace on devserver for dump '
                     '%s:\n%r', minidump, e)

    # Symbolicating failed.
    logging.warning('Failed to generate stack trace for %s (see info logs)',
                    minidump)

def find_and_generate_minidump_stacktraces(host_resultdir):
    """
    Finds all minidump files and generates a stack trace for each.

    Enumerates all files under the test results directory (recursively)
    and generates a stack trace file for the minidumps.  Minidump files are
    identified as files with .dmp extension.  The stack trace filename is
    composed by appending the .txt extension to the minidump filename.

    @param host_resultdir: Directory to walk looking for dmp files.

    @returns The list of all found minidump files. Each dump may or may not have
             been symbolized.
    """
    minidumps = []
    for file in _find_crashdumps(host_resultdir):
        generate_stacktrace_for_file(file, host_resultdir)
        minidumps.append(file)
    return minidumps


def _find_crashdumps(host_resultdir):
    """Find crashdumps.

    @param host_resultdir The result directory for this host for this test run.
    """
    for dir, subdirs, files in os.walk(host_resultdir):
        for file in files:
            if file.endswith('.dmp'):
                yield os.path.join(dir, file)


def _find_orphaned_crashdumps(host):
    """Return file paths of crashdumps on host.

    @param host A host object of the device.
    """
    return host.list_files_glob(os.path.join(constants.CRASH_DIR, '*'))


def report_crashdumps(host):
    """Report on crashdumps for host.

    This is run when no tests failed.  We don't process crashdumps in this
    case because of devserver load, but they should still be reported.

    @param host A host object of the device we're to pull crashes from.
    """
    for crashfile in _find_orphaned_crashdumps(host):
        logging.warning('Host crashdump exists: %s', crashfile)
        host.job.record('INFO', None, None,
                        'Host crashdump exists: %s' % (crashfile,))

    host_resultdir = _get_host_resultdir(host)
    for crashfile in _find_crashdumps(host_resultdir):
        logging.warning('Local crashdump exists: %s', crashfile)
        host.job.record('INFO', None, None,
                        'Local crashdump exists: %s' % (crashfile,))


def fetch_orphaned_crashdumps(host, infodir):
    """
    Copy all of the crashes in the crash directory over to the results folder.

    @param host A host object of the device we're to pull crashes from.
    @param infodir The directory to fetch crashdumps into.
    @return The list of minidumps that we pulled back from the host.
    """
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    orphans = []

    if not host.check_cached_up_status():
        logging.warning('Host %s did not answer to ping, skip fetching '
                        'orphaned crashdumps.', host.hostname)
        return orphans

    try:
        for file in _find_orphaned_crashdumps(host):
            logging.info('Collecting %s...', file)
            collect_log_file(host, file, infodir, clean=False)
            orphans.append(file)
    except Exception as e:
        logging.warning('Collection of orphaned crash dumps failed %s', e)
    finally:
        # Delete infodir if we have no orphans
        if not orphans:
            logging.info('There are no orphaned crashes; deleting %s', infodir)
            os.rmdir(infodir)
    return orphans


def _copy_to_debug_dir(host_resultdir, filename):
    """
    Copies a file to the debug dir under host_resultdir.

    @param host_resultdir The result directory for this host for this test run.
    @param filename The full path of the file to copy to the debug folder.
    """
    debugdir = os.path.join(host_resultdir, 'debug')
    src = filename
    dst = os.path.join(debugdir, os.path.basename(filename))

    try:
        shutil.copyfile(src, dst)
        logging.info('Copied %s to %s', src, dst)
    except IOError:
        logging.warning('Failed to copy %s to %s', src, dst)


def _get_host_resultdir(host):
    """Get resultdir for host.

    @param host A host object of the device we're to pull crashes from.
    """
    return getattr(getattr(host, 'job', None), 'resultdir', None)


def get_host_infodir(host):
    """Get infodir for host.

    @param host A host object of the device we're to pull crashes from.
    """
    host_resultdir = _get_host_resultdir(host)
    return os.path.join(host_resultdir, 'crashinfo.%s' % host.hostname)


def get_site_crashdumps(host, test_start_time):
    """
    Copy all of the crashdumps from a host to the results directory.

    @param host The host object from which to pull crashes
    @param test_start_time When the test we just ran started.
    @return A list of all the minidumps
    """
    host_resultdir = _get_host_resultdir(host)
    infodir = get_host_infodir(host)

    orphans = fetch_orphaned_crashdumps(host, infodir)
    minidumps = find_and_generate_minidump_stacktraces(host_resultdir)

    # Record all crashdumps in status.log of the job:
    # - If one server job runs several client jobs we will only record
    # crashdumps in the status.log of the high level server job.
    # - We will record these crashdumps whether or not we successfully
    # symbolicate them.
    if host.job and minidumps or orphans:
        host.job.record('INFO', None, None, 'Start crashcollection record')
        for minidump in minidumps:
            host.job.record('INFO', None, 'New Crash Dump', minidump)
        for orphan in orphans:
            host.job.record('INFO', None, 'Orphaned Crash Dump', orphan)
        host.job.record('INFO', None, None, 'End crashcollection record')

    orphans.extend(minidumps)

    for minidump in orphans:
        report_bug_from_crash(host, minidump)

    # We copy Chrome crash information to the debug dir to assist debugging.
    # Since orphans occurred on a previous run, they are most likely not
    # relevant to the current failure, so we don't copy them.
    for minidump in minidumps:
        minidump_no_ext = os.path.splitext(minidump)[0]
        _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.dmp.txt')
        _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.log')

    return orphans


def find_package_of(host, exec_name):
    """
    Find the package that an executable came from.

    @param host A host object that has the executable.
    @param exec_name Name of or path to executable.
    @return The name of the package that installed the executable.
    """
    # Run "portageq owners" on "host" to determine which package owns
    # "exec_name."  Portageq queue output consists of package names followed
    # tab-prefixed path names.  For example, owners of "python:"
    #
    # sys-devel/gdb-7.7.1-r2
    #         /usr/share/gdb/python
    # chromeos-base/dev-install-0.0.1-r711
    #         /usr/bin/python
    # dev-lang/python-2.7.3-r7
    #         /etc/env.d/python
    #
    # This gets piped into "xargs stat" to annotate each line with
    # information about the path, so we later can consider only packages
    # with executable files.  After annotation the above looks like:
    #
    # stat: cannot stat '@@@ sys-devel/gdb-7.7.1-r2 @@@': ...
    # stat: cannot stat '/usr/share/gdb/python': ...
    # stat: cannot stat '@@@ chromeos-base/dev-install-0.0.1-r711 @@@': ...
    # 755 -rwxr-xr-x /usr/bin/python
    # stat: cannot stat '@@@ dev-lang/python-2.7.3-r7 @@@': ...
    # 755 drwxr-xr-x /etc/env.d/python
    #
    # Package names are surrounded by "@@@" to facilitate parsing.  Lines
    # starting with an octal number were successfully annotated, because
    # the path existed on "host."
    # The above is then parsed to find packages which contain executable files
    # (not directories), in this case "chromeos-base/dev-install-0.0.1-r711."
    #
    # TODO(milleral): portageq can show scary looking error messages
    # in the debug logs via stderr. We only look at stdout, so those
    # get filtered, but it would be good to silence them.
    cmd = ('portageq owners / ' + exec_name +
            r'| sed -e "s/^[^\t].*/@@@ & @@@/" -e "s/^\t//"'
            r'| tr \\n \\0'
            ' | xargs -0 -r stat -L -c "%a %A %n" 2>&1')
    portageq = host.run(cmd, ignore_status=True)

    # Parse into a set of names of packages containing an executable file.
    packages = set()
    pkg = ''
    pkg_re = re.compile('@@@ (.*) @@@')
    path_re = re.compile('^([0-7]{3,}) (.)')
    for line in portageq.stdout.splitlines():
        match = pkg_re.search(line)
        if match:
            pkg = match.group(1)
            continue
        match = path_re.match(line)
        if match:
            isexec = int(match.group(1), 8) & 0o111
            isfile = match.group(2) == '-'
            if pkg and isexec and isfile:
                packages.add(pkg)

    # If exactly one package found it must be the one we want, return it.
    if len(packages) == 1:
        return packages.pop()

    # TODO(milleral): Decide if it really is an error if not exactly one
    # package is found.
    # It is highly questionable as to if this should be left in the
    # production version of this code or not.
    if len(packages) == 0:
        logging.warning('find_package_of() found no packages for "%s"',
                        exec_name)
    else:
        logging.warning('find_package_of() found multiple packages for "%s": '
                        '%s', exec_name, ', '.join(packages))
    return ''


def report_bug_from_crash(host, minidump_path):
    """
    Given a host to query and a minidump, file a bug about the crash.

    @param host A host object that is where the dump came from
    @param minidump_path The path to the dump file that should be reported.
    """
    # TODO(milleral): Once this has actually been tested, remove the
    # try/except. In the meantime, let's make sure nothing dies because of
    # the fact that this code isn't very heavily tested.
    try:
        meta_path = os.path.splitext(minidump_path)[0] + '.meta'
        with open(meta_path, 'r') as f:
            for line in f.readlines():
                parts = line.split('=')
                if parts[0] == 'exec_name':
                    package = find_package_of(host, parts[1].strip())
                    if not package:
                        package = '<unknown package>'
                    logging.info('Would report crash on %s.', package)
                    break
    except Exception as e:
        logging.warning('Crash detection failed with: %s', e)