aboutsummaryrefslogtreecommitdiff
path: root/tools/stub_diff_analyzer.py
blob: e49d0923117b6592322e0389393d7f8bd34ca654 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
#!/usr/bin/env python
#
# Copyright (C) 2022 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sys import exit
from typing import List
from glob import glob
from pathlib import Path
from collections import defaultdict
from difflib import Differ
from re import split
from tqdm import tqdm
import argparse


# Number of leading characters on each difflib.Differ output line that hold
# the line's code (see DifferCodes); the actual content follows this prefix.
DIFFER_CODE_LEN = 2

class DifferCodes:
    """Two-character prefixes that tag each line of a Differ comparison."""

    # Line is present in both compared sequences.
    COMMON = "  "
    # Line is present only in the first sequence.
    UNIQUE_FIRST = "- "
    # Line is present only in the second sequence.
    UNIQUE_SECOND = "+ "
    # Hint line that is present in neither sequence.
    DIFF_IDENT = "? "

class FilesDiffAnalyzer:
    def __init__(self, args) -> None:
        self.out_dir = args.out_dir
        self.show_diff = args.show_diff
        self.skip_words = args.skip_words
        self.first_dir = args.first_dir
        self.second_dir = args.second_dir
        self.include_common = args.include_common

        self.first_dir_files = self.get_files(self.first_dir)
        self.second_dir_files = self.get_files(self.second_dir)
        self.common_file_map = defaultdict(set)

        self.map_common_files(self.first_dir_files, self.first_dir)
        self.map_common_files(self.second_dir_files, self.second_dir)

    def get_files(self, dir: str) -> List[str]:
        """Get all files directory in the input directory including the files in the subdirectories

        Recursively finds all files in the input directory.
        Returns a list of file directory strings, which do not include directories but only files.
        List is sorted in alphabetical order of the file directories.

        Args:
            dir: Directory to get the files. String.

        Returns:
            A list of file directory strings within the input directory.
            Sorted in Alphabetical order.

        Raises:
            FileNotFoundError: An error occurred accessing the non-existing directory
        """

        if not dir_exists(dir):
            raise FileNotFoundError("Directory does not exist")

        if dir[:-2] != "**":
            if dir[:-1] != "/":
                dir += "/"
            dir += "**"

        return [file for file in sorted(glob(dir, recursive=True)) if Path(file).is_file()]

    def map_common_files(self, files: List[str], dir: str) -> None:
        for file in files:
            file_name = file.split(dir, 1)[-1]
            self.common_file_map[file_name].add(dir)
        return

    def compare_file_contents(self, first_file: str, second_file: str) -> List[str]:
        """Compare the contents of the files and return different lines

        Given two file directory strings, compare the contents of the two files
        and return the list of file contents string prepended with unique identifier codes.
        The identifier codes include:
        - '  '(two empty space characters): Line common to two files
        - '- '(minus followed by a space) : Line unique to first file
        - '+ '(plus followed by a space)  : Line unique to second file

        Args:
            first_file: First file directory string to compare the content
            second_file: Second file directory string to compare the content

        Returns:
            A list of the file content strings. For example:

            [
                "  Foo",
                "- Bar",
                "+ Baz"
            ]
        """

        d = Differ()
        first_file_contents = sort_methods(get_file_contents(first_file))
        second_file_contents = sort_methods(get_file_contents(second_file))
        diff = list(d.compare(first_file_contents, second_file_contents))
        ret = [f"diff {first_file} {second_file}"]

        idx = 0
        while idx < len(diff):
            line = diff[idx]
            line_code = line[:DIFFER_CODE_LEN]

            match line_code:
                case DifferCodes.COMMON:
                    if self.include_common:
                        ret.append(line)

                case DifferCodes.UNIQUE_FIRST:
                    # Should compare line
                    if (idx < len(diff) - 1 and
                        (next_line_code := diff[idx + 1][:DIFFER_CODE_LEN])
                        not in (DifferCodes.UNIQUE_FIRST, DifferCodes.COMMON)):
                        delta = 1 if next_line_code == DifferCodes.UNIQUE_SECOND else 2
                        line_to_compare = diff[idx + delta]
                        if self.lines_differ(line, line_to_compare):
                            ret.extend([line, line_to_compare])
                        else:
                            if self.include_common:
                                ret.append(DifferCodes.COMMON +
                                           line[DIFFER_CODE_LEN:])
                        idx += delta
                    else:
                        ret.append(line)

                case DifferCodes.UNIQUE_SECOND:
                    ret.append(line)

                case DifferCodes.DIFF_IDENT:
                    pass
            idx += 1
        return ret

    def lines_differ(self, line1: str, line2: str) -> bool:
        """Check if the input lines are different or not

        Compare the two lines word by word and check if the two lines are different or not.
        If the different words in the comparing lines are included in skip_words,
        the lines are not considered different.

        Args:
            line1:      first line to compare
            line2:      second line to compare

        Returns:
            Boolean value indicating if the two lines are different or not

        """
        # Split by '.' or ' '(whitespace)
        def split_words(line: str) -> List[str]:
            return split('\\s|\\.', line[DIFFER_CODE_LEN:])

        line1_words, line2_words = split_words(line1), split_words(line2)
        if len(line1_words) != len(line2_words):
            return True

        for word1, word2 in zip(line1_words, line2_words):
            if word1 != word2:
                # not check if words are equal to skip word, but
                # check if words contain skip word as substring
                if all(sw not in word1 and sw not in word2 for sw in self.skip_words):
                    return True

        return False

    def analyze(self) -> None:
        """Analyze file contents in both directories and write to output or console.
        """
        for file in tqdm(sorted(self.common_file_map.keys())):
            val = self.common_file_map[file]

            # When file exists in both directories
            lines = list()
            if val == set([self.first_dir, self.second_dir]):
                lines = self.compare_file_contents(
                    self.first_dir + file, self.second_dir + file)
            else:
                existing_dir, not_existing_dir = (
                    (self.first_dir, self.second_dir) if self.first_dir in val
                    else (self.second_dir, self.first_dir))

                lines = [f"{not_existing_dir}{file} does not exist."]

                if self.show_diff:
                    lines.append(f"Content of {existing_dir}{file}: \n")
                    lines.extend(get_file_contents(existing_dir + file))

            self.write(lines)

    def write(self, lines: List[str]) -> None:
        if self.out_dir == "":
            pprint(lines)
        else:
            write_lines(self.out_dir, lines)

###
# Helper functions
###

def sort_methods(lines: List[str]) -> List[str]:
    """Sort class methods in the file contents by alphabetical order

    Given lines of Java file contents, return lines with class methods sorted in alphabetical order.
    Also omit empty lines or lines with spaces.

    A class body starts after any line containing the text "class" and ends
    at the first following line that begins with "}". The lines in between
    are sorted as plain strings. If the input ends before such a closing
    line is seen, the lines collected so far are still sorted and returned
    rather than being dropped.

    For example:
        l = [
            'package android.test;',
            '',
            'public static final int ORANGE = 1;',
            '',
            'public class TestClass {',
            'public TestClass() { throw new RuntimeException("Stub!"); }',
            'public void foo() { throw new RuntimeException("Stub!"); }',
            'public void bar() { throw new RuntimeException("Stub!"); }',
            '}'
        ]
        sort_methods(l) returns
        [
            'package android.test;',
            'public static final int ORANGE = 1;',
            'public class TestClass {',
            'public TestClass() { throw new RuntimeException("Stub!"); }',
            'public void bar() { throw new RuntimeException("Stub!"); }',
            'public void foo() { throw new RuntimeException("Stub!"); }',
            '}'
        ]

    Args:
        lines: List of strings consisted of Java file contents.

    Returns:
        A list of string with sorted class methods.

    """
    def is_not_blank(l: str) -> bool:
        return bool(l) and not l.isspace()

    ret = []

    in_class = False
    # Non-blank lines of the class body currently being read.
    buffer = []
    for line in lines:
        if not in_class:
            if "class" in line:
                in_class = True
                ret.append(line)
            elif is_not_blank(line):
                # Adding static variables, package info, etc.
                # Skipping empty or space lines.
                ret.append(line)
        elif line.startswith("}"):
            # End of class: emit the sorted body, then the closing line.
            in_class = False
            ret.extend(sorted(buffer))
            buffer = []
            ret.append(line)
        elif is_not_blank(line):
            buffer.append(line)

    # Input ended inside a class body: keep the buffered lines instead of
    # silently losing them. The buffer is empty for well-formed input.
    ret.extend(sorted(buffer))

    return ret

def get_file_contents(file_path: str) -> List[str]:
    """Read a text file and return its lines without trailing newlines.

    Args:
        file_path: Path of the file to read.

    Returns:
        One string per line of the file, in file order. Only the newline
        character is removed; other trailing whitespace is kept.
    """
    # The with-statement closes the file, so no explicit close() is needed.
    with open(file_path) as f:
        return [line.rstrip('\n') for line in f]

def pprint(l: List[str]) -> None:
    """Print every entry of ``l`` on its own line; print nothing when it is empty."""
    # Build the whole output first and emit it with one print call; each
    # entry is terminated by a newline, exactly as a per-entry print would.
    print("".join(f"{entry}\n" for entry in l), end="")

def write_lines(out_dir: str, lines: List[str]) -> None:
    """Append the lines to a file, followed by one blank separator line.

    Args:
        out_dir: Path of the file to append to. Despite the name it is
            opened as a file, and it is created when it does not exist.
        lines: Lines to write; a newline is added after each of them.
    """
    # The with-statement closes the file, so no explicit close() is needed.
    with open(out_dir, "a") as f:
        f.writelines(line + '\n' for line in lines)
        f.write("\n")

def dir_exists(dir: str) -> bool:
    """Return whether ``dir`` is the path of an existing directory.

    A path that exists but is a regular file is reported as False, so that
    callers guarding a directory scan do not accept a file by mistake.

    Args:
        dir: Path to check.

    Returns:
        True when the path exists and is a directory, False otherwise.
    """
    return Path(dir).is_dir()

if __name__ == '__main__':
    # Command-line interface: two positional directories plus optional
    # output and comparison flags.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        'first_dir', action='store', type=str,
        help="first path to compare file directory and contents")
    cli.add_argument(
        'second_dir', action='store', type=str,
        help="second path to compare file directory and contents")
    cli.add_argument(
        '--out', dest='out_dir', action='store', default="", type=str,
        help="optional directory to write log. If not set, will print to console")
    cli.add_argument(
        '--show-diff-file', dest='show_diff',
        action=argparse.BooleanOptionalAction,
        help="optional flag. If passed, will print out the content of the file unique to each directories")
    cli.add_argument(
        '--include-common', dest='include_common',
        action=argparse.BooleanOptionalAction,
        help="optional flag. If passed, will print out the contents common to both files as well,\
                            instead of printing only diff lines.")
    cli.add_argument(
        '--skip-words', nargs='+', dest='skip_words', default=[],
        help="optional words to skip in comparison")

    args = cli.parse_args()

    # An empty string for either directory is treated as missing input.
    if not (args.first_dir and args.second_dir):
        cli.print_usage()
        exit(0)

    FilesDiffAnalyzer(args).analyze()