package edu.stanford.nlp.international.arabic.pipeline;

import edu.stanford.nlp.international.arabic.Buckwalter;
import edu.stanford.nlp.process.treebank.AbstractDataset;
import edu.stanford.nlp.process.treebank.Dataset;
import edu.stanford.nlp.process.treebank.StringMap;
import edu.stanford.nlp.trees.BobChrisTreeNormalizer;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeVisitor;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import edu.stanford.nlp.trees.international.arabic.ArabicTreeNormalizer;
import edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory;
import edu.stanford.nlp.trees.international.arabic.ArabicTreebankLanguagePack;
import edu.stanford.nlp.util.Filter;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.Iterator;

/* loaded from: input_file:stanford-parser.jar:edu/stanford/nlp/international/arabic/pipeline/ATBArabicDataset.class */
/**
 * Dataset builder that loads trees through an
 * {@link ArabicTreeReaderFactory.ArabicRawTreeReaderFactory}, normalizes each
 * tree with {@link ArabicRawTreeNormalizer}, and writes the result to the
 * configured output file (plus an optional "flat" file, one flattened tree per
 * line).
 *
 * <p>Configuration (paths, mappers, flags, file names) lives in fields
 * inherited from {@link AbstractDataset}; {@link #setOptions(StringMap)} is
 * where the default lexical and POS mappers get installed.
 */
public class ATBArabicDataset extends AbstractDataset {

    /**
     * Tree visitor that normalizes one raw tree and prints it.
     *
     * <p>This is an inner (non-static) class on purpose: it reads the enclosing
     * dataset's configuration (mappers, encoding, length limit, flags) directly.
     */
    protected class ArabicRawTreeNormalizer implements TreeVisitor {
        /** Applied to a mapped word when the lexical mapper permits an encoding change. */
        protected final Buckwalter encodingMap;
        /** Receives one bracketed tree per line. */
        protected final PrintWriter outfile;
        /** Receives one flattened tree per line; {@code null} when no flat file is wanted. */
        protected final PrintWriter flatFile;
        /** Filter handed to {@link Tree#prune} in {@link #visitTree(Tree)}. */
        protected final Filter<Tree> nullFilter;
        /** Not referenced inside this class; retained because it is part of the protected surface. */
        protected final Filter<Tree> aOverAFilter;
        /** Factory used for pruning and for creating the optional ROOT node. */
        protected final TreeFactory tf;
        /** Supplies {@code basicCategory} when dash tags are stripped. */
        protected final TreebankLanguagePack tlp;

        /**
         * @param outfile  destination for bracketed trees; must not be {@code null}
         * @param flatFile destination for flattened trees, or {@code null} to skip them
         */
        public ArabicRawTreeNormalizer(PrintWriter outfile, PrintWriter flatFile) {
            // NOTE(review): the boolean presumably selects the transliteration
            // direction of the Buckwalter map -- confirm against Buckwalter's docs.
            this.encodingMap = (encoding == Dataset.Encoding.UTF8) ? new Buckwalter() : new Buckwalter(true);
            this.outfile = outfile;
            this.flatFile = flatFile;
            this.nullFilter = new ArabicTreeNormalizer.ArabicEmptyFilter();
            this.aOverAFilter = new BobChrisTreeNormalizer.AOverAFilter();
            this.tf = new LabeledScoredTreeFactory();
            this.tlp = new ArabicTreebankLanguagePack();
        }

        /**
         * Rewrites a preterminal in place: the node receives the mapped POS tag
         * and its single child receives the mapped (and possibly re-encoded)
         * word, optionally suffixed with {@code morphDelim} and the raw tag.
         *
         * @param tree a preterminal node (one child that is a leaf)
         */
        protected void processPreterminal(Tree tree) {
            String rawTag = tree.value();
            String rawWord = tree.firstChild().value();

            // Without a POS mapper the raw tag is kept unchanged.
            String posTag = (posMapper == null) ? rawTag : posMapper.map(rawTag, rawWord);

            // Special case: a word that is exactly "F" is rewritten to "f" and
            // retagged. NOTE(review): presumably works around a treebank
            // annotation quirk -- confirm with the corpus documentation.
            if (rawWord.equals("F")) {
                posTag = posTag.equals("NOUN.VN") ? "CONJ" : "CC";
                rawWord = "f";
            }

            // The lexical mapper is always consulted with the *raw* tag.
            String finalWord = lexMapper.map(rawTag, rawWord);
            if (lexMapper.canChangeEncoding(rawTag, finalWord)) {
                finalWord = this.encodingMap.apply(finalWord);
            }

            tree.setValue(posTag);
            if (morphDelim == null) {
                tree.firstChild().setValue(finalWord);
            } else {
                tree.firstChild().setValue(finalWord + morphDelim + rawTag);
            }
        }

        /**
         * Collapses unary "A over A" configurations in place: when a node has
         * exactly one child carrying the same value, the child is spliced out
         * and the node adopts the grandchildren. Recurses into the (possibly
         * updated) children.
         *
         * @param tree the subtree to clean; may be {@code null}
         * @return the same {@code tree} instance (or {@code null} if given {@code null})
         */
        public Tree arabicAoverAFilter(Tree tree) {
            if (tree == null || tree.isLeaf() || tree.isPreTerminal()) {
                return tree;
            }
            if (tree.numChildren() == 1) {
                Tree onlyChild = tree.firstChild();
                boolean bothLabeled = tree.label() != null && onlyChild.label() != null;
                if (bothLabeled && tree.value().equals(onlyChild.value())) {
                    tree.setChildren(onlyChild.children());
                }
            }
            for (Tree kid : tree.getChildrenAsList()) {
                arabicAoverAFilter(kid);
            }
            return tree;
        }

        /**
         * Normalizes one tree and prints it. Trees are silently skipped when
         * they are {@code null}, rooted at "X", longer than {@code maxLen}
         * tokens, or left empty by pruning.
         *
         * @param tree the raw tree to process
         */
        @Override
        public void visitTree(Tree tree) {
            // Constant-first equals: a root with a null value must not throw here
            // (the ROOT check further down already tolerates a null value).
            if (tree == null || "X".equals(tree.value())) {
                return;
            }
            if (tree.yield().size() > maxLen) {
                return;
            }

            Tree normalized = arabicAoverAFilter(tree.prune(this.nullFilter, this.tf));
            if (normalized == null) {
                // Tree.prune is documented to return null when the root itself is
                // filtered out; skip the tree instead of failing with a
                // NullPointerException and aborting the whole treebank pass.
                return;
            }

            if (customTreeVisitor != null) {
                customTreeVisitor.visitTree(normalized);
            }

            for (Tree node : normalized) {
                if (node.isPreTerminal()) {
                    processPreterminal(node);
                }
                if (removeDashTags && !node.isLeaf()) {
                    node.setValue(this.tlp.basicCategory(node.value()));
                }
            }

            if (addRoot && normalized.value() != null && !normalized.value().equals("ROOT")) {
                normalized = this.tf.newTreeNode("ROOT", Collections.singletonList(normalized));
            }

            this.outfile.println(normalized.toString());
            if (this.flatFile != null) {
                this.flatFile.println(removeEscapeTokens
                        ? ATBTreeUtils.unEscape(ATBTreeUtils.flattenTree(normalized))
                        : ATBTreeUtils.flattenTree(normalized));
            }
        }
    }

    /**
     * Creates the dataset with a {@link DiskTreebank} that reads UTF-8 input
     * through the raw Arabic tree reader factory.
     */
    public ATBArabicDataset() {
        // NOTE(review): meaning of the boolean factory argument is not visible
        // from this file -- confirm against ArabicRawTreeReaderFactory.
        this.treebank = new DiskTreebank(new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(true), "UTF-8");
    }

    /**
     * Loads every configured path into the treebank, then runs
     * {@link ArabicRawTreeNormalizer} over all trees, writing UTF-8 output to
     * {@code outFileName} and, when {@code makeFlatFile} is set, to
     * {@code flatFileName}.
     *
     * <p>Failure to open an output file is reported on {@code System.err} and
     * the method returns without registering any output files. Both writers
     * are closed on every exit path.
     */
    @Override
    public void build() {
        for (File path : this.pathsToData) {
            int sizeBefore = this.treebank.size();
            if (this.splitFilter == null) {
                this.treebank.loadPath(path, this.treeFileExtension, false);
            } else {
                this.treebank.loadPath(path, this.splitFilter);
            }
            int loaded = this.treebank.size() - sizeBefore;
            this.toStringBuffer.append(String.format(" Loaded %d trees from %s\n", Integer.valueOf(loaded), path.getPath()));
        }

        PrintWriter outfile = null;
        PrintWriter flatFile = null;
        // Tracks which file is being opened so the error message names the right one.
        boolean openingFlatFile = false;
        try {
            outfile = new PrintWriter(new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(this.outFileName), "UTF-8")));
            if (this.makeFlatFile) {
                openingFlatFile = true;
                flatFile = new PrintWriter(new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(this.flatFileName), "UTF-8")));
            }

            this.treebank.apply(new ArabicRawTreeNormalizer(outfile, flatFile));

            this.outputFileList.add(this.outFileName);
            if (this.makeFlatFile) {
                this.outputFileList.add(this.flatFileName);
                this.toStringBuffer.append(" Made flat files\n");
            }
        } catch (FileNotFoundException e) {
            System.err.printf("%s: Could not open %s for writing\n", getClass().getName(),
                    openingFlatFile ? this.flatFileName : this.outFileName);
        } catch (UnsupportedEncodingException e) {
            System.err.printf("%s: Filesystem does not support UTF-8 output\n", getClass().getName());
            e.printStackTrace();
        } finally {
            // Single cleanup point: runs on success, on the handled exceptions
            // above, and when anything else propagates.
            if (outfile != null) {
                outfile.close();
            }
            if (flatFile != null) {
                flatFile.close();
            }
        }
    }

    /**
     * Applies the superclass options, then installs defaults: a
     * {@code DefaultLexicalMapper} if no lexical mapper is set, and -- only
     * when mapping paths were supplied -- an {@code LDCPosMapper} that is set
     * up once per mapping path.
     *
     * @param stringMap the options to apply
     * @return whatever the superclass {@code setOptions} returned
     */
    @Override
    public boolean setOptions(StringMap stringMap) {
        boolean accepted = super.setOptions(stringMap);

        if (this.lexMapper == null) {
            this.lexMapper = new DefaultLexicalMapper();
            this.lexMapper.setup(null, this.lexMapOptions.split(","));
        }

        if (this.pathsToMappings.size() != 0) {
            if (this.posMapper == null) {
                this.posMapper = new LDCPosMapper(this.addDeterminer);
            }
            String[] posOptions = this.posMapOptions.split(",");
            for (File mappingPath : this.pathsToMappings) {
                this.posMapper.setup(mappingPath, posOptions);
            }
        }
        return accepted;
    }
}
