diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index 1b793049e..7c5e1a78d 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -44040,7 +44040,7 @@ settle's settlement/~N0gr settlements/~N9 settler/~NSg -setup/~NgSV +setup/~NgS seven/~NgSH seventeen/~SgH seventeenth/~JN0g diff --git a/harper-core/src/linting/phrasal_verb_as_compound_noun.rs b/harper-core/src/linting/phrasal_verb_as_compound_noun.rs index a739e5bb3..452ca4ce1 100644 --- a/harper-core/src/linting/phrasal_verb_as_compound_noun.rs +++ b/harper-core/src/linting/phrasal_verb_as_compound_noun.rs @@ -9,6 +9,7 @@ pub struct PhrasalVerbAsCompoundNoun { dict: Arc, } +#[derive(Debug)] enum Confidence { DefinitelyVerb, PossiblyVerb, @@ -30,232 +31,273 @@ impl Default for PhrasalVerbAsCompoundNoun { } } -impl Linter for PhrasalVerbAsCompoundNoun { - fn lint(&mut self, document: &Document) -> Vec { - let mut lints = Vec::new(); - for i in document.iter_noun_indices() { - // It would be handy if there could be a dict flag for nouns which are compounds of phrasal verbs. - // Instead, let's use a few heuristics. - let token = document.get_token(i).unwrap(); - // * Can't also be a proper noun or a real verb. - if token.kind.is_proper_noun() || token.kind.is_verb() { - continue; - } - let nountok_charsl = document.get_span_content(&token.span); - // * Can't contain space, hyphen or apostrophe - if nountok_charsl.contains(&' ') - || nountok_charsl.contains(&'-') - || nountok_charsl.contains(&'\'') - || nountok_charsl.contains(&'’') +#[derive(Debug, PartialEq)] +enum Why { + ItsAProperNounOrVerb, + ItContainsSpaceHyphenOrApostrophe, + ItsAKnownFalsePositive, + ItDoesntEndWithOneOfTheParticles, + NeitherTheWholeWordNorThePartBeforeTheParticleIsAVerb, + ItsInIsolation, + IHaveNoConfidenceThatItsAVerb, + TheresAnAdjectiveOrDeterminerBeforeIt, + ItsPrecededByANonPluralNoun, + ItsAnItemInAListOfNouns, + ItsPrecededByAPreposition, + ItsPrecededByAnUnknownWord, + TheresNothingBeforeItAndAPrepositionAfterIt, + ItsFollowedByThatOrWhich, + ItsActuallyPartOfANounPhrase, + TheresNothingWrongWithIt, +} + +impl PhrasalVerbAsCompoundNoun { + fn logic_and_heuristics( + &self, + document: &Document, + i: usize, + token: &crate::Token, + ) -> Result<(String, Confidence), Why> { + // It would be handy if there could be a dict flag for nouns which are compounds of phrasal verbs. + // Instead, let's use a few heuristics. + + // let no_phrasal_verb = String::new(); + // * Can't also be a proper noun or a real verb. + if token.kind.is_proper_noun() || token.kind.is_verb() { + return Err(Why::ItsAProperNounOrVerb); + } + let nountok_charsl = document.get_span_content(&token.span); + // * Can't contain space, hyphen or apostrophe + if nountok_charsl.contains(&' ') + || nountok_charsl.contains(&'-') + || nountok_charsl.contains(&'\'') + || nountok_charsl.contains(&'’') + { + return Err(Why::ItContainsSpaceHyphenOrApostrophe); + } + + let nountok_lower = nountok_charsl.to_lower(); + let nountok_lower = nountok_lower.as_ref(); + + // * Must not be in the set of known false positives. + if nountok_lower == ['g', 'a', 'l', 'l', 'o', 'n'] + || nountok_lower == ['d', 'r', 'a', 'g', 'o', 'n'] + { + return Err(Why::ItsAKnownFalsePositive); + } + + // * Must end with the same letters as one of the particles used in phrasal verbs. + let particle_endings: &[&[char]] = &[ + &['a', 'r', 'o', 'u', 'n', 'd'], + &['b', 'a', 'c', 'k'], + &['d', 'o', 'w', 'n'], + &['i', 'n'], + &['o', 'n'], + &['o', 'f', 'f'], + &['o', 'u', 't'], + &['o', 'v', 'e', 'r'], + &['u', 'p'], + ]; + + let mut found_particle_len = 0; + if !particle_endings.iter().any(|ending| { + let ending_len = ending.len(); + if ending_len <= nountok_charsl.len() + && ending + .iter() + .eq(nountok_charsl[nountok_charsl.len() - ending_len..].iter()) { - continue; + found_particle_len = ending_len; + true + } else { + false } + }) { + return Err(Why::ItDoesntEndWithOneOfTheParticles); + } - let nountok_lower = nountok_charsl.to_lower(); - let nountok_lower = nountok_lower.as_ref(); + let verb_part = &nountok_charsl[..nountok_charsl.len() - found_particle_len]; + let particle_part = &nountok_charsl[nountok_charsl.len() - found_particle_len..]; + let phrasal_verb: String = verb_part + .iter() + .chain(std::iter::once(&' ')) + .chain(particle_part.iter()) + .collect(); + + // Check if both things are verbs. + // So far we only have a small number of phrasal verbs in the dictionary. + let (verb_part_is_verb, phrasal_verb_is_verb) = ( + self.dict + .get_word_metadata(verb_part) + .is_some_and(|md| md.verb.is_some()), + self.dict + .get_word_metadata_str(&phrasal_verb) + .is_some_and(|md| md.verb.is_some()), + ); + + // If neither is a verb, then it's not a phrasal verb + if !verb_part_is_verb && !phrasal_verb_is_verb { + return Err(Why::NeitherTheWholeWordNorThePartBeforeTheParticleIsAVerb); + } - // * Must not be in the set of known false positives. - if nountok_lower == ['g', 'a', 'l', 'l', 'o', 'n'] - || nountok_lower == ['d', 'r', 'a', 'g', 'o', 'n'] - { - continue; + // Now we know it matches the pattern of a phrasal verb erroneously written as a compound noun. + // But we have to check if it's an actual compound noun rather than an error. + // For that we need some heuristics based on the surrounding context. + // Let's try to get the word before and the word after. + // To do that we have to get the tokens immediately before and after, which we expect to be whitespace. + let maybe_prev_tok = document.get_next_word_from_offset(i, -1); + let maybe_next_tok = document.get_next_word_from_offset(i, 1); + + // If it's in isolation, a compound noun is fine. + if maybe_prev_tok.is_none() && maybe_next_tok.is_none() { + return Err(Why::ItsInIsolation); + } + + let confidence = match (phrasal_verb_is_verb, verb_part_is_verb) { + (true, _) => Confidence::DefinitelyVerb, + (false, true) => Confidence::PossiblyVerb, + _ => return Err(Why::IHaveNoConfidenceThatItsAVerb), + }; + + if let Some(prev_tok) = maybe_prev_tok { + if prev_tok.kind.is_adjective() || prev_tok.kind.is_determiner() { + return Err(Why::TheresAnAdjectiveOrDeterminerBeforeIt); } - // * Must end with the same letters as one of the particles used in phrasal verbs. - let particle_endings: &[&[char]] = &[ - &['a', 'r', 'o', 'u', 'n', 'd'], - &['b', 'a', 'c', 'k'], - &['d', 'o', 'w', 'n'], - &['i', 'n'], - &['o', 'n'], - &['o', 'f', 'f'], - &['o', 'u', 't'], - &['o', 'v', 'e', 'r'], - &['u', 'p'], - ]; - - let mut found_particle_len = 0; - if !particle_endings.iter().any(|ending| { - let ending_len = ending.len(); - if ending_len <= nountok_charsl.len() - && ending - .iter() - .eq(nountok_charsl[nountok_charsl.len() - ending_len..].iter()) - { - found_particle_len = ending_len; - true - } else { - false - } - }) { - continue; + // "dictionary lookup" is not a mistake but "couples breakup" is. + // But "settings plugin" is not. + if prev_tok.kind.is_noun() && !prev_tok.kind.is_plural_noun() + || prev_tok + .span + .get_content(document.get_source()) + .eq_ignore_ascii_case_str("settings") + { + return Err(Why::ItsPrecededByANonPluralNoun); } - let verb_part = &nountok_charsl[..nountok_charsl.len() - found_particle_len]; - let particle_part = &nountok_charsl[nountok_charsl.len() - found_particle_len..]; - let phrasal_verb: String = verb_part - .iter() - .chain(std::iter::once(&' ')) - .chain(particle_part.iter()) - .collect(); - - // Check if both things are verbs. - // So far we only have a small number of phrasal verbs in the dictionary. - let (verb_part_is_verb, phrasal_verb_is_verb) = ( - self.dict - .get_word_metadata(verb_part) - .is_some_and(|md| md.verb.is_some()), - self.dict - .get_word_metadata_str(&phrasal_verb) - .is_some_and(|md| md.verb.is_some()), - ); - - // If neither is a verb, then it's not a phrasal verb - if !verb_part_is_verb && !phrasal_verb_is_verb { - continue; + if is_part_of_noun_list(document, i) { + return Err(Why::ItsAnItemInAListOfNouns); } - // Now we know it matches the pattern of a phrasal verb erroneously written as a compound noun. - // But we have to check if it's an actual compound noun rather than an error. - // For that we need some heuristics based on the surrounding context. - // Let's try to get the word before and the word after. - // To do that we have to get the tokens immediately before and after, which we expect to be whitespace. - let maybe_prev_tok = document.get_next_word_from_offset(i, -1); - let maybe_next_tok = document.get_next_word_from_offset(i, 1); - - // If it's in isolation, a compound noun is fine. - if maybe_prev_tok.is_none() && maybe_next_tok.is_none() { - continue; + // If the previous word is (only) a preposition, this word is surely a noun + if prev_tok.kind.is_preposition() + && !prev_tok + .span + .get_content(document.get_source()) + .eq_ignore_ascii_case_str("to") + { + return Err(Why::ItsPrecededByAPreposition); } - let confidence = match (phrasal_verb_is_verb, verb_part_is_verb) { - (true, _) => Confidence::DefinitelyVerb, - (false, true) => Confidence::PossiblyVerb, - _ => continue, - }; - - if let Some(prev_tok) = maybe_prev_tok { - if prev_tok.kind.is_adjective() || prev_tok.kind.is_determiner() { - continue; - } - - // "dictionary lookup" is not a mistake but "couples breakup" is. - // But "settings plugin" is not. - if prev_tok.kind.is_noun() && !prev_tok.kind.is_plural_noun() - || prev_tok - .span - .get_content(document.get_source()) - .eq_ignore_ascii_case_str("settings") - { - continue; - } - - if is_part_of_noun_list(document, i) { - continue; - } - - // If the previous word is (only) a preposition, this word is surely a noun - if prev_tok.kind.is_preposition() - && !prev_tok - .span - .get_content(document.get_source()) - .eq_ignore_ascii_case_str("to") - { - continue; - } - - // If the previous word is OOV, those are most commonly nouns - if prev_tok.kind.is_oov() { - continue; - } + // If the previous word is OOV, those are most commonly nouns + if prev_tok.kind.is_oov() { + return Err(Why::ItsPrecededByAnUnknownWord); } + } - // A preposition may follow either a verb or a noun. - // A previous word can help us decide. Without one we can't decide so we won't flag it. - // ❌ I will never breakup with Gym. - // ✅ Plugin for text editors. - // ✅ Plug in for faster performance. - if maybe_prev_tok.is_none() - && let Some(next_tok) = maybe_next_tok - && next_tok.kind.is_preposition() + // A preposition may follow either a verb or a noun. + // A previous word can help us decide. Without one we can't decide so we won't flag it. + // ❌ I will never breakup with Gym. + // ✅ Plugin for text editors. + // ✅ Plug in for faster performance. + if maybe_prev_tok.is_none() && maybe_next_tok.is_some_and(|t| t.kind.is_preposition()) { + return Err(Why::TheresNothingBeforeItAndAPrepositionAfterIt); + } + + if let Some(next_tok) = maybe_next_tok { + // "That" or "which" can follow a noun as relative pronouns. + if next_tok.kind.is_pronoun() + && next_tok + .span + .get_content(document.get_source()) + .eq_any_ignore_ascii_case_chars(&[ + &['t', 'h', 'a', 't'][..], + &['w', 'h', 'i', 'c', 'h'][..], + ]) { - continue; + return Err(Why::ItsFollowedByThatOrWhich); } + } - if let Some(next_tok) = maybe_next_tok { - // "That" or "which" can follow a noun as relative pronouns. - if next_tok.kind.is_pronoun() - && next_tok - .span - .get_content(document.get_source()) - .eq_any_ignore_ascii_case_chars(&[ - &['t', 'h', 'a', 't'][..], - &['w', 'h', 'i', 'c', 'h'][..], - ]) - { - continue; - } + // If the compound noun is followed by another noun, check for larger compound nouns. + if let Some(next_tok) = maybe_next_tok.filter(|tok| tok.kind.is_noun()) + && match nountok_lower { + ['b', 'a', 'c', 'k', 'u', 'p'] => &[ + "file", + "images", + "link", + "links", + "location", + "plan", + "sites", + "snapshots", + ][..], + ['c', 'a', 'l', 'l', 'b', 'a', 'c', 'k'] => &["function", "handlers"][..], + ['l', 'a', 'y', 'o', 'u', 't'] => &["estimation"][..], + ['m', 'a', 'r', 'k', 'u', 'p'] => &["language", "languages"][..], + ['m', 'o', 'u', 's', 'e', 'o', 'v', 'e', 'r'] => &["hints"][..], + ['p', 'l', 'a', 'y', 'b', 'a', 'c', 'k'] => &["latency", "speed"][..], + ['p', 'l', 'u', 'g', 'i', 'n'] => &[ + "architecture", + "classes", + "development", + "developer", + "docs", + "ecosystem", + "files", + "interface", + "name", + "packages", + "suite", + "support", + ][..], + ['p', 'o', 'p', 'u', 'p'] => &["window"][..], + ['r', 'o', 'l', 'l', 'o', 'u', 't'] => &["logic", "status"][..], + ['s', 't', 'a', 'r', 't', 'u', 'p'] => &["environments"][..], + ['t', 'h', 'r', 'o', 'w', 'b', 'a', 'c', 'k'] => &["machine"][..], + ['w', 'o', 'r', 'k', 'o', 'u', 't'] => &["constraints", "preference"][..], + _ => &[], } + .contains( + &next_tok + .span + .get_content_string(document.get_source()) + .to_lowercase() + .as_ref(), + ) + { + return Err(Why::ItsActuallyPartOfANounPhrase); + } + + Ok((phrasal_verb, confidence)) + } +} - // If the compound noun is followed by another noun, check for larger compound nouns. - if let Some(next_tok) = maybe_next_tok.filter(|tok| tok.kind.is_noun()) - && match nountok_lower { - ['b', 'a', 'c', 'k', 'u', 'p'] => { - &["file", "images", "location", "plan", "sites", "snapshots"][..] +impl Linter for PhrasalVerbAsCompoundNoun { + fn lint(&mut self, document: &Document) -> Vec { + let mut lints = Vec::new(); + + for i in document.iter_noun_indices() { + let token = document.get_token(i).unwrap(); + + if let Ok((phrasal_verb, confidence)) = self.logic_and_heuristics(document, i, token) { + let message = match confidence { + Confidence::DefinitelyVerb => { + "This word should be a phrasal verb, not a compound noun." } - ['c', 'a', 'l', 'l', 'b', 'a', 'c', 'k'] => &["function", "handlers"][..], - ['l', 'a', 'y', 'o', 'u', 't'] => &["estimation"][..], - ['m', 'a', 'r', 'k', 'u', 'p'] => &["language", "languages"][..], - ['m', 'o', 'u', 's', 'e', 'o', 'v', 'e', 'r'] => &["hints"][..], - ['p', 'l', 'a', 'y', 'b', 'a', 'c', 'k'] => &["latency", "speed"][..], - ['p', 'l', 'u', 'g', 'i', 'n'] => &[ - "architecture", - "classes", - "development", - "developer", - "docs", - "ecosystem", - "files", - "interface", - "name", - "packages", - "suite", - "support", - ][..], - ['p', 'o', 'p', 'u', 'p'] => &["window"][..], - ['r', 'o', 'l', 'l', 'o', 'u', 't'] => &["logic", "status"][..], - ['s', 't', 'a', 'r', 't', 'u', 'p'] => &["environments"][..], - ['t', 'h', 'r', 'o', 'w', 'b', 'a', 'c', 'k'] => &["machine"][..], - ['w', 'o', 'r', 'k', 'o', 'u', 't'] => &["constraints", "preference"][..], - _ => &[], - } - .contains( - &next_tok - .span - .get_content_string(document.get_source()) - .to_lowercase() - .as_ref(), - ) - { - continue; + Confidence::PossiblyVerb => { + "This word might be a phrasal verb rather than a compound noun." + } + }; + + lints.push(Lint { + span: Span::new(token.span.start, token.span.end), + lint_kind: LintKind::WordChoice, + suggestions: vec![Suggestion::ReplaceWith(phrasal_verb.chars().collect())], + message: message.to_string(), + priority: 63, + }); } - - let message = match confidence { - Confidence::DefinitelyVerb => { - "This word should be a phrasal verb, not a compound noun." - } - Confidence::PossiblyVerb => { - "This word might be a phrasal verb rather than a compound noun." - } - }; - - lints.push(Lint { - span: Span::new(token.span.start, token.span.end), - lint_kind: LintKind::WordChoice, - suggestions: vec![Suggestion::ReplaceWith(phrasal_verb.chars().collect())], - message: message.to_string(), - priority: 63, - }); } lints @@ -766,4 +808,21 @@ mod tests { PhrasalVerbAsCompoundNoun::default(), ); } + + #[test] + fn issue_2505_dont_flag_backup_links() { + assert_no_lints( + "seamless switching to one of the backup links ideally happens without packet drops", + PhrasalVerbAsCompoundNoun::default(), + ); + } + + #[test] + fn issue_2505_correct_setup_but_dont_flag_backup_link() { + assert_suggestion_result( + "How to properly setup a backup link (and have it act like a backup again after stop/start of master link)", + PhrasalVerbAsCompoundNoun::default(), + "How to properly set up a backup link (and have it act like a backup again after stop/start of master link)", + ); + } }